""" pygments.regexopt ~~~~~~~~~~~~~~~~~ An algorithm that generates optimized regexes for matching long lists of literal strings. :copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS. :license: BSD, see LICENSE for details. """ import re from re import escape from os.path import commonprefix from itertools import groupby from operator import itemgetter CS_ESCAPE = re.compile(r'[\[\^\\\-\]]') FIRST_ELEMENT = itemgetter(0) def make_charset(letters): return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' def regex_opt_inner(strings, open_paren): """Return a regex that matches any string in the sorted list of strings.""" close_paren = open_paren and ')' or '' # print strings, repr(open_paren) if not strings: # print '-> nothing left' return '' first = strings[0] if len(strings) == 1: # print '-> only 1 string' return open_paren + escape(first) + close_paren if not first: # print '-> first string empty' return open_paren + regex_opt_inner(strings[1:], '(?:') \ + '?' + close_paren if len(first) == 1: # multiple one-char strings? make a charset oneletter = [] rest = [] for s in strings: if len(s) == 1: oneletter.append(s) else: rest.append(s) if len(oneletter) > 1: # do we have more than one oneletter string? if rest: # print '-> 1-character + rest' return open_paren + regex_opt_inner(rest, '') + '|' \ + make_charset(oneletter) + close_paren # print '-> only 1-character' return open_paren + make_charset(oneletter) + close_paren prefix = commonprefix(strings) if prefix: plen = len(prefix) # we have a prefix for all strings # print '-> prefix:', prefix return open_paren + escape(prefix) \ + regex_opt_inner([s[plen:] for s in strings], '(?:') \ + close_paren # is there a suffix? strings_rev = [s[::-1] for s in strings] suffix = commonprefix(strings_rev) if suffix: slen = len(suffix) # print '-> suffix:', suffix[::-1] return open_paren \ + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ + escape(suffix[::-1]) + close_paren # recurse on common 1-string prefixes # print '-> last resort' return open_paren + \ '|'.join(regex_opt_inner(list(group[1]), '') for group in groupby(strings, lambda s: s[0] == first[0])) \ + close_paren def regex_opt(strings, prefix='', suffix=''): """Return a compiled regex that matches any string in the given list. The strings to match must be literal strings, not regexes. They will be regex-escaped. *prefix* and *suffix* are pre- and appended to the final regex. """ strings = sorted(strings) return prefix + regex_opt_inner(strings, '(') + suffix
Name | Type | Size | Permission | Actions |
---|---|---|---|---|
__pycache__ | Folder | 0755 |
|
|
filters | Folder | 0755 |
|
|
formatters | Folder | 0755 |
|
|
lexers | Folder | 0755 |
|
|
styles | Folder | 0755 |
|
|
__init__.py | File | 2.91 KB | 0644 |
|
__main__.py | File | 353 B | 0644 |
|
cmdline.py | File | 23.1 KB | 0644 |
|
console.py | File | 1.68 KB | 0644 |
|
filter.py | File | 1.87 KB | 0644 |
|
formatter.py | File | 4.29 KB | 0644 |
|
lexer.py | File | 34.52 KB | 0644 |
|
modeline.py | File | 1005 B | 0644 |
|
plugin.py | File | 1.85 KB | 0644 |
|
regexopt.py | File | 3 KB | 0644 |
|
scanner.py | File | 3.02 KB | 0644 |
|
sphinxext.py | File | 7.79 KB | 0644 |
|
style.py | File | 6.27 KB | 0644 |
|
token.py | File | 6.08 KB | 0644 |
|
unistring.py | File | 61.73 KB | 0644 |
|
util.py | File | 9.8 KB | 0644 |
|