12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- """
- Iterator based sre token scanner
- """
- import re
- from re import VERBOSE, MULTILINE, DOTALL
- import sre_parse
- import sre_compile
- import sre_constants
- from sre_constants import BRANCH, SUBPATTERN
- __all__ = ['Scanner', 'pattern']  # public names of this module: the Scanner class and the pattern() decorator factory
- FLAGS = (VERBOSE | MULTILINE | DOTALL)  # default regex flags used by Scanner.__init__ and pattern()
class Scanner(object):
    """
    Token scanner that merges a lexicon into one compiled alternation.

    Every lexicon entry is a callable that carries a ``pattern`` attribute
    (its regex source).  Entry number ``i`` becomes group ``i + 1`` of a
    single branch pattern, so ``match.lastindex`` selects the entry to call.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """
        Build the compound pattern for *lexicon*.

        lexicon -- iterable of callables exposing a ``pattern`` attribute
        flags   -- regex flags applied to every phrase and to the compound

        Any error raised while parsing a phrase (e.g. ``sre_constants.error``)
        propagates to the caller unchanged.
        """
        # Slot 0 is a placeholder so that actions[group_id] lines up with
        # the 1-based group ids assigned below.
        self.actions = [None]
        # Combine phrases into a compound pattern
        # NOTE(review): sre_parse.Pattern and the manual ``groups``
        # assignment are interpreter internals -- confirm they exist on the
        # Python versions this file targets.
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            parsed = sre_parse.parse(token.pattern, flags)
            p.append(sre_parse.SubPattern(s, [(SUBPATTERN, (idx + 1, parsed))]))
            self.actions.append(token)
        s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

    def iterscan(self, string, idx=0, context=None):
        """
        Yield ``(rval, end_idx)`` for each match found in *string*.

        string  -- text to scan
        idx     -- position at which scanning starts
        context -- opaque value handed to every action as its second argument

        Each action is called as ``action(match, context)`` and must return
        ``(rval, next_pos)``.  When ``next_pos`` is not None and differs from
        the match end, scanning resumes at ``next_pos`` and that position is
        reported as ``end_idx``.  Iteration stops when nothing matches or
        when a match does not advance past the previous end position.
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # No progress was made; stop instead of looping forever.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
def pattern(pattern, flags=FLAGS):
    """
    Return a decorator that tags a function with a regex.

    The decorated function receives two attributes: ``pattern`` (the regex
    source passed here) and ``regex`` (that source compiled with *flags*).
    The function object itself is handed back, not a wrapper.
    """
    def attach(func):
        # Store the raw source first, then the compiled form; compilation
        # happens each time the decorator is applied.
        func.pattern = pattern
        func.regex = re.compile(pattern, flags)
        return func

    return attach
|