  1. """
  2. pygments.lexer
  3. ~~~~~~~~~~~~~~
  4. Base lexer classes.
  5. :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. import sys
  10. import time
  11. from pip._vendor.pygments.filter import apply_filters, Filter
  12. from pip._vendor.pygments.filters import get_filter_by_name
  13. from pip._vendor.pygments.token import Error, Text, Other, Whitespace, _TokenType
  14. from pip._vendor.pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
  15. make_analysator, Future, guess_decode
  16. from pip._vendor.pygments.regexopt import regex_opt
  17. __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
  18. 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
  19. 'default', 'words', 'line_re']
  20. line_re = re.compile('.*?\n')
  21. _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
  22. (b'\xff\xfe\0\0', 'utf-32'),
  23. (b'\0\0\xfe\xff', 'utf-32be'),
  24. (b'\xff\xfe', 'utf-16'),
  25. (b'\xfe\xff', 'utf-16be')]
  26. _default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(mcs, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(mcs, name, bases, d)


class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    See also :doc:`lexerdevelopment`, a high-level guide to writing
    lexers.

    Lexer classes have attributes used for choosing the most appropriate
    lexer based on various criteria.

    .. autoattribute:: name
       :no-value:
    .. autoattribute:: aliases
       :no-value:
    .. autoattribute:: filenames
       :no-value:
    .. autoattribute:: alias_filenames
    .. autoattribute:: mimetypes
       :no-value:
    .. autoattribute:: priority

    Lexers included in Pygments should have an additional attribute:

    .. autoattribute:: url
       :no-value:

    You can pass options to the constructor. The basic options recognized
    by all lexers and processed by the base `Lexer` class are:

    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3

    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection). Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Full name of the lexer, in human-readable form
    name = None

    #: A list of short, unique identifiers that can be used to look
    #: up the lexer from a list, e.g., using `get_lexer_by_name()`.
    aliases = []

    #: A list of `fnmatch` patterns that match filenames which contain
    #: content for this lexer. The patterns in this list should be unique among
    #: all lexers.
    filenames = []

    #: A list of `fnmatch` patterns that match filenames which may or may not
    #: contain content for this lexer. This list is used by the
    #: :func:`.guess_lexer_for_filename()` function, to determine which lexers
    #: are then included in guessing the correct one. That means that
    #: e.g. every lexer for HTML and a template language should include
    #: ``\*.html`` in this list.
    alias_filenames = []

    #: A list of MIME types for content that can be lexed with this lexer.
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    #: URL of the language specification/definition. Used in the Pygments
    #: documentation.
    url = None

    def __init__(self, **options):
        """
        This constructor takes arbitrary options as keyword arguments.
        Every subclass must first process its own options and then call
        the `Lexer` constructor, since it processes the basic
        options like `stripnl`.

        An example looks like this:

        .. sourcecode:: python

           def __init__(self, **options):
               self.compress = options.get('compress', '')
               Lexer.__init__(self, **options)

        As these options must all be specifiable as strings (due to the
        command line usage), there are various utility functions
        available to help with that, see `Utilities`_.
        """
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        A static method which is called for lexer guessing.

        It should analyse the text and return a float in the range
        from ``0.0`` to ``1.0``. If it returns ``0.0``, the lexer
        will not be selected as the most probable one; if it returns
        ``1.0``, it will be selected immediately. This is used by
        `guess_lexer`.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`,
        it's the same as if the return value was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        This method is the basic interface of a lexer. It is called by
        the `highlight()` function. It must process the text and return an
        iterable of ``(tokentype, value)`` pairs from `text`.

        Normally, you don't need to override this method. The default
        implementation processes the options recognized by all lexers
        (`stripnl`, `stripall` and so on), and then yields all tokens
        from `get_tokens_unprocessed()`, with the ``index`` dropped.

        If `unfiltered` is set to `True`, the filtering mechanism is
        bypassed even if filters are defined.
        """
        if not isinstance(text, str):
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    from pip._vendor import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        This method should process the text and return an iterable of
        ``(index, tokentype, value)`` tuples where ``index`` is the starting
        position of the token within the input text.

        It must be overridden by subclasses. It is recommended to
        implement it as a generator to maximize effectiveness.
        """
        raise NotImplementedError
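

# Illustrative sketch (not part of Pygments): the smallest useful `Lexer`
# subclass.  It implements ``get_tokens_unprocessed()`` directly, reads a
# hypothetical ``lowercase`` option before delegating to ``Lexer.__init__``,
# and emits everything as ``Text`` or ``Whitespace`` tokens.  Callers would
# normally use ``get_tokens()``, which applies the option and filter handling
# shown above, e.g. ``list(_ExampleWordLexer().get_tokens('two words'))``.
class _ExampleWordLexer(Lexer):
    """Hypothetical lexer used only to demonstrate the `Lexer` API."""
    name = 'ExampleWords'
    aliases = ['example-words']

    def __init__(self, **options):
        # process our own option first, then let Lexer handle the basics
        self.lowercase = get_bool_opt(options, 'lowercase', False)
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        for match in re.finditer(r'\S+|\s+', text):
            value = match.group()
            if self.lowercase:
                value = value.lower()
            token = Whitespace if value.isspace() else Text
            yield match.start(), token, value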


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments: a root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
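

# Illustrative sketch (not part of Pygments): ``DelegatingLexer`` is normally
# used for template languages.  The hypothetical "template" lexer below emits
# ``Text`` for ``%`` directive lines and ``Other`` for everything else; the
# delegating subclass then runs the remaining ``Other`` content through
# ``_ExampleWordLexer`` (the illustrative root lexer defined above).
class _ExampleDirectiveLexer(Lexer):
    """Hypothetical template lexer that only understands % directive lines."""
    name = 'ExampleDirectives'

    def get_tokens_unprocessed(self, text):
        for match in line_re.finditer(text):
            line = match.group()
            token = Text if line.startswith('%') else Other
            yield match.start(), token, line


class _ExampleDelegatingLexer(DelegatingLexer):
    """Hypothetical delegating lexer, for illustration only."""
    name = 'ExampleDelegating'

    def __init__(self, **options):
        DelegatingLexer.__init__(self, _ExampleWordLexer,
                                 _ExampleDirectiveLexer, **options)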


# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback
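

# Illustrative sketch (not part of Pygments): a rule using ``bygroups`` to give
# each regex group its own token type.  In a ``RegexLexer`` ``tokens`` table a
# line like ``key value`` is split into three tokens here; only the generic
# token types imported at the top of this module are used, a real lexer would
# pick more specific ones.
_BYGROUPS_EXAMPLE_RULE = (r'(\w+)(\s+)(\S+)',
                          bygroups(Text, Whitespace, Text))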


class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
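

# Illustrative sketch (not part of Pygments): two rules using ``using``.  The
# first re-lexes backtick-delimited text with the current lexer itself
# (``using(this)``); the second hands the matched text to another lexer class,
# here the hypothetical ``_ExampleWordLexer`` defined above.
_USING_EXAMPLE_RULES = [
    (r'`[^`]*`', using(this)),
    (r'#.*', using(_ExampleWordLexer)),
]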


class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example default('#pop') is equivalent to ('', Token, '#pop')
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
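

# Illustrative sketch (not part of Pygments): ``words`` builds an optimized
# alternation lazily, so a keyword rule can be written as a plain word list.
# The prefix/suffix add word boundaries around the generated regex; the token
# type here is the generic ``Text`` imported above.
_WORDS_EXAMPLE_RULE = (words(('if', 'elif', 'else', 'while'),
                             prefix=r'\b', suffix=r'\b'), Text)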


class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break
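

# Illustrative sketch (not part of Pygments): a small ``RegexLexer`` subclass
# showing the pieces documented above -- a ``tokens`` dict with several states,
# ``include``, ``bygroups``, ``default`` and a ``#pop`` transition.  Only the
# generic token types imported in this module are used; a real lexer would use
# more specific ones from ``pygments.token``.  Usage would look like
# ``list(_ExampleIniLexer().get_tokens('[core]\nname = demo\n'))``.
class _ExampleIniLexer(RegexLexer):
    """Hypothetical lexer for a tiny INI-like format, for illustration only."""
    name = 'ExampleIni'
    aliases = ['example-ini']
    filenames = ['*.example-ini']

    tokens = {
        'whitespace': [
            (r'\s+', Whitespace),
        ],
        'root': [
            include('whitespace'),
            (r'[;#].*', Other),                       # comments
            (r'\[[^\]\n]+\]', Text),                  # [section] headers
            (r'([^=\s]+)(\s*)(=)', bygroups(Text, Whitespace, Text), 'value'),
        ],
        'value': [
            (r'[^\n]+', Text, '#pop'),                # rest of the line
            default('#pop'),                          # empty value: just pop
        ],
    }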


class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                    # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
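

# Illustrative sketch (not part of Pygments): with ``ExtendedRegexLexer`` a
# rule callback receives the ``LexerContext`` and must advance ``ctx.pos``
# itself.  A hypothetical rule such as (r'<<(\w+)\n', _example_heredoc_callback)
# could use the callback below to consume a heredoc-style block by scanning
# ahead in ``ctx.text`` instead of matching it with a single regex.
def _example_heredoc_callback(lexer, match, ctx):
    """Hypothetical ExtendedRegexLexer callback, for illustration only."""
    terminator = match.group(1)
    end = ctx.text.find(terminator, match.end())
    if end < 0:
        end = ctx.end
    yield match.start(), Other, ctx.text[match.start():end]
    ctx.pos = end  # callbacks are responsible for advancing the position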


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary
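

# Illustrative sketch (not part of Pygments): merging two token streams with
# ``do_insertions``.  The hypothetical helper below re-lexes the body of a
# shell-session-style text with ``_ExampleWordLexer`` while inserting prompt
# tokens at the positions where each prompted line started.
def _example_merge_prompts(text):
    """Hypothetical helper, for illustration only."""
    insertions = []
    body = ''
    for match in line_re.finditer(text):
        line = match.group()
        if line.startswith('$ '):
            # the prompt token is inserted into the stream of the re-lexed body
            insertions.append((len(body), [(0, Other, line[:2])]))
            body += line[2:]
        else:
            body += line
    return do_insertions(insertions,
                         _ExampleWordLexer().get_tokens_unprocessed(body))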


class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
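

# Illustrative sketch (not part of Pygments): profiling an existing lexer's
# regexes by mixing ``ProfilingRegexLexer`` into a subclass.  Consuming
# ``get_tokens()`` prints the per-regex timing table above once lexing
# finishes.  ``_ExampleIniLexer`` is the hypothetical lexer defined earlier.
def _example_profile_ini(text):
    """Hypothetical profiling helper, for illustration only."""
    class _ProfiledIniLexer(ProfilingRegexLexer, _ExampleIniLexer):
        pass
    return list(_ProfiledIniLexer().get_tokens(text))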