regex_executor.tcc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2022 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_executor.tcc
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. namespace std _GLIBCXX_VISIBILITY(default)
  26. {
  27. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  28. namespace __detail
  29. {
  30. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  31. bool __dfs_mode>
  32. bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  33. _M_search()
  34. {
  35. if (_M_search_from_first())
  36. return true;
  37. if (_M_flags & regex_constants::match_continuous)
  38. return false;
  39. _M_flags |= regex_constants::match_prev_avail;
  40. while (_M_begin != _M_end)
  41. {
  42. ++_M_begin;
  43. if (_M_search_from_first())
  44. return true;
  45. }
  46. return false;
  47. }
  48. // The _M_main function operates in different modes, DFS mode or BFS mode,
  49. // indicated by template parameter __dfs_mode, and dispatches to one of the
  50. // _M_main_dispatch overloads.
  51. //
  52. // ------------------------------------------------------------
  53. //
  54. // DFS mode:
  55. //
  56. // It applies a Depth-First-Search (aka backtracking) on given NFA and input
  57. // string.
  58. // At the very beginning the executor stands in the start state, then it
  59. // tries every possible state transition in current state recursively. Some
  60. // state transitions consume input string, say, a single-char-matcher or a
  61. // back-reference matcher; some don't, like assertion or other anchor nodes.
  62. // When the input is exhausted and/or the current state is an accepting
  63. // state, the whole executor returns true.
  64. //
  65. // TODO: This approach is exponentially slow for certain input.
  66. // Try to compile the NFA to a DFA.
  67. //
  68. // Time complexity: \Omega(match_length), O(2^(_M_nfa.size()))
  69. // Space complexity: \theta(match_results.size() + match_length)
  70. //
  71. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  72. bool __dfs_mode>
  73. bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  74. _M_main_dispatch(_Match_mode __match_mode, __dfs)
  75. {
  76. _M_has_sol = false;
  77. *_M_states._M_get_sol_pos() = _BiIter();
  78. _M_cur_results = _M_results;
  79. _M_dfs(__match_mode, _M_states._M_start);
  80. return _M_has_sol;
  81. }
  82. // ------------------------------------------------------------
  83. //
  84. // BFS mode:
  85. //
  86. // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html)
  87. // explained this algorithm clearly.
  88. //
  89. // It first computes epsilon closure (states that can be achieved without
  90. // consuming characters) for every state that's still matching,
  91. // using the same DFS algorithm, but doesn't re-enter states (using
  92. // _M_states._M_visited to check), nor follow _S_opcode_match.
  93. //
  94. // Then apply DFS using every _S_opcode_match (in _M_states._M_match_queue)
  95. // as the start state.
  96. //
  97. // It significantly reduces potential duplicate states, so has a better
  98. // upper bound; but it requires more overhead.
  99. //
  100. // Time complexity: \Omega(match_length * match_results.size())
  101. // O(match_length * _M_nfa.size() * match_results.size())
  102. // Space complexity: \Omega(_M_nfa.size() + match_results.size())
  103. // O(_M_nfa.size() * match_results.size())
  104. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  105. bool __dfs_mode>
  106. bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  107. _M_main_dispatch(_Match_mode __match_mode, __bfs)
  108. {
  109. _M_states._M_queue(_M_states._M_start, _M_results);
  110. bool __ret = false;
  111. while (1)
  112. {
  113. _M_has_sol = false;
  114. if (_M_states._M_match_queue.empty())
  115. break;
  116. std::fill_n(_M_states._M_visited_states, _M_nfa.size(), false);
  117. auto __old_queue = std::move(_M_states._M_match_queue);
  118. for (auto& __task : __old_queue)
  119. {
  120. _M_cur_results = std::move(__task.second);
  121. _M_dfs(__match_mode, __task.first);
  122. }
  123. if (__match_mode == _Match_mode::_Prefix)
  124. __ret |= _M_has_sol;
  125. if (_M_current == _M_end)
  126. break;
  127. ++_M_current;
  128. }
  129. if (__match_mode == _Match_mode::_Exact)
  130. __ret = _M_has_sol;
  131. _M_states._M_match_queue.clear();
  132. return __ret;
  133. }
  134. // Return whether now match the given sub-NFA.
  135. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  136. bool __dfs_mode>
  137. bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  138. _M_lookahead(_StateIdT __next)
  139. {
  140. // Backreferences may refer to captured content.
  141. // We may want to make this faster by not copying,
  142. // but let's not be clever prematurely.
  143. _ResultsVec __what(_M_cur_results);
  144. _Executor __sub(_M_current, _M_end, __what, _M_re, _M_flags);
  145. __sub._M_states._M_start = __next;
  146. if (__sub._M_search_from_first())
  147. {
  148. for (size_t __i = 0; __i < __what.size(); __i++)
  149. if (__what[__i].matched)
  150. _M_cur_results[__i] = __what[__i];
  151. return true;
  152. }
  153. return false;
  154. }
  155. // __rep_count records how many times (__rep_count.second)
  156. // this node is visited under certain input iterator
  157. // (__rep_count.first). This prevent the executor from entering
  158. // infinite loop by refusing to continue when it's already been
  159. // visited more than twice. It's `twice` instead of `once` because
  160. // we need to spare one more time for potential group capture.
  161. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  162. bool __dfs_mode>
  163. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  164. _M_rep_once_more(_Match_mode __match_mode, _StateIdT __i)
  165. {
  166. const auto& __state = _M_nfa[__i];
  167. auto& __rep_count = _M_rep_count[__i];
  168. if (__rep_count.second == 0 || __rep_count.first != _M_current)
  169. {
  170. auto __back = __rep_count;
  171. __rep_count.first = _M_current;
  172. __rep_count.second = 1;
  173. _M_dfs(__match_mode, __state._M_alt);
  174. __rep_count = __back;
  175. }
  176. else
  177. {
  178. if (__rep_count.second < 2)
  179. {
  180. __rep_count.second++;
  181. _M_dfs(__match_mode, __state._M_alt);
  182. __rep_count.second--;
  183. }
  184. }
  185. }
  186. // _M_alt branch is "match once more", while _M_next is "get me out
  187. // of this quantifier". Executing _M_next first or _M_alt first don't
  188. // mean the same thing, and we need to choose the correct order under
  189. // given greedy mode.
  190. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  191. bool __dfs_mode>
  192. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  193. _M_handle_repeat(_Match_mode __match_mode, _StateIdT __i)
  194. {
  195. const auto& __state = _M_nfa[__i];
  196. // Greedy.
  197. if (!__state._M_neg)
  198. {
  199. _M_rep_once_more(__match_mode, __i);
  200. // If it's DFS executor and already accepted, we're done.
  201. if (!__dfs_mode || !_M_has_sol)
  202. _M_dfs(__match_mode, __state._M_next);
  203. }
  204. else // Non-greedy mode
  205. {
  206. if (__dfs_mode)
  207. {
  208. // vice-versa.
  209. _M_dfs(__match_mode, __state._M_next);
  210. if (!_M_has_sol)
  211. _M_rep_once_more(__match_mode, __i);
  212. }
  213. else
  214. {
  215. // DON'T attempt anything, because there's already another
  216. // state with higher priority accepted. This state cannot
  217. // be better by attempting its next node.
  218. if (!_M_has_sol)
  219. {
  220. _M_dfs(__match_mode, __state._M_next);
  221. // DON'T attempt anything if it's already accepted. An
  222. // accepted state *must* be better than a solution that
  223. // matches a non-greedy quantifier one more time.
  224. if (!_M_has_sol)
  225. _M_rep_once_more(__match_mode, __i);
  226. }
  227. }
  228. }
  229. }
  230. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  231. bool __dfs_mode>
  232. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  233. _M_handle_subexpr_begin(_Match_mode __match_mode, _StateIdT __i)
  234. {
  235. const auto& __state = _M_nfa[__i];
  236. auto& __res = _M_cur_results[__state._M_subexpr];
  237. auto __back = __res.first;
  238. __res.first = _M_current;
  239. _M_dfs(__match_mode, __state._M_next);
  240. __res.first = __back;
  241. }
  242. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  243. bool __dfs_mode>
  244. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  245. _M_handle_subexpr_end(_Match_mode __match_mode, _StateIdT __i)
  246. {
  247. const auto& __state = _M_nfa[__i];
  248. auto& __res = _M_cur_results[__state._M_subexpr];
  249. auto __back = __res;
  250. __res.second = _M_current;
  251. __res.matched = true;
  252. _M_dfs(__match_mode, __state._M_next);
  253. __res = __back;
  254. }
  255. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  256. bool __dfs_mode>
  257. inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  258. _M_handle_line_begin_assertion(_Match_mode __match_mode, _StateIdT __i)
  259. {
  260. const auto& __state = _M_nfa[__i];
  261. if (_M_at_begin())
  262. _M_dfs(__match_mode, __state._M_next);
  263. }
  264. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  265. bool __dfs_mode>
  266. inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  267. _M_handle_line_end_assertion(_Match_mode __match_mode, _StateIdT __i)
  268. {
  269. const auto& __state = _M_nfa[__i];
  270. if (_M_at_end())
  271. _M_dfs(__match_mode, __state._M_next);
  272. }
  273. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  274. bool __dfs_mode>
  275. inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  276. _M_handle_word_boundary(_Match_mode __match_mode, _StateIdT __i)
  277. {
  278. const auto& __state = _M_nfa[__i];
  279. if (_M_word_boundary() == !__state._M_neg)
  280. _M_dfs(__match_mode, __state._M_next);
  281. }
  282. // Here __state._M_alt offers a single start node for a sub-NFA.
  283. // We recursively invoke our algorithm to match the sub-NFA.
  284. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  285. bool __dfs_mode>
  286. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  287. _M_handle_subexpr_lookahead(_Match_mode __match_mode, _StateIdT __i)
  288. {
  289. const auto& __state = _M_nfa[__i];
  290. if (_M_lookahead(__state._M_alt) == !__state._M_neg)
  291. _M_dfs(__match_mode, __state._M_next);
  292. }
  293. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  294. bool __dfs_mode>
  295. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  296. _M_handle_match(_Match_mode __match_mode, _StateIdT __i)
  297. {
  298. const auto& __state = _M_nfa[__i];
  299. if (_M_current == _M_end)
  300. return;
  301. if (__dfs_mode)
  302. {
  303. if (__state._M_matches(*_M_current))
  304. {
  305. ++_M_current;
  306. _M_dfs(__match_mode, __state._M_next);
  307. --_M_current;
  308. }
  309. }
  310. else
  311. if (__state._M_matches(*_M_current))
  312. _M_states._M_queue(__state._M_next, _M_cur_results);
  313. }
  314. template<typename _BiIter, typename _TraitsT>
  315. struct _Backref_matcher
  316. {
  317. _Backref_matcher(bool __icase, const _TraitsT& __traits)
  318. : _M_traits(__traits) { }
  319. bool
  320. _M_apply(_BiIter __expected_begin,
  321. _BiIter __expected_end, _BiIter __actual_begin,
  322. _BiIter __actual_end)
  323. {
  324. return _M_traits.transform(__expected_begin, __expected_end)
  325. == _M_traits.transform(__actual_begin, __actual_end);
  326. }
  327. const _TraitsT& _M_traits;
  328. };
  329. template<typename _BiIter, typename _CharT>
  330. struct _Backref_matcher<_BiIter, std::regex_traits<_CharT>>
  331. {
  332. using _TraitsT = std::regex_traits<_CharT>;
  333. _Backref_matcher(bool __icase, const _TraitsT& __traits)
  334. : _M_icase(__icase), _M_traits(__traits) { }
  335. bool
  336. _M_apply(_BiIter __expected_begin,
  337. _BiIter __expected_end, _BiIter __actual_begin,
  338. _BiIter __actual_end)
  339. {
  340. if (!_M_icase)
  341. return _GLIBCXX_STD_A::__equal4(__expected_begin, __expected_end,
  342. __actual_begin, __actual_end);
  343. typedef std::ctype<_CharT> __ctype_type;
  344. const auto& __fctyp = use_facet<__ctype_type>(_M_traits.getloc());
  345. return _GLIBCXX_STD_A::__equal4(__expected_begin, __expected_end,
  346. __actual_begin, __actual_end,
  347. [this, &__fctyp](_CharT __lhs, _CharT __rhs)
  348. {
  349. return __fctyp.tolower(__lhs)
  350. == __fctyp.tolower(__rhs);
  351. });
  352. }
  353. bool _M_icase;
  354. const _TraitsT& _M_traits;
  355. };
  356. // First fetch the matched result from _M_cur_results as __submatch;
  357. // then compare it with
  358. // (_M_current, _M_current + (__submatch.second - __submatch.first)).
  359. // If matched, keep going; else just return and try another state.
  360. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  361. bool __dfs_mode>
  362. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  363. _M_handle_backref(_Match_mode __match_mode, _StateIdT __i)
  364. {
  365. __glibcxx_assert(__dfs_mode);
  366. const auto& __state = _M_nfa[__i];
  367. auto& __submatch = _M_cur_results[__state._M_backref_index];
  368. if (!__submatch.matched)
  369. return;
  370. auto __last = _M_current;
  371. for (auto __tmp = __submatch.first;
  372. __last != _M_end && __tmp != __submatch.second;
  373. ++__tmp)
  374. ++__last;
  375. if (_Backref_matcher<_BiIter, _TraitsT>(
  376. _M_re.flags() & regex_constants::icase,
  377. _M_re._M_automaton->_M_traits)._M_apply(
  378. __submatch.first, __submatch.second, _M_current, __last))
  379. {
  380. if (__last != _M_current)
  381. {
  382. auto __backup = _M_current;
  383. _M_current = __last;
  384. _M_dfs(__match_mode, __state._M_next);
  385. _M_current = __backup;
  386. }
  387. else
  388. _M_dfs(__match_mode, __state._M_next);
  389. }
  390. }
  391. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  392. bool __dfs_mode>
  393. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  394. _M_handle_accept(_Match_mode __match_mode, _StateIdT)
  395. {
  396. if _GLIBCXX17_CONSTEXPR (__dfs_mode)
  397. {
  398. __glibcxx_assert(!_M_has_sol);
  399. if (__match_mode == _Match_mode::_Exact)
  400. _M_has_sol = _M_current == _M_end;
  401. else
  402. _M_has_sol = true;
  403. if (_M_current == _M_begin
  404. && (_M_flags & regex_constants::match_not_null))
  405. _M_has_sol = false;
  406. if (_M_has_sol)
  407. {
  408. if (_M_nfa._M_flags & regex_constants::ECMAScript)
  409. _M_results = _M_cur_results;
  410. else // POSIX
  411. {
  412. __glibcxx_assert(_M_states._M_get_sol_pos());
  413. // Here's POSIX's logic: match the longest one. However
  414. // we never know which one (lhs or rhs of "|") is longer
  415. // unless we try both of them and compare the results.
  416. // The member variable _M_sol_pos records the end
  417. // position of the last successful match. It's better
  418. // to be larger, because POSIX regex is always greedy.
  419. // TODO: This could be slow.
  420. if (*_M_states._M_get_sol_pos() == _BiIter()
  421. || std::distance(_M_begin,
  422. *_M_states._M_get_sol_pos())
  423. < std::distance(_M_begin, _M_current))
  424. {
  425. *_M_states._M_get_sol_pos() = _M_current;
  426. _M_results = _M_cur_results;
  427. }
  428. }
  429. }
  430. }
  431. else
  432. {
  433. if (_M_current == _M_begin
  434. && (_M_flags & regex_constants::match_not_null))
  435. return;
  436. if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end)
  437. if (!_M_has_sol)
  438. {
  439. _M_has_sol = true;
  440. _M_results = _M_cur_results;
  441. }
  442. }
  443. }
  444. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  445. bool __dfs_mode>
  446. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  447. _M_handle_alternative(_Match_mode __match_mode, _StateIdT __i)
  448. {
  449. const auto& __state = _M_nfa[__i];
  450. if (_M_nfa._M_flags & regex_constants::ECMAScript)
  451. {
  452. // TODO: Fix BFS support. It is wrong.
  453. _M_dfs(__match_mode, __state._M_alt);
  454. // Pick lhs if it matches. Only try rhs if it doesn't.
  455. if (!_M_has_sol)
  456. _M_dfs(__match_mode, __state._M_next);
  457. }
  458. else
  459. {
  460. // Try both and compare the result.
  461. // See "case _S_opcode_accept:" handling above.
  462. _M_dfs(__match_mode, __state._M_alt);
  463. auto __has_sol = _M_has_sol;
  464. _M_has_sol = false;
  465. _M_dfs(__match_mode, __state._M_next);
  466. _M_has_sol |= __has_sol;
  467. }
  468. }
  469. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  470. bool __dfs_mode>
  471. void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  472. _M_dfs(_Match_mode __match_mode, _StateIdT __i)
  473. {
  474. if (_M_states._M_visited(__i))
  475. return;
  476. switch (_M_nfa[__i]._M_opcode())
  477. {
  478. case _S_opcode_repeat:
  479. _M_handle_repeat(__match_mode, __i); break;
  480. case _S_opcode_subexpr_begin:
  481. _M_handle_subexpr_begin(__match_mode, __i); break;
  482. case _S_opcode_subexpr_end:
  483. _M_handle_subexpr_end(__match_mode, __i); break;
  484. case _S_opcode_line_begin_assertion:
  485. _M_handle_line_begin_assertion(__match_mode, __i); break;
  486. case _S_opcode_line_end_assertion:
  487. _M_handle_line_end_assertion(__match_mode, __i); break;
  488. case _S_opcode_word_boundary:
  489. _M_handle_word_boundary(__match_mode, __i); break;
  490. case _S_opcode_subexpr_lookahead:
  491. _M_handle_subexpr_lookahead(__match_mode, __i); break;
  492. case _S_opcode_match:
  493. _M_handle_match(__match_mode, __i); break;
  494. case _S_opcode_backref:
  495. _M_handle_backref(__match_mode, __i); break;
  496. case _S_opcode_accept:
  497. _M_handle_accept(__match_mode, __i); break;
  498. case _S_opcode_alternative:
  499. _M_handle_alternative(__match_mode, __i); break;
  500. default:
  501. __glibcxx_assert(false);
  502. }
  503. }
  504. // Return whether now is at some word boundary.
  505. template<typename _BiIter, typename _Alloc, typename _TraitsT,
  506. bool __dfs_mode>
  507. bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
  508. _M_word_boundary() const
  509. {
  510. if (_M_current == _M_begin && (_M_flags & regex_constants::match_not_bow))
  511. return false;
  512. if (_M_current == _M_end && (_M_flags & regex_constants::match_not_eow))
  513. return false;
  514. bool __left_is_word = false;
  515. if (_M_current != _M_begin
  516. || (_M_flags & regex_constants::match_prev_avail))
  517. {
  518. auto __prev = _M_current;
  519. if (_M_is_word(*std::prev(__prev)))
  520. __left_is_word = true;
  521. }
  522. bool __right_is_word =
  523. _M_current != _M_end && _M_is_word(*_M_current);
  524. return __left_is_word != __right_is_word;
  525. }
  526. } // namespace __detail
  527. _GLIBCXX_END_NAMESPACE_VERSION
  528. } // namespace