regex_scanner.tcc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2022 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_scanner.tcc
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. // FIXME make comments doxygen format.
  26. // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
  27. // and awk
  28. // 1) grep is basic except '\n' is treated as '|'
  29. // 2) egrep is extended except '\n' is treated as '|'
  30. // 3) awk is extended except special escaping rules, and there's no
  31. // back-reference.
  32. //
  33. // References:
  34. //
  35. // ECMAScript: ECMA-262 15.10
  36. //
  37. // basic, extended:
  38. // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  39. //
  40. // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
  41. namespace std _GLIBCXX_VISIBILITY(default)
  42. {
  43. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  44. namespace __detail
  45. {
  46. template<typename _CharT>
  47. _Scanner<_CharT>::
  48. _Scanner(const _CharT* __begin, const _CharT* __end,
  49. _FlagT __flags, std::locale __loc)
  50. : _ScannerBase(__flags),
  51. _M_current(__begin), _M_end(__end),
  52. _M_ctype(std::use_facet<_CtypeT>(__loc)),
  53. _M_eat_escape(_M_is_ecma()
  54. ? &_Scanner::_M_eat_escape_ecma
  55. : &_Scanner::_M_eat_escape_posix)
  56. { _M_advance(); }
  57. template<typename _CharT>
  58. void
  59. _Scanner<_CharT>::
  60. _M_advance()
  61. {
  62. if (_M_current == _M_end)
  63. {
  64. _M_token = _S_token_eof;
  65. return;
  66. }
  67. if (_M_state == _S_state_normal)
  68. _M_scan_normal();
  69. else if (_M_state == _S_state_in_bracket)
  70. _M_scan_in_bracket();
  71. else if (_M_state == _S_state_in_brace)
  72. _M_scan_in_brace();
  73. else
  74. {
  75. __glibcxx_assert(!"unexpected state while processing regex");
  76. }
  77. }
  78. // Differences between styles:
  79. // 1) "\(", "\)", "\{" in basic. It's not escaping.
  80. // 2) "(?:", "(?=", "(?!" in ECMAScript.
  81. template<typename _CharT>
  82. void
  83. _Scanner<_CharT>::
  84. _M_scan_normal()
  85. {
  86. auto __c = *_M_current++;
  87. if (__builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
  88. {
  89. _M_token = _S_token_ord_char;
  90. _M_value.assign(1, __c);
  91. return;
  92. }
  93. if (__c == '\\')
  94. {
  95. if (_M_current == _M_end)
  96. __throw_regex_error(
  97. regex_constants::error_escape,
  98. "Invalid escape at end of regular expression");
  99. if (!_M_is_basic()
  100. || (*_M_current != '('
  101. && *_M_current != ')'
  102. && *_M_current != '{'))
  103. {
  104. (this->*_M_eat_escape)();
  105. return;
  106. }
  107. __c = *_M_current++;
  108. }
  109. if (__c == '(')
  110. {
  111. if (_M_is_ecma() && *_M_current == '?')
  112. {
  113. if (++_M_current == _M_end)
  114. __throw_regex_error(regex_constants::error_paren);
  115. if (*_M_current == ':')
  116. {
  117. ++_M_current;
  118. _M_token = _S_token_subexpr_no_group_begin;
  119. }
  120. else if (*_M_current == '=')
  121. {
  122. ++_M_current;
  123. _M_token = _S_token_subexpr_lookahead_begin;
  124. _M_value.assign(1, 'p');
  125. }
  126. else if (*_M_current == '!')
  127. {
  128. ++_M_current;
  129. _M_token = _S_token_subexpr_lookahead_begin;
  130. _M_value.assign(1, 'n');
  131. }
  132. else
  133. __throw_regex_error(regex_constants::error_paren,
  134. "Invalid '(?...)' zero-width assertion "
  135. "in regular expression");
  136. }
  137. else if (_M_flags & regex_constants::nosubs)
  138. _M_token = _S_token_subexpr_no_group_begin;
  139. else
  140. _M_token = _S_token_subexpr_begin;
  141. }
  142. else if (__c == ')')
  143. _M_token = _S_token_subexpr_end;
  144. else if (__c == '[')
  145. {
  146. _M_state = _S_state_in_bracket;
  147. _M_at_bracket_start = true;
  148. if (_M_current != _M_end && *_M_current == '^')
  149. {
  150. _M_token = _S_token_bracket_neg_begin;
  151. ++_M_current;
  152. }
  153. else
  154. _M_token = _S_token_bracket_begin;
  155. }
  156. else if (__c == '{')
  157. {
  158. _M_state = _S_state_in_brace;
  159. _M_token = _S_token_interval_begin;
  160. }
  161. else if (__builtin_expect(__c == _CharT(0), false))
  162. {
  163. if (!_M_is_ecma())
  164. __throw_regex_error(regex_constants::_S_null);
  165. _M_token = _S_token_ord_char;
  166. _M_value.assign(1, __c);
  167. }
  168. else if (__c != ']' && __c != '}')
  169. {
  170. auto __it = _M_token_tbl;
  171. auto __narrowc = _M_ctype.narrow(__c, '\0');
  172. for (; __it->first != '\0'; ++__it)
  173. if (__it->first == __narrowc)
  174. {
  175. _M_token = __it->second;
  176. return;
  177. }
  178. __glibcxx_assert(!"unexpected special character in regex");
  179. }
  180. else
  181. {
  182. _M_token = _S_token_ord_char;
  183. _M_value.assign(1, __c);
  184. }
  185. }
  186. // Differences between styles:
  187. // 1) different semantics of "[]" and "[^]".
  188. // 2) Escaping in bracket expr.
  189. template<typename _CharT>
  190. void
  191. _Scanner<_CharT>::
  192. _M_scan_in_bracket()
  193. {
  194. if (_M_current == _M_end)
  195. __throw_regex_error(regex_constants::error_brack);
  196. auto __c = *_M_current++;
  197. if (__c == '-')
  198. _M_token = _S_token_bracket_dash;
  199. else if (__c == '[')
  200. {
  201. if (_M_current == _M_end)
  202. __throw_regex_error(regex_constants::error_brack,
  203. "Incomplete '[[' character class in "
  204. "regular expression");
  205. if (*_M_current == '.')
  206. {
  207. _M_token = _S_token_collsymbol;
  208. _M_eat_class(*_M_current++);
  209. }
  210. else if (*_M_current == ':')
  211. {
  212. _M_token = _S_token_char_class_name;
  213. _M_eat_class(*_M_current++);
  214. }
  215. else if (*_M_current == '=')
  216. {
  217. _M_token = _S_token_equiv_class_name;
  218. _M_eat_class(*_M_current++);
  219. }
  220. else
  221. {
  222. _M_token = _S_token_ord_char;
  223. _M_value.assign(1, __c);
  224. }
  225. }
  226. // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
  227. // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
  228. // `.../empty_range.cc`.
  229. else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
  230. {
  231. _M_token = _S_token_bracket_end;
  232. _M_state = _S_state_normal;
  233. }
  234. // ECMAScript and awk permits escaping in bracket.
  235. else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
  236. (this->*_M_eat_escape)();
  237. else
  238. {
  239. _M_token = _S_token_ord_char;
  240. _M_value.assign(1, __c);
  241. }
  242. _M_at_bracket_start = false;
  243. }
  244. // Differences between styles:
  245. // 1) "\}" in basic style.
  246. template<typename _CharT>
  247. void
  248. _Scanner<_CharT>::
  249. _M_scan_in_brace()
  250. {
  251. if (_M_current == _M_end)
  252. __throw_regex_error(regex_constants::error_brace);
  253. auto __c = *_M_current++;
  254. if (_M_ctype.is(_CtypeT::digit, __c))
  255. {
  256. _M_token = _S_token_dup_count;
  257. _M_value.assign(1, __c);
  258. while (_M_current != _M_end
  259. && _M_ctype.is(_CtypeT::digit, *_M_current))
  260. _M_value += *_M_current++;
  261. }
  262. else if (__c == ',')
  263. _M_token = _S_token_comma;
  264. // basic use \}.
  265. else if (_M_is_basic())
  266. {
  267. if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
  268. {
  269. _M_state = _S_state_normal;
  270. _M_token = _S_token_interval_end;
  271. ++_M_current;
  272. }
  273. else
  274. __throw_regex_error(regex_constants::error_badbrace);
  275. }
  276. else if (__c == '}')
  277. {
  278. _M_state = _S_state_normal;
  279. _M_token = _S_token_interval_end;
  280. }
  281. else
  282. __throw_regex_error(regex_constants::error_badbrace);
  283. }
  284. template<typename _CharT>
  285. void
  286. _Scanner<_CharT>::
  287. _M_eat_escape_ecma()
  288. {
  289. if (_M_current == _M_end)
  290. __throw_regex_error(regex_constants::error_escape);
  291. auto __c = *_M_current++;
  292. auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
  293. if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
  294. {
  295. _M_token = _S_token_ord_char;
  296. _M_value.assign(1, *__pos);
  297. }
  298. else if (__c == 'b')
  299. {
  300. _M_token = _S_token_word_bound;
  301. _M_value.assign(1, 'p');
  302. }
  303. else if (__c == 'B')
  304. {
  305. _M_token = _S_token_word_bound;
  306. _M_value.assign(1, 'n');
  307. }
  308. // N3376 28.13
  309. else if (__c == 'd'
  310. || __c == 'D'
  311. || __c == 's'
  312. || __c == 'S'
  313. || __c == 'w'
  314. || __c == 'W')
  315. {
  316. _M_token = _S_token_quoted_class;
  317. _M_value.assign(1, __c);
  318. }
  319. else if (__c == 'c')
  320. {
  321. if (_M_current == _M_end)
  322. __throw_regex_error(regex_constants::error_escape,
  323. "invalid '\\cX' control character in "
  324. "regular expression");
  325. _M_token = _S_token_ord_char;
  326. _M_value.assign(1, *_M_current++);
  327. }
  328. else if (__c == 'x' || __c == 'u')
  329. {
  330. _M_value.clear();
  331. const int __n = __c == 'x' ? 2 : 4;
  332. for (int __i = 0; __i < __n; __i++)
  333. {
  334. if (_M_current == _M_end
  335. || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
  336. __throw_regex_error(regex_constants::error_escape,
  337. __n == 2
  338. ? "Invalid '\\xNN' control character in "
  339. "regular expression"
  340. : "Invalid '\\uNNNN' control character in "
  341. "regular expression");
  342. _M_value += *_M_current++;
  343. }
  344. _M_token = _S_token_hex_num;
  345. }
  346. // ECMAScript recognizes multi-digit back-references.
  347. else if (_M_ctype.is(_CtypeT::digit, __c))
  348. {
  349. _M_value.assign(1, __c);
  350. while (_M_current != _M_end
  351. && _M_ctype.is(_CtypeT::digit, *_M_current))
  352. _M_value += *_M_current++;
  353. _M_token = _S_token_backref;
  354. }
  355. else
  356. {
  357. _M_token = _S_token_ord_char;
  358. _M_value.assign(1, __c);
  359. }
  360. }
  361. // Differences between styles:
  362. // 1) Extended doesn't support backref, but basic does.
  363. template<typename _CharT>
  364. void
  365. _Scanner<_CharT>::
  366. _M_eat_escape_posix()
  367. {
  368. if (_M_current == _M_end)
  369. __throw_regex_error(regex_constants::error_escape);
  370. auto __c = *_M_current;
  371. auto __pos = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
  372. if (__pos != nullptr && *__pos != '\0')
  373. {
  374. _M_token = _S_token_ord_char;
  375. _M_value.assign(1, __c);
  376. }
  377. // We MUST judge awk before handling backrefs. There's no backref in awk.
  378. else if (_M_is_awk())
  379. {
  380. _M_eat_escape_awk();
  381. return;
  382. }
  383. else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
  384. {
  385. _M_token = _S_token_backref;
  386. _M_value.assign(1, __c);
  387. }
  388. else
  389. {
  390. #ifdef __STRICT_ANSI__
  391. // POSIX says it is undefined to escape ordinary characters
  392. __throw_regex_error(regex_constants::error_escape);
  393. #else
  394. _M_token = _S_token_ord_char;
  395. _M_value.assign(1, __c);
  396. #endif
  397. }
  398. ++_M_current;
  399. }
  400. template<typename _CharT>
  401. void
  402. _Scanner<_CharT>::
  403. _M_eat_escape_awk()
  404. {
  405. auto __c = *_M_current++;
  406. auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
  407. if (__pos != nullptr)
  408. {
  409. _M_token = _S_token_ord_char;
  410. _M_value.assign(1, *__pos);
  411. }
  412. // \ddd for oct representation
  413. else if (_M_ctype.is(_CtypeT::digit, __c)
  414. && __c != '8'
  415. && __c != '9')
  416. {
  417. _M_value.assign(1, __c);
  418. for (int __i = 0;
  419. __i < 2
  420. && _M_current != _M_end
  421. && _M_ctype.is(_CtypeT::digit, *_M_current)
  422. && *_M_current != '8'
  423. && *_M_current != '9';
  424. __i++)
  425. _M_value += *_M_current++;
  426. _M_token = _S_token_oct_num;
  427. return;
  428. }
  429. else
  430. __throw_regex_error(regex_constants::error_escape);
  431. }
  432. // Eats a character class or throws an exception.
  433. // __ch could be ':', '.' or '=', _M_current is the char after ']' when
  434. // returning.
  435. template<typename _CharT>
  436. void
  437. _Scanner<_CharT>::
  438. _M_eat_class(char __ch)
  439. {
  440. for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
  441. _M_value += *_M_current++;
  442. if (_M_current == _M_end
  443. || *_M_current++ != __ch
  444. || _M_current == _M_end // skip __ch
  445. || *_M_current++ != ']') // skip ']'
  446. {
  447. __throw_regex_error(__ch == ':' ? regex_constants::error_ctype
  448. : regex_constants::error_collate);
  449. }
  450. }
  451. #ifdef _GLIBCXX_DEBUG
  452. template<typename _CharT>
  453. std::ostream&
  454. _Scanner<_CharT>::
  455. _M_print(std::ostream& ostr)
  456. {
  457. switch (_M_token)
  458. {
  459. case _S_token_anychar:
  460. ostr << "any-character\n";
  461. break;
  462. case _S_token_backref:
  463. ostr << "backref\n";
  464. break;
  465. case _S_token_bracket_begin:
  466. ostr << "bracket-begin\n";
  467. break;
  468. case _S_token_bracket_neg_begin:
  469. ostr << "bracket-neg-begin\n";
  470. break;
  471. case _S_token_bracket_end:
  472. ostr << "bracket-end\n";
  473. break;
  474. case _S_token_char_class_name:
  475. ostr << "char-class-name \"" << _M_value << "\"\n";
  476. break;
  477. case _S_token_closure0:
  478. ostr << "closure0\n";
  479. break;
  480. case _S_token_closure1:
  481. ostr << "closure1\n";
  482. break;
  483. case _S_token_collsymbol:
  484. ostr << "collsymbol \"" << _M_value << "\"\n";
  485. break;
  486. case _S_token_comma:
  487. ostr << "comma\n";
  488. break;
  489. case _S_token_dup_count:
  490. ostr << "dup count: " << _M_value << "\n";
  491. break;
  492. case _S_token_eof:
  493. ostr << "EOF\n";
  494. break;
  495. case _S_token_equiv_class_name:
  496. ostr << "equiv-class-name \"" << _M_value << "\"\n";
  497. break;
  498. case _S_token_interval_begin:
  499. ostr << "interval begin\n";
  500. break;
  501. case _S_token_interval_end:
  502. ostr << "interval end\n";
  503. break;
  504. case _S_token_line_begin:
  505. ostr << "line begin\n";
  506. break;
  507. case _S_token_line_end:
  508. ostr << "line end\n";
  509. break;
  510. case _S_token_opt:
  511. ostr << "opt\n";
  512. break;
  513. case _S_token_or:
  514. ostr << "or\n";
  515. break;
  516. case _S_token_ord_char:
  517. ostr << "ordinary character: \"" << _M_value << "\"\n";
  518. break;
  519. case _S_token_subexpr_begin:
  520. ostr << "subexpr begin\n";
  521. break;
  522. case _S_token_subexpr_no_group_begin:
  523. ostr << "no grouping subexpr begin\n";
  524. break;
  525. case _S_token_subexpr_lookahead_begin:
  526. ostr << "lookahead subexpr begin\n";
  527. break;
  528. case _S_token_subexpr_end:
  529. ostr << "subexpr end\n";
  530. break;
  531. case _S_token_unknown:
  532. ostr << "-- unknown token --\n";
  533. break;
  534. case _S_token_oct_num:
  535. ostr << "oct number " << _M_value << "\n";
  536. break;
  537. case _S_token_hex_num:
  538. ostr << "hex number " << _M_value << "\n";
  539. break;
  540. case _S_token_quoted_class:
  541. ostr << "quoted class " << "\\" << _M_value << "\n";
  542. break;
  543. default:
  544. _GLIBCXX_DEBUG_ASSERT(false);
  545. }
  546. return ostr;
  547. }
  548. #endif
  549. } // namespace __detail
  550. _GLIBCXX_END_NAMESPACE_VERSION
  551. } // namespace