// regex_scanner.h
  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2022 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_scanner.h
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. namespace std _GLIBCXX_VISIBILITY(default)
  26. {
  27. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  28. namespace __detail
  29. {
  30. /**
  31. * @addtogroup regex-detail
  32. * @{
  33. */
  /**
   * @brief Character-type-independent part of the regex scanner.
   *
   * Declares the token and scanner-state enumerations, stores the syntax
   * flags, and owns the lookup tables (single-character tokens, escape
   * sequences, special-character sets) that derived scanners consult.
   * The constructor picks the escape table and the special-character set
   * matching the grammar named in the flags.
   */
  struct _ScannerBase
  {
  public:
    /// Token types returned from the scanner.
    enum _TokenT : unsigned
    {
      _S_token_anychar,
      _S_token_ord_char,
      _S_token_oct_num,
      _S_token_hex_num,
      _S_token_backref,
      _S_token_subexpr_begin,
      _S_token_subexpr_no_group_begin,
      _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
      _S_token_subexpr_end,
      _S_token_bracket_begin,
      _S_token_bracket_neg_begin,
      _S_token_bracket_end,
      _S_token_interval_begin,
      _S_token_interval_end,
      _S_token_quoted_class,
      _S_token_char_class_name,
      _S_token_collsymbol,
      _S_token_equiv_class_name,
      _S_token_opt,
      _S_token_or,
      _S_token_closure0,
      _S_token_closure1,
      _S_token_line_begin,
      _S_token_line_end,
      _S_token_word_bound, // neg if _M_value[0] == 'n'
      _S_token_comma,
      _S_token_dup_count,
      _S_token_eof,
      _S_token_bracket_dash,
      _S_token_unknown = -1u
    };

  protected:
    typedef std::regex_constants::syntax_option_type _FlagT;

    /// Scanner modes; the constructor always starts in _S_state_normal.
    enum _StateT
    {
      _S_state_normal,
      _S_state_in_brace,
      _S_state_in_bracket,
    };

  protected:
    /**
     * @brief Set up the scanner tables for the grammar selected by @p __flags.
     * @param __flags  Syntax options; exactly which grammar bit is set
     *                 decides the tables chosen below.
     *
     * Escape table: the ECMAScript table when the ECMAScript bit is set,
     * otherwise the awk table.
     *
     * Special characters, first match wins: ECMAScript, basic, extended,
     * grep (basic set plus newline), egrep (extended set plus newline),
     * awk (extended set).  If none of these bits is set the pointer is
     * null and the assertion in the constructor body fires.
     *
     * The initialisers follow member declaration order; _M_flags must be
     * initialised before the _M_is_ecma() calls that read it.
     */
    _ScannerBase(_FlagT __flags)
    : _M_state(_S_state_normal),
    _M_flags(__flags),
    // Give the current token a defined value so it is never read while
    // indeterminate before a derived scanner assigns the first token.
    _M_token(_S_token_unknown),
    _M_escape_tbl(_M_is_ecma()
		  ? _M_ecma_escape_tbl
		  : _M_awk_escape_tbl),
    _M_spec_char(_M_is_ecma()
		 ? _M_ecma_spec_char
		 : _M_flags & std::regex_constants::basic
		 ? _M_basic_spec_char
		 : _M_flags & std::regex_constants::extended
		 ? _M_extended_spec_char
		 : _M_flags & std::regex_constants::grep
		 ? ".[\\*^$\n"
		 : _M_flags & std::regex_constants::egrep
		 ? ".[\\()*+?{|^$\n"
		 : _M_flags & std::regex_constants::awk
		 ? _M_extended_spec_char
		 : nullptr),
    _M_at_bracket_start(false)
    { __glibcxx_assert(_M_spec_char); }

  protected:
    /**
     * @brief Look up @p __c in the active escape table.
     * @param __c  Character to search for among the table keys.
     * @return Pointer to the mapped character stored in the table, or
     *         nullptr if @p __c is not a key.
     *
     * The table is scanned linearly up to its terminating entry, whose
     * key is '\0'; consequently '\0' itself is never reported as found.
     */
    const char*
    _M_find_escape(char __c) const
    {
      for (auto __it = _M_escape_tbl; __it->first != '\0'; ++__it)
	if (__it->first == __c)
	  return &__it->second;
      return nullptr;
    }

    /// True if the ECMAScript bit is set in the flags.
    bool
    _M_is_ecma() const
    { return _M_flags & std::regex_constants::ECMAScript; }

    /// True if either the basic or the grep bit is set.
    bool
    _M_is_basic() const
    {
      return _M_flags & (std::regex_constants::basic
			 | std::regex_constants::grep);
    }

    /// True if any of the extended, egrep or awk bits is set.
    bool
    _M_is_extended() const
    {
      return _M_flags & (std::regex_constants::extended
			 | std::regex_constants::egrep
			 | std::regex_constants::awk);
    }

    /// True if either the grep or the egrep bit is set.
    bool
    _M_is_grep() const
    {
      return _M_flags & (std::regex_constants::grep
			 | std::regex_constants::egrep);
    }

    /// True if the awk bit is set.
    bool
    _M_is_awk() const
    { return _M_flags & std::regex_constants::awk; }

  protected:
    // TODO: Make them static in the next abi change.
    // Until then these tables are per-object data members; their order
    // and sizes are part of the object layout and must not change.

    /// Single characters and the token each one stands for.  The last
    /// entry has key '\0' (presumably the end marker, as in the escape
    /// tables below).
    const std::pair<char, _TokenT> _M_token_tbl[9] =
      {
	{'^', _S_token_line_begin},
	{'$', _S_token_line_end},
	{'.', _S_token_anychar},
	{'*', _S_token_closure0},
	{'+', _S_token_closure1},
	{'?', _S_token_opt},
	{'|', _S_token_or},
	{'\n', _S_token_or}, // grep and egrep
	{'\0', _S_token_or},
      };

    /// ECMAScript escape letters and the characters they denote;
    /// terminated by an entry whose key is '\0'.
    const std::pair<char, char> _M_ecma_escape_tbl[8] =
      {
	{'0', '\0'},
	{'b', '\b'},
	{'f', '\f'},
	{'n', '\n'},
	{'r', '\r'},
	{'t', '\t'},
	{'v', '\v'},
	{'\0', '\0'},
      };

    /// awk escape letters and the characters they denote; terminated by
    /// an entry whose key is '\0'.
    const std::pair<char, char> _M_awk_escape_tbl[11] =
      {
	{'"', '"'},
	{'/', '/'},
	{'\\', '\\'},
	{'a', '\a'},
	{'b', '\b'},
	{'f', '\f'},
	{'n', '\n'},
	{'r', '\r'},
	{'t', '\t'},
	{'v', '\v'},
	{'\0', '\0'},
      };

    /// Special-character sets for the individual grammars.
    const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
    const char* _M_basic_spec_char = ".[\\*^$";
    const char* _M_extended_spec_char = ".[\\()*+?{|^$";

    _StateT                       _M_state;            ///< Current mode.
    _FlagT                        _M_flags;            ///< Syntax options.
    _TokenT                       _M_token;            ///< Current token.
    const std::pair<char, char>*  _M_escape_tbl;       ///< Active escapes.
    const char*                   _M_spec_char;        ///< Active specials.
    bool                          _M_at_bracket_start; ///< Starts false.
  };
  179. /**
  180. * @brief Scans an input range for regex tokens.
  181. *
  182. * The %_Scanner class interprets the regular expression pattern in
  183. * the input range passed to its constructor as a sequence of parse
  184. * tokens passed to the regular expression compiler. The sequence
  185. * of tokens provided depends on the flag settings passed to the
  186. * constructor: different regular expression grammars will interpret
  187. * the same input pattern in syntactically different ways.
  188. */
  189. template<typename _CharT>
  190. class _Scanner
  191. : public _ScannerBase
  192. {
  193. public:
  194. typedef std::basic_string<_CharT> _StringT;
  195. typedef regex_constants::syntax_option_type _FlagT;
  196. typedef const std::ctype<_CharT> _CtypeT;
  197. _Scanner(const _CharT* __begin, const _CharT* __end,
  198. _FlagT __flags, std::locale __loc);
  199. void
  200. _M_advance();
  201. _TokenT
  202. _M_get_token() const noexcept
  203. { return _M_token; }
  204. const _StringT&
  205. _M_get_value() const noexcept
  206. { return _M_value; }
  207. #ifdef _GLIBCXX_DEBUG
  208. std::ostream&
  209. _M_print(std::ostream&);
  210. #endif
  211. private:
  212. void
  213. _M_scan_normal();
  214. void
  215. _M_scan_in_bracket();
  216. void
  217. _M_scan_in_brace();
  218. void
  219. _M_eat_escape_ecma();
  220. void
  221. _M_eat_escape_posix();
  222. void
  223. _M_eat_escape_awk();
  224. void
  225. _M_eat_class(char);
  226. const _CharT* _M_current;
  227. const _CharT* _M_end;
  228. _CtypeT& _M_ctype;
  229. _StringT _M_value;
  230. void (_Scanner::* _M_eat_escape)();
  231. };
  232. ///@} regex-detail
  233. } // namespace __detail
  234. _GLIBCXX_END_NAMESPACE_VERSION
  235. } // namespace std
  236. #include <bits/regex_scanner.tcc>