1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2014 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_compiler.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 34 // (http://swtch.com/~rsc/regexp/regexp1.html"), 35 // but doesn't strictly follow it. 36 // 37 // When compiling, states are *chained* instead of tree- or graph-constructed. 38 // It's more like structured programs: there's if statement and loop statement. 39 // 40 // For alternative structure(say "a|b"), aka "if statement", two branchs should 41 // be constructed. However, these two shall merge to an "end_tag" at the end of 42 // this operator: 43 // 44 // branch1 45 // / \ 46 // => begin_tag end_tag => 47 // \ / 48 // branch2 49 // 50 // This is the difference between this implementation and that in Russ's 51 // article. 52 // 53 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 54 // All dummy node will be eliminated at the end of compiling process. 55 56 namespace std _GLIBCXX_VISIBILITY(default) 57 { 58 namespace __detail 59 { 60 _GLIBCXX_BEGIN_NAMESPACE_VERSION 61 62 template<typename _TraitsT> 63 _Compiler<_TraitsT>:: 64 _Compiler(_IterT __b, _IterT __e, 65 const _TraitsT& __traits, _FlagT __flags) 66 : _M_flags((__flags 67 & (regex_constants::ECMAScript 68 | regex_constants::basic 69 | regex_constants::extended 70 | regex_constants::grep 71 | regex_constants::egrep 72 | regex_constants::awk)) 73 ? __flags 74 : __flags | regex_constants::ECMAScript), 75 _M_traits(__traits), 76 _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())), 77 _M_scanner(__b, __e, _M_flags, _M_traits.getloc()), 78 _M_nfa(_M_flags) 79 { 80 _StateSeqT __r(_M_nfa, _M_nfa._M_start()); 81 __r._M_append(_M_nfa._M_insert_subexpr_begin()); 82 this->_M_disjunction(); 83 if (!_M_match_token(_ScannerT::_S_token_eof)) 84 __throw_regex_error(regex_constants::error_paren); 85 __r._M_append(_M_pop()); 86 _GLIBCXX_DEBUG_ASSERT(_M_stack.empty()); 87 __r._M_append(_M_nfa._M_insert_subexpr_end()); 88 __r._M_append(_M_nfa._M_insert_accept()); 89 _M_nfa._M_eliminate_dummy(); 90 } 91 92 template<typename _TraitsT> 93 void 94 _Compiler<_TraitsT>:: 95 _M_disjunction() 96 { 97 this->_M_alternative(); 98 while (_M_match_token(_ScannerT::_S_token_or)) 99 { 100 _StateSeqT __alt1 = _M_pop(); 101 this->_M_alternative(); 102 _StateSeqT __alt2 = _M_pop(); 103 auto __end = _M_nfa._M_insert_dummy(); 104 __alt1._M_append(__end); 105 __alt2._M_append(__end); 106 _M_stack.push(_StateSeqT(_M_nfa, 107 _M_nfa._M_insert_alt(__alt1._M_start, 108 __alt2._M_start, false), 109 __end)); 110 } 111 } 112 113 template<typename _TraitsT> 114 void 115 _Compiler<_TraitsT>:: 116 _M_alternative() 117 { 118 if (this->_M_term()) 119 { 120 _StateSeqT __re = _M_pop(); 121 this->_M_alternative(); 122 __re._M_append(_M_pop()); 123 _M_stack.push(__re); 124 } 125 else 126 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); 127 } 128 129 template<typename _TraitsT> 130 bool 131 _Compiler<_TraitsT>:: 132 _M_term() 133 { 134 if (this->_M_assertion()) 135 return true; 136 if (this->_M_atom()) 137 { 138 while (this->_M_quantifier()); 139 return true; 140 } 141 return false; 142 } 143 144 template<typename _TraitsT> 145 bool 146 _Compiler<_TraitsT>:: 147 _M_assertion() 148 { 149 if (_M_match_token(_ScannerT::_S_token_line_begin)) 150 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin())); 151 else if (_M_match_token(_ScannerT::_S_token_line_end)) 152 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end())); 153 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 154 // _M_value[0] == 'n' means it's negtive, say "not word boundary". 155 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. 156 _M_insert_word_bound(_M_value[0] == 'n'))); 157 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 158 { 159 auto __neg = _M_value[0] == 'n'; 160 this->_M_disjunction(); 161 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 162 __throw_regex_error(regex_constants::error_paren); 163 auto __tmp = _M_pop(); 164 __tmp._M_append(_M_nfa._M_insert_accept()); 165 _M_stack.push( 166 _StateSeqT( 167 _M_nfa, 168 _M_nfa._M_insert_lookahead(__tmp._M_start, __neg))); 169 } 170 else 171 return false; 172 return true; 173 } 174 175 template<typename _TraitsT> 176 bool 177 _Compiler<_TraitsT>:: 178 _M_quantifier() 179 { 180 bool __neg = (_M_flags & regex_constants::ECMAScript); 181 auto __init = [this, &__neg]() 182 { 183 if (_M_stack.empty()) 184 __throw_regex_error(regex_constants::error_badrepeat); 185 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 186 }; 187 if (_M_match_token(_ScannerT::_S_token_closure0)) 188 { 189 __init(); 190 auto __e = _M_pop(); 191 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, 192 __e._M_start, __neg)); 193 __e._M_append(__r); 194 _M_stack.push(__r); 195 } 196 else if (_M_match_token(_ScannerT::_S_token_closure1)) 197 { 198 __init(); 199 auto __e = _M_pop(); 200 __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start, 201 __neg)); 202 _M_stack.push(__e); 203 } 204 else if (_M_match_token(_ScannerT::_S_token_opt)) 205 { 206 __init(); 207 auto __e = _M_pop(); 208 auto __end = _M_nfa._M_insert_dummy(); 209 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, 210 __e._M_start, __neg)); 211 __e._M_append(__end); 212 __r._M_append(__end); 213 _M_stack.push(__r); 214 } 215 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 216 { 217 if (_M_stack.empty()) 218 __throw_regex_error(regex_constants::error_badrepeat); 219 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 220 __throw_regex_error(regex_constants::error_badbrace); 221 _StateSeqT __r(_M_pop()); 222 _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy()); 223 long __min_rep = _M_cur_int_value(10); 224 bool __infi = false; 225 long __n; 226 227 // {3 228 if (_M_match_token(_ScannerT::_S_token_comma)) 229 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 230 __n = _M_cur_int_value(10) - __min_rep; 231 else 232 __infi = true; 233 else 234 __n = 0; 235 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 236 __throw_regex_error(regex_constants::error_brace); 237 238 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 239 240 for (long __i = 0; __i < __min_rep; ++__i) 241 __e._M_append(__r._M_clone()); 242 243 if (__infi) 244 { 245 auto __tmp = __r._M_clone(); 246 _StateSeqT __s(_M_nfa, 247 _M_nfa._M_insert_alt(_S_invalid_state_id, 248 __tmp._M_start, __neg)); 249 __tmp._M_append(__s); 250 __e._M_append(__s); 251 } 252 else 253 { 254 if (__n < 0) 255 __throw_regex_error(regex_constants::error_badbrace); 256 auto __end = _M_nfa._M_insert_dummy(); 257 // _M_alt is the "match more" branch, and _M_next is the 258 // "match less" one. Switch _M_alt and _M_next of all created 259 // nodes. This is a hacking but IMO works well. 260 std::stack<_StateIdT> __stack; 261 for (long __i = 0; __i < __n; ++__i) 262 { 263 auto __tmp = __r._M_clone(); 264 auto __alt = _M_nfa._M_insert_alt(__tmp._M_start, 265 __end, __neg); 266 __stack.push(__alt); 267 __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end)); 268 } 269 __e._M_append(__end); 270 while (!__stack.empty()) 271 { 272 auto& __tmp = _M_nfa[__stack.top()]; 273 __stack.pop(); 274 swap(__tmp._M_next, __tmp._M_alt); 275 } 276 } 277 _M_stack.push(__e); 278 } 279 else 280 return false; 281 return true; 282 } 283 284 #define __INSERT_REGEX_MATCHER(__func, args...)\ 285 do\ 286 if (!(_M_flags & regex_constants::icase))\ 287 if (!(_M_flags & regex_constants::collate))\ 288 __func<false, false>(args);\ 289 else\ 290 __func<false, true>(args);\ 291 else\ 292 if (!(_M_flags & regex_constants::collate))\ 293 __func<true, false>(args);\ 294 else\ 295 __func<true, true>(args);\ 296 while (false) 297 298 template<typename _TraitsT> 299 bool 300 _Compiler<_TraitsT>:: 301 _M_atom() 302 { 303 if (_M_match_token(_ScannerT::_S_token_anychar)) 304 { 305 if (!(_M_flags & regex_constants::ECMAScript)) 306 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 307 else 308 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 309 } 310 else if (_M_try_char()) 311 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 312 else if (_M_match_token(_ScannerT::_S_token_backref)) 313 _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. 314 _M_insert_backref(_M_cur_int_value(10)))); 315 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 316 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 317 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 318 { 319 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy()); 320 this->_M_disjunction(); 321 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 322 __throw_regex_error(regex_constants::error_paren); 323 __r._M_append(_M_pop()); 324 _M_stack.push(__r); 325 } 326 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 327 { 328 _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin()); 329 this->_M_disjunction(); 330 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 331 __throw_regex_error(regex_constants::error_paren); 332 __r._M_append(_M_pop()); 333 __r._M_append(_M_nfa._M_insert_subexpr_end()); 334 _M_stack.push(__r); 335 } 336 else if (!_M_bracket_expression()) 337 return false; 338 return true; 339 } 340 341 template<typename _TraitsT> 342 bool 343 _Compiler<_TraitsT>:: 344 _M_bracket_expression() 345 { 346 bool __neg = 347 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 348 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 349 return false; 350 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 351 return true; 352 } 353 #undef __INSERT_REGEX_MATCHER 354 355 template<typename _TraitsT> 356 template<bool __icase, bool __collate> 357 void 358 _Compiler<_TraitsT>:: 359 _M_insert_any_matcher_ecma() 360 { 361 _M_stack.push(_StateSeqT(_M_nfa, 362 _M_nfa._M_insert_matcher 363 (_AnyMatcher<_TraitsT, true, __icase, __collate> 364 (_M_traits)))); 365 } 366 367 template<typename _TraitsT> 368 template<bool __icase, bool __collate> 369 void 370 _Compiler<_TraitsT>:: 371 _M_insert_any_matcher_posix() 372 { 373 _M_stack.push(_StateSeqT(_M_nfa, 374 _M_nfa._M_insert_matcher 375 (_AnyMatcher<_TraitsT, false, __icase, __collate> 376 (_M_traits)))); 377 } 378 379 template<typename _TraitsT> 380 template<bool __icase, bool __collate> 381 void 382 _Compiler<_TraitsT>:: 383 _M_insert_char_matcher() 384 { 385 _M_stack.push(_StateSeqT(_M_nfa, 386 _M_nfa._M_insert_matcher 387 (_CharMatcher<_TraitsT, __icase, __collate> 388 (_M_value[0], _M_traits)))); 389 } 390 391 template<typename _TraitsT> 392 template<bool __icase, bool __collate> 393 void 394 _Compiler<_TraitsT>:: 395 _M_insert_character_class_matcher() 396 { 397 _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1); 398 _BracketMatcher<_TraitsT, __icase, __collate> __matcher 399 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 400 __matcher._M_add_character_class(_M_value, false); 401 __matcher._M_ready(); 402 _M_stack.push(_StateSeqT(_M_nfa, 403 _M_nfa._M_insert_matcher(std::move(__matcher)))); 404 } 405 406 template<typename _TraitsT> 407 template<bool __icase, bool __collate> 408 void 409 _Compiler<_TraitsT>:: 410 _M_insert_bracket_matcher(bool __neg) 411 { 412 _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 413 while (!_M_match_token(_ScannerT::_S_token_bracket_end)) 414 _M_expression_term(__matcher); 415 __matcher._M_ready(); 416 _M_stack.push(_StateSeqT(_M_nfa, 417 _M_nfa._M_insert_matcher(std::move(__matcher)))); 418 } 419 420 template<typename _TraitsT> 421 template<bool __icase, bool __collate> 422 void 423 _Compiler<_TraitsT>:: 424 _M_expression_term(_BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 425 { 426 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 427 __matcher._M_add_collating_element(_M_value); 428 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 429 __matcher._M_add_equivalence_class(_M_value); 430 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 431 __matcher._M_add_character_class(_M_value, false); 432 else if (_M_try_char()) // [a 433 { 434 auto __ch = _M_value[0]; 435 if (_M_try_char()) 436 { 437 if (_M_value[0] == '-') // [a- 438 { 439 if (_M_try_char()) // [a-z] 440 { 441 __matcher._M_make_range(__ch, _M_value[0]); 442 return; 443 } 444 // If the dash is the last character in the bracket 445 // expression, it is not special. 446 if (_M_scanner._M_get_token() 447 != _ScannerT::_S_token_bracket_end) 448 __throw_regex_error(regex_constants::error_range); 449 } 450 __matcher._M_add_char(_M_value[0]); 451 } 452 __matcher._M_add_char(__ch); 453 } 454 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 455 __matcher._M_add_character_class(_M_value, 456 _M_ctype.is(_CtypeT::upper, 457 _M_value[0])); 458 else 459 __throw_regex_error(regex_constants::error_brack); 460 } 461 462 template<typename _TraitsT> 463 bool 464 _Compiler<_TraitsT>:: 465 _M_try_char() 466 { 467 bool __is_char = false; 468 if (_M_match_token(_ScannerT::_S_token_oct_num)) 469 { 470 __is_char = true; 471 _M_value.assign(1, _M_cur_int_value(8)); 472 } 473 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 474 { 475 __is_char = true; 476 _M_value.assign(1, _M_cur_int_value(16)); 477 } 478 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 479 __is_char = true; 480 return __is_char; 481 } 482 483 template<typename _TraitsT> 484 bool 485 _Compiler<_TraitsT>:: 486 _M_match_token(_TokenT token) 487 { 488 if (token == _M_scanner._M_get_token()) 489 { 490 _M_value = _M_scanner._M_get_value(); 491 _M_scanner._M_advance(); 492 return true; 493 } 494 return false; 495 } 496 497 template<typename _TraitsT> 498 int 499 _Compiler<_TraitsT>:: 500 _M_cur_int_value(int __radix) 501 { 502 long __v = 0; 503 for (typename _StringT::size_type __i = 0; 504 __i < _M_value.length(); ++__i) 505 __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 506 return __v; 507 } 508 509 template<typename _TraitsT, bool __icase, bool __collate> 510 bool 511 _BracketMatcher<_TraitsT, __icase, __collate>:: 512 _M_apply(_CharT __ch, false_type) const 513 { 514 bool __ret = false; 515 if (std::find(_M_char_set.begin(), _M_char_set.end(), 516 _M_translator._M_translate(__ch)) 517 != _M_char_set.end()) 518 __ret = true; 519 else 520 { 521 auto __s = _M_translator._M_transform(__ch); 522 for (auto& __it : _M_range_set) 523 if (__it.first <= __s && __s <= __it.second) 524 { 525 __ret = true; 526 break; 527 } 528 if (_M_traits.isctype(__ch, _M_class_set)) 529 __ret = true; 530 else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 531 _M_traits.transform_primary(&__ch, &__ch+1)) 532 != _M_equiv_set.end()) 533 __ret = true; 534 else 535 { 536 for (auto& __it : _M_neg_class_set) 537 if (!_M_traits.isctype(__ch, __it)) 538 { 539 __ret = true; 540 break; 541 } 542 } 543 } 544 if (_M_is_non_matching) 545 return !__ret; 546 else 547 return __ret; 548 } 549 550 _GLIBCXX_END_NAMESPACE_VERSION 551 } // namespace __detail 552 } // namespace 553