1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "compiler.h" 32 #include "execution.h" 33 #include "factory.h" 34 #include "jsregexp.h" 35 #include "platform.h" 36 #include "string-search.h" 37 #include "runtime.h" 38 #include "compilation-cache.h" 39 #include "string-stream.h" 40 #include "parser.h" 41 #include "regexp-macro-assembler.h" 42 #include "regexp-macro-assembler-tracer.h" 43 #include "regexp-macro-assembler-irregexp.h" 44 #include "regexp-stack.h" 45 46 #ifndef V8_INTERPRETED_REGEXP 47 #if V8_TARGET_ARCH_IA32 48 #include "ia32/regexp-macro-assembler-ia32.h" 49 #elif V8_TARGET_ARCH_X64 50 #include "x64/regexp-macro-assembler-x64.h" 51 #elif V8_TARGET_ARCH_ARM 52 #include "arm/regexp-macro-assembler-arm.h" 53 #elif V8_TARGET_ARCH_MIPS 54 #include "mips/regexp-macro-assembler-mips.h" 55 #else 56 #error Unsupported target architecture. 57 #endif 58 #endif 59 60 #include "interpreter-irregexp.h" 61 62 63 namespace v8 { 64 namespace internal { 65 66 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor, 67 Handle<String> pattern, 68 Handle<String> flags, 69 bool* has_pending_exception) { 70 // Call the construct code with 2 arguments. 71 Handle<Object> argv[] = { pattern, flags }; 72 return Execution::New(constructor, ARRAY_SIZE(argv), argv, 73 has_pending_exception); 74 } 75 76 77 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 78 int flags = JSRegExp::NONE; 79 for (int i = 0; i < str->length(); i++) { 80 switch (str->Get(i)) { 81 case 'i': 82 flags |= JSRegExp::IGNORE_CASE; 83 break; 84 case 'g': 85 flags |= JSRegExp::GLOBAL; 86 break; 87 case 'm': 88 flags |= JSRegExp::MULTILINE; 89 break; 90 } 91 } 92 return JSRegExp::Flags(flags); 93 } 94 95 96 static inline void ThrowRegExpException(Handle<JSRegExp> re, 97 Handle<String> pattern, 98 Handle<String> error_text, 99 const char* message) { 100 Isolate* isolate = re->GetIsolate(); 101 Factory* factory = isolate->factory(); 102 Handle<FixedArray> elements = factory->NewFixedArray(2); 103 elements->set(0, *pattern); 104 elements->set(1, *error_text); 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 107 isolate->Throw(*regexp_err); 108 } 109 110 111 // Generic RegExp methods. Dispatches to implementation specific methods. 112 113 114 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 115 Handle<String> pattern, 116 Handle<String> flag_str) { 117 Isolate* isolate = re->GetIsolate(); 118 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 119 CompilationCache* compilation_cache = isolate->compilation_cache(); 120 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags); 121 bool in_cache = !cached.is_null(); 122 LOG(isolate, RegExpCompileEvent(re, in_cache)); 123 124 Handle<Object> result; 125 if (in_cache) { 126 re->set_data(*cached); 127 return re; 128 } 129 pattern = FlattenGetString(pattern); 130 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); 131 PostponeInterruptsScope postpone(isolate); 132 RegExpCompileData parse_result; 133 FlatStringReader reader(isolate, pattern); 134 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 135 &parse_result)) { 136 // Throw an exception if we fail to parse the pattern. 137 ThrowRegExpException(re, 138 pattern, 139 parse_result.error, 140 "malformed_regexp"); 141 return Handle<Object>::null(); 142 } 143 144 if (parse_result.simple && !flags.is_ignore_case()) { 145 // Parse-tree is a single atom that is equal to the pattern. 146 AtomCompile(re, pattern, flags, pattern); 147 } else if (parse_result.tree->IsAtom() && 148 !flags.is_ignore_case() && 149 parse_result.capture_count == 0) { 150 RegExpAtom* atom = parse_result.tree->AsAtom(); 151 Vector<const uc16> atom_pattern = atom->data(); 152 Handle<String> atom_string = 153 isolate->factory()->NewStringFromTwoByte(atom_pattern); 154 AtomCompile(re, pattern, flags, atom_string); 155 } else { 156 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 157 } 158 ASSERT(re->data()->IsFixedArray()); 159 // Compilation succeeded so the data is set on the regexp 160 // and we can store it in the cache. 161 Handle<FixedArray> data(FixedArray::cast(re->data())); 162 compilation_cache->PutRegExp(pattern, flags, data); 163 164 return re; 165 } 166 167 168 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 169 Handle<String> subject, 170 int index, 171 Handle<JSArray> last_match_info) { 172 switch (regexp->TypeTag()) { 173 case JSRegExp::ATOM: 174 return AtomExec(regexp, subject, index, last_match_info); 175 case JSRegExp::IRREGEXP: { 176 Handle<Object> result = 177 IrregexpExec(regexp, subject, index, last_match_info); 178 ASSERT(!result.is_null() || 179 regexp->GetIsolate()->has_pending_exception()); 180 return result; 181 } 182 default: 183 UNREACHABLE(); 184 return Handle<Object>::null(); 185 } 186 } 187 188 189 // RegExp Atom implementation: Simple string search using indexOf. 190 191 192 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 193 Handle<String> pattern, 194 JSRegExp::Flags flags, 195 Handle<String> match_pattern) { 196 re->GetIsolate()->factory()->SetRegExpAtomData(re, 197 JSRegExp::ATOM, 198 pattern, 199 flags, 200 match_pattern); 201 } 202 203 204 static void SetAtomLastCapture(FixedArray* array, 205 String* subject, 206 int from, 207 int to) { 208 NoHandleAllocation no_handles; 209 RegExpImpl::SetLastCaptureCount(array, 2); 210 RegExpImpl::SetLastSubject(array, subject); 211 RegExpImpl::SetLastInput(array, subject); 212 RegExpImpl::SetCapture(array, 0, from); 213 RegExpImpl::SetCapture(array, 1, to); 214 } 215 216 217 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 218 Handle<String> subject, 219 int index, 220 Handle<JSArray> last_match_info) { 221 Isolate* isolate = re->GetIsolate(); 222 223 ASSERT(0 <= index); 224 ASSERT(index <= subject->length()); 225 226 if (!subject->IsFlat()) FlattenString(subject); 227 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid 228 229 String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)); 230 int needle_len = needle->length(); 231 ASSERT(needle->IsFlat()); 232 233 if (needle_len != 0) { 234 if (index + needle_len > subject->length()) { 235 return isolate->factory()->null_value(); 236 } 237 238 String::FlatContent needle_content = needle->GetFlatContent(); 239 String::FlatContent subject_content = subject->GetFlatContent(); 240 ASSERT(needle_content.IsFlat()); 241 ASSERT(subject_content.IsFlat()); 242 // dispatch on type of strings 243 index = (needle_content.IsAscii() 244 ? (subject_content.IsAscii() 245 ? SearchString(isolate, 246 subject_content.ToAsciiVector(), 247 needle_content.ToAsciiVector(), 248 index) 249 : SearchString(isolate, 250 subject_content.ToUC16Vector(), 251 needle_content.ToAsciiVector(), 252 index)) 253 : (subject_content.IsAscii() 254 ? SearchString(isolate, 255 subject_content.ToAsciiVector(), 256 needle_content.ToUC16Vector(), 257 index) 258 : SearchString(isolate, 259 subject_content.ToUC16Vector(), 260 needle_content.ToUC16Vector(), 261 index))); 262 if (index == -1) return isolate->factory()->null_value(); 263 } 264 ASSERT(last_match_info->HasFastElements()); 265 266 { 267 NoHandleAllocation no_handles; 268 FixedArray* array = FixedArray::cast(last_match_info->elements()); 269 SetAtomLastCapture(array, *subject, index, index + needle_len); 270 } 271 return last_match_info; 272 } 273 274 275 // Irregexp implementation. 276 277 // Ensures that the regexp object contains a compiled version of the 278 // source for either ASCII or non-ASCII strings. 279 // If the compiled version doesn't already exist, it is compiled 280 // from the source pattern. 281 // If compilation fails, an exception is thrown and this function 282 // returns false. 283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { 284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 285 #ifdef V8_INTERPRETED_REGEXP 286 if (compiled_code->IsByteArray()) return true; 287 #else // V8_INTERPRETED_REGEXP (RegExp native code) 288 if (compiled_code->IsCode()) return true; 289 #endif 290 // We could potentially have marked this as flushable, but have kept 291 // a saved version if we did not flush it yet. 292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 293 if (saved_code->IsCode()) { 294 // Reinstate the code in the original place. 295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); 296 ASSERT(compiled_code->IsSmi()); 297 return true; 298 } 299 return CompileIrregexp(re, is_ascii); 300 } 301 302 303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 304 bool is_ascii, 305 Handle<String> error_message, 306 Isolate* isolate) { 307 Factory* factory = isolate->factory(); 308 Handle<FixedArray> elements = factory->NewFixedArray(2); 309 elements->set(0, re->Pattern()); 310 elements->set(1, *error_message); 311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 312 Handle<Object> regexp_err = 313 factory->NewSyntaxError("malformed_regexp", array); 314 isolate->Throw(*regexp_err); 315 return false; 316 } 317 318 319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { 320 // Compile the RegExp. 321 Isolate* isolate = re->GetIsolate(); 322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); 323 PostponeInterruptsScope postpone(isolate); 324 // If we had a compilation error the last time this is saved at the 325 // saved code index. 326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 327 // When arriving here entry can only be a smi, either representing an 328 // uncompiled regexp, a previous compilation error, or code that has 329 // been flushed. 330 ASSERT(entry->IsSmi()); 331 int entry_value = Smi::cast(entry)->value(); 332 ASSERT(entry_value == JSRegExp::kUninitializedValue || 333 entry_value == JSRegExp::kCompilationErrorValue || 334 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 335 336 if (entry_value == JSRegExp::kCompilationErrorValue) { 337 // A previous compilation failed and threw an error which we store in 338 // the saved code index (we store the error message, not the actual 339 // error). Recreate the error object and throw it. 340 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 341 ASSERT(error_string->IsString()); 342 Handle<String> error_message(String::cast(error_string)); 343 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 344 return false; 345 } 346 347 JSRegExp::Flags flags = re->GetFlags(); 348 349 Handle<String> pattern(re->Pattern()); 350 if (!pattern->IsFlat()) FlattenString(pattern); 351 RegExpCompileData compile_data; 352 FlatStringReader reader(isolate, pattern); 353 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 354 &compile_data)) { 355 // Throw an exception if we fail to parse the pattern. 356 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 357 ThrowRegExpException(re, 358 pattern, 359 compile_data.error, 360 "malformed_regexp"); 361 return false; 362 } 363 RegExpEngine::CompilationResult result = 364 RegExpEngine::Compile(&compile_data, 365 flags.is_ignore_case(), 366 flags.is_multiline(), 367 pattern, 368 is_ascii); 369 if (result.error_message != NULL) { 370 // Unable to compile regexp. 371 Handle<String> error_message = 372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); 373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 374 return false; 375 } 376 377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 378 data->set(JSRegExp::code_index(is_ascii), result.code); 379 int register_max = IrregexpMaxRegisterCount(*data); 380 if (result.num_registers > register_max) { 381 SetIrregexpMaxRegisterCount(*data, result.num_registers); 382 } 383 384 return true; 385 } 386 387 388 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 389 return Smi::cast( 390 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 391 } 392 393 394 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 395 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 396 } 397 398 399 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 400 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 401 } 402 403 404 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 405 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 406 } 407 408 409 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 410 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 411 } 412 413 414 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 415 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 416 } 417 418 419 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 420 Handle<String> pattern, 421 JSRegExp::Flags flags, 422 int capture_count) { 423 // Initialize compiled code entries to null. 424 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 425 JSRegExp::IRREGEXP, 426 pattern, 427 flags, 428 capture_count); 429 } 430 431 432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 433 Handle<String> subject) { 434 if (!subject->IsFlat()) FlattenString(subject); 435 436 // Check the asciiness of the underlying storage. 437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); 438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; 439 440 #ifdef V8_INTERPRETED_REGEXP 441 // Byte-code regexp needs space allocated for all its registers. 442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); 443 #else // V8_INTERPRETED_REGEXP 444 // Native regexp only needs room to output captures. Registers are handled 445 // internally. 446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 447 #endif // V8_INTERPRETED_REGEXP 448 } 449 450 451 RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce( 452 Handle<JSRegExp> regexp, 453 Handle<String> subject, 454 int index, 455 Vector<int> output) { 456 Isolate* isolate = regexp->GetIsolate(); 457 458 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 459 460 ASSERT(index >= 0); 461 ASSERT(index <= subject->length()); 462 ASSERT(subject->IsFlat()); 463 464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); 465 466 #ifndef V8_INTERPRETED_REGEXP 467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 468 do { 469 EnsureCompiledIrregexp(regexp, is_ascii); 470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 471 NativeRegExpMacroAssembler::Result res = 472 NativeRegExpMacroAssembler::Match(code, 473 subject, 474 output.start(), 475 output.length(), 476 index, 477 isolate); 478 if (res != NativeRegExpMacroAssembler::RETRY) { 479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 480 isolate->has_pending_exception()); 481 STATIC_ASSERT( 482 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 483 STATIC_ASSERT( 484 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 485 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 486 == RE_EXCEPTION); 487 return static_cast<IrregexpResult>(res); 488 } 489 // If result is RETRY, the string has changed representation, and we 490 // must restart from scratch. 491 // In this case, it means we must make sure we are prepared to handle 492 // the, potentially, different subject (the string can switch between 493 // being internal and external, and even between being ASCII and UC16, 494 // but the characters are always the same). 495 IrregexpPrepare(regexp, subject); 496 is_ascii = subject->IsAsciiRepresentationUnderneath(); 497 } while (true); 498 UNREACHABLE(); 499 return RE_EXCEPTION; 500 #else // V8_INTERPRETED_REGEXP 501 502 ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp)); 503 // We must have done EnsureCompiledIrregexp, so we can get the number of 504 // registers. 505 int* register_vector = output.start(); 506 int number_of_capture_registers = 507 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 508 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 509 register_vector[i] = -1; 510 } 511 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); 512 513 IrregexpResult result = IrregexpInterpreter::Match(isolate, 514 byte_codes, 515 subject, 516 register_vector, 517 index); 518 if (result == RE_EXCEPTION) { 519 ASSERT(!isolate->has_pending_exception()); 520 isolate->StackOverflow(); 521 } 522 return result; 523 #endif // V8_INTERPRETED_REGEXP 524 } 525 526 527 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp, 528 Handle<String> subject, 529 int previous_index, 530 Handle<JSArray> last_match_info) { 531 Isolate* isolate = jsregexp->GetIsolate(); 532 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP); 533 534 // Prepare space for the return values. 535 #ifdef V8_INTERPRETED_REGEXP 536 #ifdef DEBUG 537 if (FLAG_trace_regexp_bytecodes) { 538 String* pattern = jsregexp->Pattern(); 539 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); 540 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString())); 541 } 542 #endif 543 #endif 544 int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject); 545 if (required_registers < 0) { 546 // Compiling failed with an exception. 547 ASSERT(isolate->has_pending_exception()); 548 return Handle<Object>::null(); 549 } 550 551 OffsetsVector registers(required_registers, isolate); 552 553 IrregexpResult res = RegExpImpl::IrregexpExecOnce( 554 jsregexp, subject, previous_index, Vector<int>(registers.vector(), 555 registers.length())); 556 if (res == RE_SUCCESS) { 557 int capture_register_count = 558 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2; 559 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead); 560 AssertNoAllocation no_gc; 561 int* register_vector = registers.vector(); 562 FixedArray* array = FixedArray::cast(last_match_info->elements()); 563 for (int i = 0; i < capture_register_count; i += 2) { 564 SetCapture(array, i, register_vector[i]); 565 SetCapture(array, i + 1, register_vector[i + 1]); 566 } 567 SetLastCaptureCount(array, capture_register_count); 568 SetLastSubject(array, *subject); 569 SetLastInput(array, *subject); 570 return last_match_info; 571 } 572 if (res == RE_EXCEPTION) { 573 ASSERT(isolate->has_pending_exception()); 574 return Handle<Object>::null(); 575 } 576 ASSERT(res == RE_FAILURE); 577 return isolate->factory()->null_value(); 578 } 579 580 581 // ------------------------------------------------------------------- 582 // Implementation of the Irregexp regular expression engine. 583 // 584 // The Irregexp regular expression engine is intended to be a complete 585 // implementation of ECMAScript regular expressions. It generates either 586 // bytecodes or native code. 587 588 // The Irregexp regexp engine is structured in three steps. 589 // 1) The parser generates an abstract syntax tree. See ast.cc. 590 // 2) From the AST a node network is created. The nodes are all 591 // subclasses of RegExpNode. The nodes represent states when 592 // executing a regular expression. Several optimizations are 593 // performed on the node network. 594 // 3) From the nodes we generate either byte codes or native code 595 // that can actually execute the regular expression (perform 596 // the search). The code generation step is described in more 597 // detail below. 598 599 // Code generation. 600 // 601 // The nodes are divided into four main categories. 602 // * Choice nodes 603 // These represent places where the regular expression can 604 // match in more than one way. For example on entry to an 605 // alternation (foo|bar) or a repetition (*, +, ? or {}). 606 // * Action nodes 607 // These represent places where some action should be 608 // performed. Examples include recording the current position 609 // in the input string to a register (in order to implement 610 // captures) or other actions on register for example in order 611 // to implement the counters needed for {} repetitions. 612 // * Matching nodes 613 // These attempt to match some element part of the input string. 614 // Examples of elements include character classes, plain strings 615 // or back references. 616 // * End nodes 617 // These are used to implement the actions required on finding 618 // a successful match or failing to find a match. 619 // 620 // The code generated (whether as byte codes or native code) maintains 621 // some state as it runs. This consists of the following elements: 622 // 623 // * The capture registers. Used for string captures. 624 // * Other registers. Used for counters etc. 625 // * The current position. 626 // * The stack of backtracking information. Used when a matching node 627 // fails to find a match and needs to try an alternative. 628 // 629 // Conceptual regular expression execution model: 630 // 631 // There is a simple conceptual model of regular expression execution 632 // which will be presented first. The actual code generated is a more 633 // efficient simulation of the simple conceptual model: 634 // 635 // * Choice nodes are implemented as follows: 636 // For each choice except the last { 637 // push current position 638 // push backtrack code location 639 // <generate code to test for choice> 640 // backtrack code location: 641 // pop current position 642 // } 643 // <generate code to test for last choice> 644 // 645 // * Actions nodes are generated as follows 646 // <push affected registers on backtrack stack> 647 // <generate code to perform action> 648 // push backtrack code location 649 // <generate code to test for following nodes> 650 // backtrack code location: 651 // <pop affected registers to restore their state> 652 // <pop backtrack location from stack and go to it> 653 // 654 // * Matching nodes are generated as follows: 655 // if input string matches at current position 656 // update current position 657 // <generate code to test for following nodes> 658 // else 659 // <pop backtrack location from stack and go to it> 660 // 661 // Thus it can be seen that the current position is saved and restored 662 // by the choice nodes, whereas the registers are saved and restored by 663 // by the action nodes that manipulate them. 664 // 665 // The other interesting aspect of this model is that nodes are generated 666 // at the point where they are needed by a recursive call to Emit(). If 667 // the node has already been code generated then the Emit() call will 668 // generate a jump to the previously generated code instead. In order to 669 // limit recursion it is possible for the Emit() function to put the node 670 // on a work list for later generation and instead generate a jump. The 671 // destination of the jump is resolved later when the code is generated. 672 // 673 // Actual regular expression code generation. 674 // 675 // Code generation is actually more complicated than the above. In order 676 // to improve the efficiency of the generated code some optimizations are 677 // performed 678 // 679 // * Choice nodes have 1-character lookahead. 680 // A choice node looks at the following character and eliminates some of 681 // the choices immediately based on that character. This is not yet 682 // implemented. 683 // * Simple greedy loops store reduced backtracking information. 684 // A quantifier like /.*foo/m will greedily match the whole input. It will 685 // then need to backtrack to a point where it can match "foo". The naive 686 // implementation of this would push each character position onto the 687 // backtracking stack, then pop them off one by one. This would use space 688 // proportional to the length of the input string. However since the "." 689 // can only match in one way and always has a constant length (in this case 690 // of 1) it suffices to store the current position on the top of the stack 691 // once. Matching now becomes merely incrementing the current position and 692 // backtracking becomes decrementing the current position and checking the 693 // result against the stored current position. This is faster and saves 694 // space. 695 // * The current state is virtualized. 696 // This is used to defer expensive operations until it is clear that they 697 // are needed and to generate code for a node more than once, allowing 698 // specialized an efficient versions of the code to be created. This is 699 // explained in the section below. 700 // 701 // Execution state virtualization. 702 // 703 // Instead of emitting code, nodes that manipulate the state can record their 704 // manipulation in an object called the Trace. The Trace object can record a 705 // current position offset, an optional backtrack code location on the top of 706 // the virtualized backtrack stack and some register changes. When a node is 707 // to be emitted it can flush the Trace or update it. Flushing the Trace 708 // will emit code to bring the actual state into line with the virtual state. 709 // Avoiding flushing the state can postpone some work (e.g. updates of capture 710 // registers). Postponing work can save time when executing the regular 711 // expression since it may be found that the work never has to be done as a 712 // failure to match can occur. In addition it is much faster to jump to a 713 // known backtrack code location than it is to pop an unknown backtrack 714 // location from the stack and jump there. 715 // 716 // The virtual state found in the Trace affects code generation. For example 717 // the virtual state contains the difference between the actual current 718 // position and the virtual current position, and matching code needs to use 719 // this offset to attempt a match in the correct location of the input 720 // string. Therefore code generated for a non-trivial trace is specialized 721 // to that trace. The code generator therefore has the ability to generate 722 // code for each node several times. In order to limit the size of the 723 // generated code there is an arbitrary limit on how many specialized sets of 724 // code may be generated for a given node. If the limit is reached, the 725 // trace is flushed and a generic version of the code for a node is emitted. 726 // This is subsequently used for that node. The code emitted for non-generic 727 // trace is not recorded in the node and so it cannot currently be reused in 728 // the event that code generation is requested for an identical trace. 729 730 731 void RegExpTree::AppendToText(RegExpText* text) { 732 UNREACHABLE(); 733 } 734 735 736 void RegExpAtom::AppendToText(RegExpText* text) { 737 text->AddElement(TextElement::Atom(this)); 738 } 739 740 741 void RegExpCharacterClass::AppendToText(RegExpText* text) { 742 text->AddElement(TextElement::CharClass(this)); 743 } 744 745 746 void RegExpText::AppendToText(RegExpText* text) { 747 for (int i = 0; i < elements()->length(); i++) 748 text->AddElement(elements()->at(i)); 749 } 750 751 752 TextElement TextElement::Atom(RegExpAtom* atom) { 753 TextElement result = TextElement(ATOM); 754 result.data.u_atom = atom; 755 return result; 756 } 757 758 759 TextElement TextElement::CharClass( 760 RegExpCharacterClass* char_class) { 761 TextElement result = TextElement(CHAR_CLASS); 762 result.data.u_char_class = char_class; 763 return result; 764 } 765 766 767 int TextElement::length() { 768 if (type == ATOM) { 769 return data.u_atom->length(); 770 } else { 771 ASSERT(type == CHAR_CLASS); 772 return 1; 773 } 774 } 775 776 777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 778 if (table_ == NULL) { 779 table_ = new DispatchTable(); 780 DispatchTableConstructor cons(table_, ignore_case); 781 cons.BuildTable(this); 782 } 783 return table_; 784 } 785 786 787 class RegExpCompiler { 788 public: 789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); 790 791 int AllocateRegister() { 792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 793 reg_exp_too_big_ = true; 794 return next_register_; 795 } 796 return next_register_++; 797 } 798 799 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 800 RegExpNode* start, 801 int capture_count, 802 Handle<String> pattern); 803 804 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 805 806 static const int kImplementationOffset = 0; 807 static const int kNumberOfRegistersOffset = 0; 808 static const int kCodeOffset = 1; 809 810 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 811 EndNode* accept() { return accept_; } 812 813 static const int kMaxRecursion = 100; 814 inline int recursion_depth() { return recursion_depth_; } 815 inline void IncrementRecursionDepth() { recursion_depth_++; } 816 inline void DecrementRecursionDepth() { recursion_depth_--; } 817 818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 819 820 inline bool ignore_case() { return ignore_case_; } 821 inline bool ascii() { return ascii_; } 822 823 int current_expansion_factor() { return current_expansion_factor_; } 824 void set_current_expansion_factor(int value) { 825 current_expansion_factor_ = value; 826 } 827 828 static const int kNoRegister = -1; 829 830 private: 831 EndNode* accept_; 832 int next_register_; 833 List<RegExpNode*>* work_list_; 834 int recursion_depth_; 835 RegExpMacroAssembler* macro_assembler_; 836 bool ignore_case_; 837 bool ascii_; 838 bool reg_exp_too_big_; 839 int current_expansion_factor_; 840 }; 841 842 843 class RecursionCheck { 844 public: 845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 846 compiler->IncrementRecursionDepth(); 847 } 848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 849 private: 850 RegExpCompiler* compiler_; 851 }; 852 853 854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 855 return RegExpEngine::CompilationResult("RegExp too big"); 856 } 857 858 859 // Attempts to compile the regexp using an Irregexp code generator. Returns 860 // a fixed array or a null handle depending on whether it succeeded. 861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) 862 : next_register_(2 * (capture_count + 1)), 863 work_list_(NULL), 864 recursion_depth_(0), 865 ignore_case_(ignore_case), 866 ascii_(ascii), 867 reg_exp_too_big_(false), 868 current_expansion_factor_(1) { 869 accept_ = new EndNode(EndNode::ACCEPT); 870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 871 } 872 873 874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 875 RegExpMacroAssembler* macro_assembler, 876 RegExpNode* start, 877 int capture_count, 878 Handle<String> pattern) { 879 Heap* heap = pattern->GetHeap(); 880 881 bool use_slow_safe_regexp_compiler = false; 882 if (heap->total_regexp_code_generated() > 883 RegExpImpl::kRegWxpCompiledLimit && 884 heap->isolate()->memory_allocator()->SizeExecutable() > 885 RegExpImpl::kRegExpExecutableMemoryLimit) { 886 use_slow_safe_regexp_compiler = true; 887 } 888 889 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 890 891 #ifdef DEBUG 892 if (FLAG_trace_regexp_assembler) 893 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 894 else 895 #endif 896 macro_assembler_ = macro_assembler; 897 898 List <RegExpNode*> work_list(0); 899 work_list_ = &work_list; 900 Label fail; 901 macro_assembler_->PushBacktrack(&fail); 902 Trace new_trace; 903 start->Emit(this, &new_trace); 904 macro_assembler_->Bind(&fail); 905 macro_assembler_->Fail(); 906 while (!work_list.is_empty()) { 907 work_list.RemoveLast()->Emit(this, &new_trace); 908 } 909 if (reg_exp_too_big_) return IrregexpRegExpTooBig(); 910 911 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 912 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 913 work_list_ = NULL; 914 #ifdef DEBUG 915 if (FLAG_print_code) { 916 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString()); 917 } 918 if (FLAG_trace_regexp_assembler) { 919 delete macro_assembler_; 920 } 921 #endif 922 return RegExpEngine::CompilationResult(*code, next_register_); 923 } 924 925 926 bool Trace::DeferredAction::Mentions(int that) { 927 if (type() == ActionNode::CLEAR_CAPTURES) { 928 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 929 return range.Contains(that); 930 } else { 931 return reg() == that; 932 } 933 } 934 935 936 bool Trace::mentions_reg(int reg) { 937 for (DeferredAction* action = actions_; 938 action != NULL; 939 action = action->next()) { 940 if (action->Mentions(reg)) 941 return true; 942 } 943 return false; 944 } 945 946 947 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 948 ASSERT_EQ(0, *cp_offset); 949 for (DeferredAction* action = actions_; 950 action != NULL; 951 action = action->next()) { 952 if (action->Mentions(reg)) { 953 if (action->type() == ActionNode::STORE_POSITION) { 954 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 955 return true; 956 } else { 957 return false; 958 } 959 } 960 } 961 return false; 962 } 963 964 965 int Trace::FindAffectedRegisters(OutSet* affected_registers) { 966 int max_register = RegExpCompiler::kNoRegister; 967 for (DeferredAction* action = actions_; 968 action != NULL; 969 action = action->next()) { 970 if (action->type() == ActionNode::CLEAR_CAPTURES) { 971 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 972 for (int i = range.from(); i <= range.to(); i++) 973 affected_registers->Set(i); 974 if (range.to() > max_register) max_register = range.to(); 975 } else { 976 affected_registers->Set(action->reg()); 977 if (action->reg() > max_register) max_register = action->reg(); 978 } 979 } 980 return max_register; 981 } 982 983 984 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 985 int max_register, 986 OutSet& registers_to_pop, 987 OutSet& registers_to_clear) { 988 for (int reg = max_register; reg >= 0; reg--) { 989 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg); 990 else if (registers_to_clear.Get(reg)) { 991 int clear_to = reg; 992 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 993 reg--; 994 } 995 assembler->ClearRegisters(reg, clear_to); 996 } 997 } 998 } 999 1000 1001 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1002 int max_register, 1003 OutSet& affected_registers, 1004 OutSet* registers_to_pop, 1005 OutSet* registers_to_clear) { 1006 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1007 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1008 1009 // Count pushes performed to force a stack limit check occasionally. 1010 int pushes = 0; 1011 1012 for (int reg = 0; reg <= max_register; reg++) { 1013 if (!affected_registers.Get(reg)) { 1014 continue; 1015 } 1016 1017 // The chronologically first deferred action in the trace 1018 // is used to infer the action needed to restore a register 1019 // to its previous state (or not, if it's safe to ignore it). 1020 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1021 DeferredActionUndoType undo_action = IGNORE; 1022 1023 int value = 0; 1024 bool absolute = false; 1025 bool clear = false; 1026 int store_position = -1; 1027 // This is a little tricky because we are scanning the actions in reverse 1028 // historical order (newest first). 1029 for (DeferredAction* action = actions_; 1030 action != NULL; 1031 action = action->next()) { 1032 if (action->Mentions(reg)) { 1033 switch (action->type()) { 1034 case ActionNode::SET_REGISTER: { 1035 Trace::DeferredSetRegister* psr = 1036 static_cast<Trace::DeferredSetRegister*>(action); 1037 if (!absolute) { 1038 value += psr->value(); 1039 absolute = true; 1040 } 1041 // SET_REGISTER is currently only used for newly introduced loop 1042 // counters. They can have a significant previous value if they 1043 // occour in a loop. TODO(lrn): Propagate this information, so 1044 // we can set undo_action to IGNORE if we know there is no value to 1045 // restore. 1046 undo_action = RESTORE; 1047 ASSERT_EQ(store_position, -1); 1048 ASSERT(!clear); 1049 break; 1050 } 1051 case ActionNode::INCREMENT_REGISTER: 1052 if (!absolute) { 1053 value++; 1054 } 1055 ASSERT_EQ(store_position, -1); 1056 ASSERT(!clear); 1057 undo_action = RESTORE; 1058 break; 1059 case ActionNode::STORE_POSITION: { 1060 Trace::DeferredCapture* pc = 1061 static_cast<Trace::DeferredCapture*>(action); 1062 if (!clear && store_position == -1) { 1063 store_position = pc->cp_offset(); 1064 } 1065 1066 // For captures we know that stores and clears alternate. 1067 // Other register, are never cleared, and if the occur 1068 // inside a loop, they might be assigned more than once. 1069 if (reg <= 1) { 1070 // Registers zero and one, aka "capture zero", is 1071 // always set correctly if we succeed. There is no 1072 // need to undo a setting on backtrack, because we 1073 // will set it again or fail. 1074 undo_action = IGNORE; 1075 } else { 1076 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1077 } 1078 ASSERT(!absolute); 1079 ASSERT_EQ(value, 0); 1080 break; 1081 } 1082 case ActionNode::CLEAR_CAPTURES: { 1083 // Since we're scanning in reverse order, if we've already 1084 // set the position we have to ignore historically earlier 1085 // clearing operations. 1086 if (store_position == -1) { 1087 clear = true; 1088 } 1089 undo_action = RESTORE; 1090 ASSERT(!absolute); 1091 ASSERT_EQ(value, 0); 1092 break; 1093 } 1094 default: 1095 UNREACHABLE(); 1096 break; 1097 } 1098 } 1099 } 1100 // Prepare for the undo-action (e.g., push if it's going to be popped). 1101 if (undo_action == RESTORE) { 1102 pushes++; 1103 RegExpMacroAssembler::StackCheckFlag stack_check = 1104 RegExpMacroAssembler::kNoStackLimitCheck; 1105 if (pushes == push_limit) { 1106 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1107 pushes = 0; 1108 } 1109 1110 assembler->PushRegister(reg, stack_check); 1111 registers_to_pop->Set(reg); 1112 } else if (undo_action == CLEAR) { 1113 registers_to_clear->Set(reg); 1114 } 1115 // Perform the chronologically last action (or accumulated increment) 1116 // for the register. 1117 if (store_position != -1) { 1118 assembler->WriteCurrentPositionToRegister(reg, store_position); 1119 } else if (clear) { 1120 assembler->ClearRegisters(reg, reg); 1121 } else if (absolute) { 1122 assembler->SetRegister(reg, value); 1123 } else if (value != 0) { 1124 assembler->AdvanceRegister(reg, value); 1125 } 1126 } 1127 } 1128 1129 1130 // This is called as we come into a loop choice node and some other tricky 1131 // nodes. It normalizes the state of the code generator to ensure we can 1132 // generate generic code. 1133 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1134 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1135 1136 ASSERT(!is_trivial()); 1137 1138 if (actions_ == NULL && backtrack() == NULL) { 1139 // Here we just have some deferred cp advances to fix and we are back to 1140 // a normal situation. We may also have to forget some information gained 1141 // through a quick check that was already performed. 1142 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1143 // Create a new trivial state and generate the node with that. 1144 Trace new_state; 1145 successor->Emit(compiler, &new_state); 1146 return; 1147 } 1148 1149 // Generate deferred actions here along with code to undo them again. 1150 OutSet affected_registers; 1151 1152 if (backtrack() != NULL) { 1153 // Here we have a concrete backtrack location. These are set up by choice 1154 // nodes and so they indicate that we have a deferred save of the current 1155 // position which we may need to emit here. 1156 assembler->PushCurrentPosition(); 1157 } 1158 1159 int max_register = FindAffectedRegisters(&affected_registers); 1160 OutSet registers_to_pop; 1161 OutSet registers_to_clear; 1162 PerformDeferredActions(assembler, 1163 max_register, 1164 affected_registers, 1165 ®isters_to_pop, 1166 ®isters_to_clear); 1167 if (cp_offset_ != 0) { 1168 assembler->AdvanceCurrentPosition(cp_offset_); 1169 } 1170 1171 // Create a new trivial state and generate the node with that. 1172 Label undo; 1173 assembler->PushBacktrack(&undo); 1174 Trace new_state; 1175 successor->Emit(compiler, &new_state); 1176 1177 // On backtrack we need to restore state. 1178 assembler->Bind(&undo); 1179 RestoreAffectedRegisters(assembler, 1180 max_register, 1181 registers_to_pop, 1182 registers_to_clear); 1183 if (backtrack() == NULL) { 1184 assembler->Backtrack(); 1185 } else { 1186 assembler->PopCurrentPosition(); 1187 assembler->GoTo(backtrack()); 1188 } 1189 } 1190 1191 1192 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1193 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1194 1195 // Omit flushing the trace. We discard the entire stack frame anyway. 1196 1197 if (!label()->is_bound()) { 1198 // We are completely independent of the trace, since we ignore it, 1199 // so this code can be used as the generic version. 1200 assembler->Bind(label()); 1201 } 1202 1203 // Throw away everything on the backtrack stack since the start 1204 // of the negative submatch and restore the character position. 1205 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1206 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1207 if (clear_capture_count_ > 0) { 1208 // Clear any captures that might have been performed during the success 1209 // of the body of the negative look-ahead. 1210 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1211 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1212 } 1213 // Now that we have unwound the stack we find at the top of the stack the 1214 // backtrack that the BeginSubmatch node got. 1215 assembler->Backtrack(); 1216 } 1217 1218 1219 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1220 if (!trace->is_trivial()) { 1221 trace->Flush(compiler, this); 1222 return; 1223 } 1224 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1225 if (!label()->is_bound()) { 1226 assembler->Bind(label()); 1227 } 1228 switch (action_) { 1229 case ACCEPT: 1230 assembler->Succeed(); 1231 return; 1232 case BACKTRACK: 1233 assembler->GoTo(trace->backtrack()); 1234 return; 1235 case NEGATIVE_SUBMATCH_SUCCESS: 1236 // This case is handled in a different virtual method. 1237 UNREACHABLE(); 1238 } 1239 UNIMPLEMENTED(); 1240 } 1241 1242 1243 void GuardedAlternative::AddGuard(Guard* guard) { 1244 if (guards_ == NULL) 1245 guards_ = new ZoneList<Guard*>(1); 1246 guards_->Add(guard); 1247 } 1248 1249 1250 ActionNode* ActionNode::SetRegister(int reg, 1251 int val, 1252 RegExpNode* on_success) { 1253 ActionNode* result = new ActionNode(SET_REGISTER, on_success); 1254 result->data_.u_store_register.reg = reg; 1255 result->data_.u_store_register.value = val; 1256 return result; 1257 } 1258 1259 1260 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1261 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); 1262 result->data_.u_increment_register.reg = reg; 1263 return result; 1264 } 1265 1266 1267 ActionNode* ActionNode::StorePosition(int reg, 1268 bool is_capture, 1269 RegExpNode* on_success) { 1270 ActionNode* result = new ActionNode(STORE_POSITION, on_success); 1271 result->data_.u_position_register.reg = reg; 1272 result->data_.u_position_register.is_capture = is_capture; 1273 return result; 1274 } 1275 1276 1277 ActionNode* ActionNode::ClearCaptures(Interval range, 1278 RegExpNode* on_success) { 1279 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success); 1280 result->data_.u_clear_captures.range_from = range.from(); 1281 result->data_.u_clear_captures.range_to = range.to(); 1282 return result; 1283 } 1284 1285 1286 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1287 int position_reg, 1288 RegExpNode* on_success) { 1289 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); 1290 result->data_.u_submatch.stack_pointer_register = stack_reg; 1291 result->data_.u_submatch.current_position_register = position_reg; 1292 return result; 1293 } 1294 1295 1296 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1297 int position_reg, 1298 int clear_register_count, 1299 int clear_register_from, 1300 RegExpNode* on_success) { 1301 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1302 result->data_.u_submatch.stack_pointer_register = stack_reg; 1303 result->data_.u_submatch.current_position_register = position_reg; 1304 result->data_.u_submatch.clear_register_count = clear_register_count; 1305 result->data_.u_submatch.clear_register_from = clear_register_from; 1306 return result; 1307 } 1308 1309 1310 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1311 int repetition_register, 1312 int repetition_limit, 1313 RegExpNode* on_success) { 1314 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success); 1315 result->data_.u_empty_match_check.start_register = start_register; 1316 result->data_.u_empty_match_check.repetition_register = repetition_register; 1317 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1318 return result; 1319 } 1320 1321 1322 #define DEFINE_ACCEPT(Type) \ 1323 void Type##Node::Accept(NodeVisitor* visitor) { \ 1324 visitor->Visit##Type(this); \ 1325 } 1326 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1327 #undef DEFINE_ACCEPT 1328 1329 1330 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1331 visitor->VisitLoopChoice(this); 1332 } 1333 1334 1335 // ------------------------------------------------------------------- 1336 // Emit code. 1337 1338 1339 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1340 Guard* guard, 1341 Trace* trace) { 1342 switch (guard->op()) { 1343 case Guard::LT: 1344 ASSERT(!trace->mentions_reg(guard->reg())); 1345 macro_assembler->IfRegisterGE(guard->reg(), 1346 guard->value(), 1347 trace->backtrack()); 1348 break; 1349 case Guard::GEQ: 1350 ASSERT(!trace->mentions_reg(guard->reg())); 1351 macro_assembler->IfRegisterLT(guard->reg(), 1352 guard->value(), 1353 trace->backtrack()); 1354 break; 1355 } 1356 } 1357 1358 1359 // Returns the number of characters in the equivalence class, omitting those 1360 // that cannot occur in the source string because it is ASCII. 1361 static int GetCaseIndependentLetters(Isolate* isolate, 1362 uc16 character, 1363 bool ascii_subject, 1364 unibrow::uchar* letters) { 1365 int length = 1366 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1367 // Unibrow returns 0 or 1 for characters where case independence is 1368 // trivial. 1369 if (length == 0) { 1370 letters[0] = character; 1371 length = 1; 1372 } 1373 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { 1374 return length; 1375 } 1376 // The standard requires that non-ASCII characters cannot have ASCII 1377 // character codes in their equivalence class. 1378 return 0; 1379 } 1380 1381 1382 static inline bool EmitSimpleCharacter(Isolate* isolate, 1383 RegExpCompiler* compiler, 1384 uc16 c, 1385 Label* on_failure, 1386 int cp_offset, 1387 bool check, 1388 bool preloaded) { 1389 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1390 bool bound_checked = false; 1391 if (!preloaded) { 1392 assembler->LoadCurrentCharacter( 1393 cp_offset, 1394 on_failure, 1395 check); 1396 bound_checked = true; 1397 } 1398 assembler->CheckNotCharacter(c, on_failure); 1399 return bound_checked; 1400 } 1401 1402 1403 // Only emits non-letters (things that don't have case). Only used for case 1404 // independent matches. 1405 static inline bool EmitAtomNonLetter(Isolate* isolate, 1406 RegExpCompiler* compiler, 1407 uc16 c, 1408 Label* on_failure, 1409 int cp_offset, 1410 bool check, 1411 bool preloaded) { 1412 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1413 bool ascii = compiler->ascii(); 1414 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1415 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1416 if (length < 1) { 1417 // This can't match. Must be an ASCII subject and a non-ASCII character. 1418 // We do not need to do anything since the ASCII pass already handled this. 1419 return false; // Bounds not checked. 1420 } 1421 bool checked = false; 1422 // We handle the length > 1 case in a later pass. 1423 if (length == 1) { 1424 if (ascii && c > String::kMaxAsciiCharCodeU) { 1425 // Can't match - see above. 1426 return false; // Bounds not checked. 1427 } 1428 if (!preloaded) { 1429 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1430 checked = check; 1431 } 1432 macro_assembler->CheckNotCharacter(c, on_failure); 1433 } 1434 return checked; 1435 } 1436 1437 1438 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1439 bool ascii, 1440 uc16 c1, 1441 uc16 c2, 1442 Label* on_failure) { 1443 uc16 char_mask; 1444 if (ascii) { 1445 char_mask = String::kMaxAsciiCharCode; 1446 } else { 1447 char_mask = String::kMaxUtf16CodeUnit; 1448 } 1449 uc16 exor = c1 ^ c2; 1450 // Check whether exor has only one bit set. 1451 if (((exor - 1) & exor) == 0) { 1452 // If c1 and c2 differ only by one bit. 1453 // Ecma262UnCanonicalize always gives the highest number last. 1454 ASSERT(c2 > c1); 1455 uc16 mask = char_mask ^ exor; 1456 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1457 return true; 1458 } 1459 ASSERT(c2 > c1); 1460 uc16 diff = c2 - c1; 1461 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1462 // If the characters differ by 2^n but don't differ by one bit then 1463 // subtract the difference from the found character, then do the or 1464 // trick. We avoid the theoretical case where negative numbers are 1465 // involved in order to simplify code generation. 1466 uc16 mask = char_mask ^ diff; 1467 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1468 diff, 1469 mask, 1470 on_failure); 1471 return true; 1472 } 1473 return false; 1474 } 1475 1476 1477 typedef bool EmitCharacterFunction(Isolate* isolate, 1478 RegExpCompiler* compiler, 1479 uc16 c, 1480 Label* on_failure, 1481 int cp_offset, 1482 bool check, 1483 bool preloaded); 1484 1485 // Only emits letters (things that have case). Only used for case independent 1486 // matches. 1487 static inline bool EmitAtomLetter(Isolate* isolate, 1488 RegExpCompiler* compiler, 1489 uc16 c, 1490 Label* on_failure, 1491 int cp_offset, 1492 bool check, 1493 bool preloaded) { 1494 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1495 bool ascii = compiler->ascii(); 1496 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1497 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1498 if (length <= 1) return false; 1499 // We may not need to check against the end of the input string 1500 // if this character lies before a character that matched. 1501 if (!preloaded) { 1502 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1503 } 1504 Label ok; 1505 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1506 switch (length) { 1507 case 2: { 1508 if (ShortCutEmitCharacterPair(macro_assembler, 1509 ascii, 1510 chars[0], 1511 chars[1], 1512 on_failure)) { 1513 } else { 1514 macro_assembler->CheckCharacter(chars[0], &ok); 1515 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1516 macro_assembler->Bind(&ok); 1517 } 1518 break; 1519 } 1520 case 4: 1521 macro_assembler->CheckCharacter(chars[3], &ok); 1522 // Fall through! 1523 case 3: 1524 macro_assembler->CheckCharacter(chars[0], &ok); 1525 macro_assembler->CheckCharacter(chars[1], &ok); 1526 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1527 macro_assembler->Bind(&ok); 1528 break; 1529 default: 1530 UNREACHABLE(); 1531 break; 1532 } 1533 return true; 1534 } 1535 1536 1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1538 RegExpCharacterClass* cc, 1539 bool ascii, 1540 Label* on_failure, 1541 int cp_offset, 1542 bool check_offset, 1543 bool preloaded) { 1544 ZoneList<CharacterRange>* ranges = cc->ranges(); 1545 int max_char; 1546 if (ascii) { 1547 max_char = String::kMaxAsciiCharCode; 1548 } else { 1549 max_char = String::kMaxUtf16CodeUnit; 1550 } 1551 1552 Label success; 1553 1554 Label* char_is_in_class = 1555 cc->is_negated() ? on_failure : &success; 1556 1557 int range_count = ranges->length(); 1558 1559 int last_valid_range = range_count - 1; 1560 while (last_valid_range >= 0) { 1561 CharacterRange& range = ranges->at(last_valid_range); 1562 if (range.from() <= max_char) { 1563 break; 1564 } 1565 last_valid_range--; 1566 } 1567 1568 if (last_valid_range < 0) { 1569 if (!cc->is_negated()) { 1570 // TODO(plesner): We can remove this when the node level does our 1571 // ASCII optimizations for us. 1572 macro_assembler->GoTo(on_failure); 1573 } 1574 if (check_offset) { 1575 macro_assembler->CheckPosition(cp_offset, on_failure); 1576 } 1577 return; 1578 } 1579 1580 if (last_valid_range == 0 && 1581 !cc->is_negated() && 1582 ranges->at(0).IsEverything(max_char)) { 1583 // This is a common case hit by non-anchored expressions. 1584 if (check_offset) { 1585 macro_assembler->CheckPosition(cp_offset, on_failure); 1586 } 1587 return; 1588 } 1589 1590 if (!preloaded) { 1591 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1592 } 1593 1594 if (cc->is_standard() && 1595 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1596 on_failure)) { 1597 return; 1598 } 1599 1600 for (int i = 0; i < last_valid_range; i++) { 1601 CharacterRange& range = ranges->at(i); 1602 Label next_range; 1603 uc16 from = range.from(); 1604 uc16 to = range.to(); 1605 if (from > max_char) { 1606 continue; 1607 } 1608 if (to > max_char) to = max_char; 1609 if (to == from) { 1610 macro_assembler->CheckCharacter(to, char_is_in_class); 1611 } else { 1612 if (from != 0) { 1613 macro_assembler->CheckCharacterLT(from, &next_range); 1614 } 1615 if (to != max_char) { 1616 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); 1617 } else { 1618 macro_assembler->GoTo(char_is_in_class); 1619 } 1620 } 1621 macro_assembler->Bind(&next_range); 1622 } 1623 1624 CharacterRange& range = ranges->at(last_valid_range); 1625 uc16 from = range.from(); 1626 uc16 to = range.to(); 1627 1628 if (to > max_char) to = max_char; 1629 ASSERT(to >= from); 1630 1631 if (to == from) { 1632 if (cc->is_negated()) { 1633 macro_assembler->CheckCharacter(to, on_failure); 1634 } else { 1635 macro_assembler->CheckNotCharacter(to, on_failure); 1636 } 1637 } else { 1638 if (from != 0) { 1639 if (cc->is_negated()) { 1640 macro_assembler->CheckCharacterLT(from, &success); 1641 } else { 1642 macro_assembler->CheckCharacterLT(from, on_failure); 1643 } 1644 } 1645 if (to != String::kMaxUtf16CodeUnit) { 1646 if (cc->is_negated()) { 1647 macro_assembler->CheckCharacterLT(to + 1, on_failure); 1648 } else { 1649 macro_assembler->CheckCharacterGT(to, on_failure); 1650 } 1651 } else { 1652 if (cc->is_negated()) { 1653 macro_assembler->GoTo(on_failure); 1654 } 1655 } 1656 } 1657 macro_assembler->Bind(&success); 1658 } 1659 1660 1661 RegExpNode::~RegExpNode() { 1662 } 1663 1664 1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1666 Trace* trace) { 1667 // If we are generating a greedy loop then don't stop and don't reuse code. 1668 if (trace->stop_node() != NULL) { 1669 return CONTINUE; 1670 } 1671 1672 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1673 if (trace->is_trivial()) { 1674 if (label_.is_bound()) { 1675 // We are being asked to generate a generic version, but that's already 1676 // been done so just go to it. 1677 macro_assembler->GoTo(&label_); 1678 return DONE; 1679 } 1680 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 1681 // To avoid too deep recursion we push the node to the work queue and just 1682 // generate a goto here. 1683 compiler->AddWork(this); 1684 macro_assembler->GoTo(&label_); 1685 return DONE; 1686 } 1687 // Generate generic version of the node and bind the label for later use. 1688 macro_assembler->Bind(&label_); 1689 return CONTINUE; 1690 } 1691 1692 // We are being asked to make a non-generic version. Keep track of how many 1693 // non-generic versions we generate so as not to overdo it. 1694 trace_count_++; 1695 if (FLAG_regexp_optimization && 1696 trace_count_ < kMaxCopiesCodeGenerated && 1697 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 1698 return CONTINUE; 1699 } 1700 1701 // If we get here code has been generated for this node too many times or 1702 // recursion is too deep. Time to switch to a generic version. The code for 1703 // generic versions above can handle deep recursion properly. 1704 trace->Flush(compiler, this); 1705 return DONE; 1706 } 1707 1708 1709 int ActionNode::EatsAtLeast(int still_to_find, 1710 int recursion_depth, 1711 bool not_at_start) { 1712 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1713 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 1714 return on_success()->EatsAtLeast(still_to_find, 1715 recursion_depth + 1, 1716 not_at_start); 1717 } 1718 1719 1720 int AssertionNode::EatsAtLeast(int still_to_find, 1721 int recursion_depth, 1722 bool not_at_start) { 1723 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1724 // If we know we are not at the start and we are asked "how many characters 1725 // will you match if you succeed?" then we can answer anything since false 1726 // implies false. So lets just return the max answer (still_to_find) since 1727 // that won't prevent us from preloading a lot of characters for the other 1728 // branches in the node graph. 1729 if (type() == AT_START && not_at_start) return still_to_find; 1730 return on_success()->EatsAtLeast(still_to_find, 1731 recursion_depth + 1, 1732 not_at_start); 1733 } 1734 1735 1736 int BackReferenceNode::EatsAtLeast(int still_to_find, 1737 int recursion_depth, 1738 bool not_at_start) { 1739 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1740 return on_success()->EatsAtLeast(still_to_find, 1741 recursion_depth + 1, 1742 not_at_start); 1743 } 1744 1745 1746 int TextNode::EatsAtLeast(int still_to_find, 1747 int recursion_depth, 1748 bool not_at_start) { 1749 int answer = Length(); 1750 if (answer >= still_to_find) return answer; 1751 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer; 1752 // We are not at start after this node so we set the last argument to 'true'. 1753 return answer + on_success()->EatsAtLeast(still_to_find - answer, 1754 recursion_depth + 1, 1755 true); 1756 } 1757 1758 1759 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 1760 int recursion_depth, 1761 bool not_at_start) { 1762 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1763 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1764 // afterwards. 1765 RegExpNode* node = alternatives_->at(1).node(); 1766 return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start); 1767 } 1768 1769 1770 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 1771 QuickCheckDetails* details, 1772 RegExpCompiler* compiler, 1773 int filled_in, 1774 bool not_at_start) { 1775 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1776 // afterwards. 1777 RegExpNode* node = alternatives_->at(1).node(); 1778 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 1779 } 1780 1781 1782 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 1783 int recursion_depth, 1784 RegExpNode* ignore_this_node, 1785 bool not_at_start) { 1786 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1787 int min = 100; 1788 int choice_count = alternatives_->length(); 1789 for (int i = 0; i < choice_count; i++) { 1790 RegExpNode* node = alternatives_->at(i).node(); 1791 if (node == ignore_this_node) continue; 1792 int node_eats_at_least = node->EatsAtLeast(still_to_find, 1793 recursion_depth + 1, 1794 not_at_start); 1795 if (node_eats_at_least < min) min = node_eats_at_least; 1796 } 1797 return min; 1798 } 1799 1800 1801 int LoopChoiceNode::EatsAtLeast(int still_to_find, 1802 int recursion_depth, 1803 bool not_at_start) { 1804 return EatsAtLeastHelper(still_to_find, 1805 recursion_depth, 1806 loop_node_, 1807 not_at_start); 1808 } 1809 1810 1811 int ChoiceNode::EatsAtLeast(int still_to_find, 1812 int recursion_depth, 1813 bool not_at_start) { 1814 return EatsAtLeastHelper(still_to_find, 1815 recursion_depth, 1816 NULL, 1817 not_at_start); 1818 } 1819 1820 1821 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 1822 static inline uint32_t SmearBitsRight(uint32_t v) { 1823 v |= v >> 1; 1824 v |= v >> 2; 1825 v |= v >> 4; 1826 v |= v >> 8; 1827 v |= v >> 16; 1828 return v; 1829 } 1830 1831 1832 bool QuickCheckDetails::Rationalize(bool asc) { 1833 bool found_useful_op = false; 1834 uint32_t char_mask; 1835 if (asc) { 1836 char_mask = String::kMaxAsciiCharCode; 1837 } else { 1838 char_mask = String::kMaxUtf16CodeUnit; 1839 } 1840 mask_ = 0; 1841 value_ = 0; 1842 int char_shift = 0; 1843 for (int i = 0; i < characters_; i++) { 1844 Position* pos = &positions_[i]; 1845 if ((pos->mask & String::kMaxAsciiCharCode) != 0) { 1846 found_useful_op = true; 1847 } 1848 mask_ |= (pos->mask & char_mask) << char_shift; 1849 value_ |= (pos->value & char_mask) << char_shift; 1850 char_shift += asc ? 8 : 16; 1851 } 1852 return found_useful_op; 1853 } 1854 1855 1856 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 1857 Trace* trace, 1858 bool preload_has_checked_bounds, 1859 Label* on_possible_success, 1860 QuickCheckDetails* details, 1861 bool fall_through_on_failure) { 1862 if (details->characters() == 0) return false; 1863 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE); 1864 if (details->cannot_match()) return false; 1865 if (!details->Rationalize(compiler->ascii())) return false; 1866 ASSERT(details->characters() == 1 || 1867 compiler->macro_assembler()->CanReadUnaligned()); 1868 uint32_t mask = details->mask(); 1869 uint32_t value = details->value(); 1870 1871 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1872 1873 if (trace->characters_preloaded() != details->characters()) { 1874 assembler->LoadCurrentCharacter(trace->cp_offset(), 1875 trace->backtrack(), 1876 !preload_has_checked_bounds, 1877 details->characters()); 1878 } 1879 1880 1881 bool need_mask = true; 1882 1883 if (details->characters() == 1) { 1884 // If number of characters preloaded is 1 then we used a byte or 16 bit 1885 // load so the value is already masked down. 1886 uint32_t char_mask; 1887 if (compiler->ascii()) { 1888 char_mask = String::kMaxAsciiCharCode; 1889 } else { 1890 char_mask = String::kMaxUtf16CodeUnit; 1891 } 1892 if ((mask & char_mask) == char_mask) need_mask = false; 1893 mask &= char_mask; 1894 } else { 1895 // For 2-character preloads in ASCII mode or 1-character preloads in 1896 // TWO_BYTE mode we also use a 16 bit load with zero extend. 1897 if (details->characters() == 2 && compiler->ascii()) { 1898 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; 1899 } else if (details->characters() == 1 && !compiler->ascii()) { 1900 if ((mask & 0xffff) == 0xffff) need_mask = false; 1901 } else { 1902 if (mask == 0xffffffff) need_mask = false; 1903 } 1904 } 1905 1906 if (fall_through_on_failure) { 1907 if (need_mask) { 1908 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 1909 } else { 1910 assembler->CheckCharacter(value, on_possible_success); 1911 } 1912 } else { 1913 if (need_mask) { 1914 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 1915 } else { 1916 assembler->CheckNotCharacter(value, trace->backtrack()); 1917 } 1918 } 1919 return true; 1920 } 1921 1922 1923 // Here is the meat of GetQuickCheckDetails (see also the comment on the 1924 // super-class in the .h file). 1925 // 1926 // We iterate along the text object, building up for each character a 1927 // mask and value that can be used to test for a quick failure to match. 1928 // The masks and values for the positions will be combined into a single 1929 // machine word for the current character width in order to be used in 1930 // generating a quick check. 1931 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 1932 RegExpCompiler* compiler, 1933 int characters_filled_in, 1934 bool not_at_start) { 1935 Isolate* isolate = Isolate::Current(); 1936 ASSERT(characters_filled_in < details->characters()); 1937 int characters = details->characters(); 1938 int char_mask; 1939 if (compiler->ascii()) { 1940 char_mask = String::kMaxAsciiCharCode; 1941 } else { 1942 char_mask = String::kMaxUtf16CodeUnit; 1943 } 1944 for (int k = 0; k < elms_->length(); k++) { 1945 TextElement elm = elms_->at(k); 1946 if (elm.type == TextElement::ATOM) { 1947 Vector<const uc16> quarks = elm.data.u_atom->data(); 1948 for (int i = 0; i < characters && i < quarks.length(); i++) { 1949 QuickCheckDetails::Position* pos = 1950 details->positions(characters_filled_in); 1951 uc16 c = quarks[i]; 1952 if (c > char_mask) { 1953 // If we expect a non-ASCII character from an ASCII string, 1954 // there is no way we can match. Not even case independent 1955 // matching can turn an ASCII character into non-ASCII or 1956 // vice versa. 1957 details->set_cannot_match(); 1958 pos->determines_perfectly = false; 1959 return; 1960 } 1961 if (compiler->ignore_case()) { 1962 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1963 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 1964 chars); 1965 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 1966 if (length == 1) { 1967 // This letter has no case equivalents, so it's nice and simple 1968 // and the mask-compare will determine definitely whether we have 1969 // a match at this character position. 1970 pos->mask = char_mask; 1971 pos->value = c; 1972 pos->determines_perfectly = true; 1973 } else { 1974 uint32_t common_bits = char_mask; 1975 uint32_t bits = chars[0]; 1976 for (int j = 1; j < length; j++) { 1977 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 1978 common_bits ^= differing_bits; 1979 bits &= common_bits; 1980 } 1981 // If length is 2 and common bits has only one zero in it then 1982 // our mask and compare instruction will determine definitely 1983 // whether we have a match at this character position. Otherwise 1984 // it can only be an approximate check. 1985 uint32_t one_zero = (common_bits | ~char_mask); 1986 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 1987 pos->determines_perfectly = true; 1988 } 1989 pos->mask = common_bits; 1990 pos->value = bits; 1991 } 1992 } else { 1993 // Don't ignore case. Nice simple case where the mask-compare will 1994 // determine definitely whether we have a match at this character 1995 // position. 1996 pos->mask = char_mask; 1997 pos->value = c; 1998 pos->determines_perfectly = true; 1999 } 2000 characters_filled_in++; 2001 ASSERT(characters_filled_in <= details->characters()); 2002 if (characters_filled_in == details->characters()) { 2003 return; 2004 } 2005 } 2006 } else { 2007 QuickCheckDetails::Position* pos = 2008 details->positions(characters_filled_in); 2009 RegExpCharacterClass* tree = elm.data.u_char_class; 2010 ZoneList<CharacterRange>* ranges = tree->ranges(); 2011 if (tree->is_negated()) { 2012 // A quick check uses multi-character mask and compare. There is no 2013 // useful way to incorporate a negative char class into this scheme 2014 // so we just conservatively create a mask and value that will always 2015 // succeed. 2016 pos->mask = 0; 2017 pos->value = 0; 2018 } else { 2019 int first_range = 0; 2020 while (ranges->at(first_range).from() > char_mask) { 2021 first_range++; 2022 if (first_range == ranges->length()) { 2023 details->set_cannot_match(); 2024 pos->determines_perfectly = false; 2025 return; 2026 } 2027 } 2028 CharacterRange range = ranges->at(first_range); 2029 uc16 from = range.from(); 2030 uc16 to = range.to(); 2031 if (to > char_mask) { 2032 to = char_mask; 2033 } 2034 uint32_t differing_bits = (from ^ to); 2035 // A mask and compare is only perfect if the differing bits form a 2036 // number like 00011111 with one single block of trailing 1s. 2037 if ((differing_bits & (differing_bits + 1)) == 0 && 2038 from + differing_bits == to) { 2039 pos->determines_perfectly = true; 2040 } 2041 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2042 uint32_t bits = (from & common_bits); 2043 for (int i = first_range + 1; i < ranges->length(); i++) { 2044 CharacterRange range = ranges->at(i); 2045 uc16 from = range.from(); 2046 uc16 to = range.to(); 2047 if (from > char_mask) continue; 2048 if (to > char_mask) to = char_mask; 2049 // Here we are combining more ranges into the mask and compare 2050 // value. With each new range the mask becomes more sparse and 2051 // so the chances of a false positive rise. A character class 2052 // with multiple ranges is assumed never to be equivalent to a 2053 // mask and compare operation. 2054 pos->determines_perfectly = false; 2055 uint32_t new_common_bits = (from ^ to); 2056 new_common_bits = ~SmearBitsRight(new_common_bits); 2057 common_bits &= new_common_bits; 2058 bits &= new_common_bits; 2059 uint32_t differing_bits = (from & common_bits) ^ bits; 2060 common_bits ^= differing_bits; 2061 bits &= common_bits; 2062 } 2063 pos->mask = common_bits; 2064 pos->value = bits; 2065 } 2066 characters_filled_in++; 2067 ASSERT(characters_filled_in <= details->characters()); 2068 if (characters_filled_in == details->characters()) { 2069 return; 2070 } 2071 } 2072 } 2073 ASSERT(characters_filled_in != details->characters()); 2074 on_success()-> GetQuickCheckDetails(details, 2075 compiler, 2076 characters_filled_in, 2077 true); 2078 } 2079 2080 2081 void QuickCheckDetails::Clear() { 2082 for (int i = 0; i < characters_; i++) { 2083 positions_[i].mask = 0; 2084 positions_[i].value = 0; 2085 positions_[i].determines_perfectly = false; 2086 } 2087 characters_ = 0; 2088 } 2089 2090 2091 void QuickCheckDetails::Advance(int by, bool ascii) { 2092 ASSERT(by >= 0); 2093 if (by >= characters_) { 2094 Clear(); 2095 return; 2096 } 2097 for (int i = 0; i < characters_ - by; i++) { 2098 positions_[i] = positions_[by + i]; 2099 } 2100 for (int i = characters_ - by; i < characters_; i++) { 2101 positions_[i].mask = 0; 2102 positions_[i].value = 0; 2103 positions_[i].determines_perfectly = false; 2104 } 2105 characters_ -= by; 2106 // We could change mask_ and value_ here but we would never advance unless 2107 // they had already been used in a check and they won't be used again because 2108 // it would gain us nothing. So there's no point. 2109 } 2110 2111 2112 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2113 ASSERT(characters_ == other->characters_); 2114 if (other->cannot_match_) { 2115 return; 2116 } 2117 if (cannot_match_) { 2118 *this = *other; 2119 return; 2120 } 2121 for (int i = from_index; i < characters_; i++) { 2122 QuickCheckDetails::Position* pos = positions(i); 2123 QuickCheckDetails::Position* other_pos = other->positions(i); 2124 if (pos->mask != other_pos->mask || 2125 pos->value != other_pos->value || 2126 !other_pos->determines_perfectly) { 2127 // Our mask-compare operation will be approximate unless we have the 2128 // exact same operation on both sides of the alternation. 2129 pos->determines_perfectly = false; 2130 } 2131 pos->mask &= other_pos->mask; 2132 pos->value &= pos->mask; 2133 other_pos->value &= pos->mask; 2134 uc16 differing_bits = (pos->value ^ other_pos->value); 2135 pos->mask &= ~differing_bits; 2136 pos->value &= pos->mask; 2137 } 2138 } 2139 2140 2141 class VisitMarker { 2142 public: 2143 explicit VisitMarker(NodeInfo* info) : info_(info) { 2144 ASSERT(!info->visited); 2145 info->visited = true; 2146 } 2147 ~VisitMarker() { 2148 info_->visited = false; 2149 } 2150 private: 2151 NodeInfo* info_; 2152 }; 2153 2154 2155 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2156 RegExpCompiler* compiler, 2157 int characters_filled_in, 2158 bool not_at_start) { 2159 if (body_can_be_zero_length_ || info()->visited) return; 2160 VisitMarker marker(info()); 2161 return ChoiceNode::GetQuickCheckDetails(details, 2162 compiler, 2163 characters_filled_in, 2164 not_at_start); 2165 } 2166 2167 2168 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2169 RegExpCompiler* compiler, 2170 int characters_filled_in, 2171 bool not_at_start) { 2172 not_at_start = (not_at_start || not_at_start_); 2173 int choice_count = alternatives_->length(); 2174 ASSERT(choice_count > 0); 2175 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2176 compiler, 2177 characters_filled_in, 2178 not_at_start); 2179 for (int i = 1; i < choice_count; i++) { 2180 QuickCheckDetails new_details(details->characters()); 2181 RegExpNode* node = alternatives_->at(i).node(); 2182 node->GetQuickCheckDetails(&new_details, compiler, 2183 characters_filled_in, 2184 not_at_start); 2185 // Here we merge the quick match details of the two branches. 2186 details->Merge(&new_details, characters_filled_in); 2187 } 2188 } 2189 2190 2191 // Check for [0-9A-Z_a-z]. 2192 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2193 Label* word, 2194 Label* non_word, 2195 bool fall_through_on_word) { 2196 if (assembler->CheckSpecialCharacterClass( 2197 fall_through_on_word ? 'w' : 'W', 2198 fall_through_on_word ? non_word : word)) { 2199 // Optimized implementation available. 2200 return; 2201 } 2202 assembler->CheckCharacterGT('z', non_word); 2203 assembler->CheckCharacterLT('0', non_word); 2204 assembler->CheckCharacterGT('a' - 1, word); 2205 assembler->CheckCharacterLT('9' + 1, word); 2206 assembler->CheckCharacterLT('A', non_word); 2207 assembler->CheckCharacterLT('Z' + 1, word); 2208 if (fall_through_on_word) { 2209 assembler->CheckNotCharacter('_', non_word); 2210 } else { 2211 assembler->CheckCharacter('_', word); 2212 } 2213 } 2214 2215 2216 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2217 // that matches newline or the start of input). 2218 static void EmitHat(RegExpCompiler* compiler, 2219 RegExpNode* on_success, 2220 Trace* trace) { 2221 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2222 // We will be loading the previous character into the current character 2223 // register. 2224 Trace new_trace(*trace); 2225 new_trace.InvalidateCurrentCharacter(); 2226 2227 Label ok; 2228 if (new_trace.cp_offset() == 0) { 2229 // The start of input counts as a newline in this context, so skip to 2230 // ok if we are at the start. 2231 assembler->CheckAtStart(&ok); 2232 } 2233 // We already checked that we are not at the start of input so it must be 2234 // OK to load the previous character. 2235 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 2236 new_trace.backtrack(), 2237 false); 2238 if (!assembler->CheckSpecialCharacterClass('n', 2239 new_trace.backtrack())) { 2240 // Newline means \n, \r, 0x2028 or 0x2029. 2241 if (!compiler->ascii()) { 2242 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2243 } 2244 assembler->CheckCharacter('\n', &ok); 2245 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2246 } 2247 assembler->Bind(&ok); 2248 on_success->Emit(compiler, &new_trace); 2249 } 2250 2251 2252 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) 2253 // when we know whether the next character must be a word character or not. 2254 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, 2255 RegExpCompiler* compiler, 2256 RegExpNode* on_success, 2257 Trace* trace) { 2258 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2259 Label done; 2260 2261 Trace new_trace(*trace); 2262 2263 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); 2264 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); 2265 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; 2266 2267 // Check whether previous character was a word character. 2268 switch (trace->at_start()) { 2269 case Trace::TRUE: 2270 if (expect_word_character) { 2271 assembler->GoTo(on_non_word); 2272 } 2273 break; 2274 case Trace::UNKNOWN: 2275 ASSERT_EQ(0, trace->cp_offset()); 2276 assembler->CheckAtStart(on_non_word); 2277 // Fall through. 2278 case Trace::FALSE: 2279 int prev_char_offset = trace->cp_offset() - 1; 2280 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); 2281 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); 2282 // We may or may not have loaded the previous character. 2283 new_trace.InvalidateCurrentCharacter(); 2284 } 2285 2286 assembler->Bind(&done); 2287 2288 on_success->Emit(compiler, &new_trace); 2289 } 2290 2291 2292 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2293 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2294 RegExpCompiler* compiler, 2295 RegExpNode* on_success, 2296 Trace* trace) { 2297 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2298 Label before_non_word; 2299 Label before_word; 2300 if (trace->characters_preloaded() != 1) { 2301 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2302 } 2303 // Fall through on non-word. 2304 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2305 2306 // We will be loading the previous character into the current character 2307 // register. 2308 Trace new_trace(*trace); 2309 new_trace.InvalidateCurrentCharacter(); 2310 2311 Label ok; 2312 Label* boundary; 2313 Label* not_boundary; 2314 if (type == AssertionNode::AT_BOUNDARY) { 2315 boundary = &ok; 2316 not_boundary = new_trace.backtrack(); 2317 } else { 2318 not_boundary = &ok; 2319 boundary = new_trace.backtrack(); 2320 } 2321 2322 // Next character is not a word character. 2323 assembler->Bind(&before_non_word); 2324 if (new_trace.cp_offset() == 0) { 2325 // The start of input counts as a non-word character, so the question is 2326 // decided if we are at the start. 2327 assembler->CheckAtStart(not_boundary); 2328 } 2329 // We already checked that we are not at the start of input so it must be 2330 // OK to load the previous character. 2331 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2332 &ok, // Unused dummy label in this call. 2333 false); 2334 // Fall through on non-word. 2335 EmitWordCheck(assembler, boundary, not_boundary, false); 2336 assembler->GoTo(not_boundary); 2337 2338 // Next character is a word character. 2339 assembler->Bind(&before_word); 2340 if (new_trace.cp_offset() == 0) { 2341 // The start of input counts as a non-word character, so the question is 2342 // decided if we are at the start. 2343 assembler->CheckAtStart(boundary); 2344 } 2345 // We already checked that we are not at the start of input so it must be 2346 // OK to load the previous character. 2347 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2348 &ok, // Unused dummy label in this call. 2349 false); 2350 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); 2351 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); 2352 2353 assembler->Bind(&ok); 2354 2355 on_success->Emit(compiler, &new_trace); 2356 } 2357 2358 2359 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2360 RegExpCompiler* compiler, 2361 int filled_in, 2362 bool not_at_start) { 2363 if (type_ == AT_START && not_at_start) { 2364 details->set_cannot_match(); 2365 return; 2366 } 2367 return on_success()->GetQuickCheckDetails(details, 2368 compiler, 2369 filled_in, 2370 not_at_start); 2371 } 2372 2373 2374 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2375 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2376 switch (type_) { 2377 case AT_END: { 2378 Label ok; 2379 assembler->CheckPosition(trace->cp_offset(), &ok); 2380 assembler->GoTo(trace->backtrack()); 2381 assembler->Bind(&ok); 2382 break; 2383 } 2384 case AT_START: { 2385 if (trace->at_start() == Trace::FALSE) { 2386 assembler->GoTo(trace->backtrack()); 2387 return; 2388 } 2389 if (trace->at_start() == Trace::UNKNOWN) { 2390 assembler->CheckNotAtStart(trace->backtrack()); 2391 Trace at_start_trace = *trace; 2392 at_start_trace.set_at_start(true); 2393 on_success()->Emit(compiler, &at_start_trace); 2394 return; 2395 } 2396 } 2397 break; 2398 case AFTER_NEWLINE: 2399 EmitHat(compiler, on_success(), trace); 2400 return; 2401 case AT_BOUNDARY: 2402 case AT_NON_BOUNDARY: { 2403 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2404 return; 2405 } 2406 case AFTER_WORD_CHARACTER: 2407 case AFTER_NONWORD_CHARACTER: { 2408 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); 2409 } 2410 } 2411 on_success()->Emit(compiler, trace); 2412 } 2413 2414 2415 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2416 if (quick_check == NULL) return false; 2417 if (offset >= quick_check->characters()) return false; 2418 return quick_check->positions(offset)->determines_perfectly; 2419 } 2420 2421 2422 static void UpdateBoundsCheck(int index, int* checked_up_to) { 2423 if (index > *checked_up_to) { 2424 *checked_up_to = index; 2425 } 2426 } 2427 2428 2429 // We call this repeatedly to generate code for each pass over the text node. 2430 // The passes are in increasing order of difficulty because we hope one 2431 // of the first passes will fail in which case we are saved the work of the 2432 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2433 // we will check the '%' in the first pass, the case independent 'a' in the 2434 // second pass and the character class in the last pass. 2435 // 2436 // The passes are done from right to left, so for example to test for /bar/ 2437 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2438 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2439 // bounds check most of the time. In the example we only need to check for 2440 // end-of-input when loading the putative 'r'. 2441 // 2442 // A slight complication involves the fact that the first character may already 2443 // be fetched into a register by the previous node. In this case we want to 2444 // do the test for that character first. We do this in separate passes. The 2445 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 2446 // pass has been performed then subsequent passes will have true in 2447 // first_element_checked to indicate that that character does not need to be 2448 // checked again. 2449 // 2450 // In addition to all this we are passed a Trace, which can 2451 // contain an AlternativeGeneration object. In this AlternativeGeneration 2452 // object we can see details of any quick check that was already passed in 2453 // order to get to the code we are now generating. The quick check can involve 2454 // loading characters, which means we do not need to recheck the bounds 2455 // up to the limit the quick check already checked. In addition the quick 2456 // check can have involved a mask and compare operation which may simplify 2457 // or obviate the need for further checks at some character positions. 2458 void TextNode::TextEmitPass(RegExpCompiler* compiler, 2459 TextEmitPassType pass, 2460 bool preloaded, 2461 Trace* trace, 2462 bool first_element_checked, 2463 int* checked_up_to) { 2464 Isolate* isolate = Isolate::Current(); 2465 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2466 bool ascii = compiler->ascii(); 2467 Label* backtrack = trace->backtrack(); 2468 QuickCheckDetails* quick_check = trace->quick_check_performed(); 2469 int element_count = elms_->length(); 2470 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2471 TextElement elm = elms_->at(i); 2472 int cp_offset = trace->cp_offset() + elm.cp_offset; 2473 if (elm.type == TextElement::ATOM) { 2474 Vector<const uc16> quarks = elm.data.u_atom->data(); 2475 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2476 if (first_element_checked && i == 0 && j == 0) continue; 2477 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; 2478 EmitCharacterFunction* emit_function = NULL; 2479 switch (pass) { 2480 case NON_ASCII_MATCH: 2481 ASSERT(ascii); 2482 if (quarks[j] > String::kMaxAsciiCharCode) { 2483 assembler->GoTo(backtrack); 2484 return; 2485 } 2486 break; 2487 case NON_LETTER_CHARACTER_MATCH: 2488 emit_function = &EmitAtomNonLetter; 2489 break; 2490 case SIMPLE_CHARACTER_MATCH: 2491 emit_function = &EmitSimpleCharacter; 2492 break; 2493 case CASE_CHARACTER_MATCH: 2494 emit_function = &EmitAtomLetter; 2495 break; 2496 default: 2497 break; 2498 } 2499 if (emit_function != NULL) { 2500 bool bound_checked = emit_function(isolate, 2501 compiler, 2502 quarks[j], 2503 backtrack, 2504 cp_offset + j, 2505 *checked_up_to < cp_offset + j, 2506 preloaded); 2507 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 2508 } 2509 } 2510 } else { 2511 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); 2512 if (pass == CHARACTER_CLASS_MATCH) { 2513 if (first_element_checked && i == 0) continue; 2514 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; 2515 RegExpCharacterClass* cc = elm.data.u_char_class; 2516 EmitCharClass(assembler, 2517 cc, 2518 ascii, 2519 backtrack, 2520 cp_offset, 2521 *checked_up_to < cp_offset, 2522 preloaded); 2523 UpdateBoundsCheck(cp_offset, checked_up_to); 2524 } 2525 } 2526 } 2527 } 2528 2529 2530 int TextNode::Length() { 2531 TextElement elm = elms_->last(); 2532 ASSERT(elm.cp_offset >= 0); 2533 if (elm.type == TextElement::ATOM) { 2534 return elm.cp_offset + elm.data.u_atom->data().length(); 2535 } else { 2536 return elm.cp_offset + 1; 2537 } 2538 } 2539 2540 2541 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 2542 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 2543 if (ignore_case) { 2544 return pass == SIMPLE_CHARACTER_MATCH; 2545 } else { 2546 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 2547 } 2548 } 2549 2550 2551 // This generates the code to match a text node. A text node can contain 2552 // straight character sequences (possibly to be matched in a case-independent 2553 // way) and character classes. For efficiency we do not do this in a single 2554 // pass from left to right. Instead we pass over the text node several times, 2555 // emitting code for some character positions every time. See the comment on 2556 // TextEmitPass for details. 2557 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2558 LimitResult limit_result = LimitVersions(compiler, trace); 2559 if (limit_result == DONE) return; 2560 ASSERT(limit_result == CONTINUE); 2561 2562 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2563 compiler->SetRegExpTooBig(); 2564 return; 2565 } 2566 2567 if (compiler->ascii()) { 2568 int dummy = 0; 2569 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 2570 } 2571 2572 bool first_elt_done = false; 2573 int bound_checked_to = trace->cp_offset() - 1; 2574 bound_checked_to += trace->bound_checked_up_to(); 2575 2576 // If a character is preloaded into the current character register then 2577 // check that now. 2578 if (trace->characters_preloaded() == 1) { 2579 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2580 if (!SkipPass(pass, compiler->ignore_case())) { 2581 TextEmitPass(compiler, 2582 static_cast<TextEmitPassType>(pass), 2583 true, 2584 trace, 2585 false, 2586 &bound_checked_to); 2587 } 2588 } 2589 first_elt_done = true; 2590 } 2591 2592 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2593 if (!SkipPass(pass, compiler->ignore_case())) { 2594 TextEmitPass(compiler, 2595 static_cast<TextEmitPassType>(pass), 2596 false, 2597 trace, 2598 first_elt_done, 2599 &bound_checked_to); 2600 } 2601 } 2602 2603 Trace successor_trace(*trace); 2604 successor_trace.set_at_start(false); 2605 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 2606 RecursionCheck rc(compiler); 2607 on_success()->Emit(compiler, &successor_trace); 2608 } 2609 2610 2611 void Trace::InvalidateCurrentCharacter() { 2612 characters_preloaded_ = 0; 2613 } 2614 2615 2616 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 2617 ASSERT(by > 0); 2618 // We don't have an instruction for shifting the current character register 2619 // down or for using a shifted value for anything so lets just forget that 2620 // we preloaded any characters into it. 2621 characters_preloaded_ = 0; 2622 // Adjust the offsets of the quick check performed information. This 2623 // information is used to find out what we already determined about the 2624 // characters by means of mask and compare. 2625 quick_check_performed_.Advance(by, compiler->ascii()); 2626 cp_offset_ += by; 2627 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2628 compiler->SetRegExpTooBig(); 2629 cp_offset_ = 0; 2630 } 2631 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 2632 } 2633 2634 2635 void TextNode::MakeCaseIndependent(bool is_ascii) { 2636 int element_count = elms_->length(); 2637 for (int i = 0; i < element_count; i++) { 2638 TextElement elm = elms_->at(i); 2639 if (elm.type == TextElement::CHAR_CLASS) { 2640 RegExpCharacterClass* cc = elm.data.u_char_class; 2641 // None of the standard character classes is different in the case 2642 // independent case and it slows us down if we don't know that. 2643 if (cc->is_standard()) continue; 2644 ZoneList<CharacterRange>* ranges = cc->ranges(); 2645 int range_count = ranges->length(); 2646 for (int j = 0; j < range_count; j++) { 2647 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); 2648 } 2649 } 2650 } 2651 } 2652 2653 2654 int TextNode::GreedyLoopTextLength() { 2655 TextElement elm = elms_->at(elms_->length() - 1); 2656 if (elm.type == TextElement::CHAR_CLASS) { 2657 return elm.cp_offset + 1; 2658 } else { 2659 return elm.cp_offset + elm.data.u_atom->data().length(); 2660 } 2661 } 2662 2663 2664 // Finds the fixed match length of a sequence of nodes that goes from 2665 // this alternative and back to this choice node. If there are variable 2666 // length nodes or other complications in the way then return a sentinel 2667 // value indicating that a greedy loop cannot be constructed. 2668 int ChoiceNode::GreedyLoopTextLengthForAlternative( 2669 GuardedAlternative* alternative) { 2670 int length = 0; 2671 RegExpNode* node = alternative->node(); 2672 // Later we will generate code for all these text nodes using recursion 2673 // so we have to limit the max number. 2674 int recursion_depth = 0; 2675 while (node != this) { 2676 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 2677 return kNodeIsTooComplexForGreedyLoops; 2678 } 2679 int node_length = node->GreedyLoopTextLength(); 2680 if (node_length == kNodeIsTooComplexForGreedyLoops) { 2681 return kNodeIsTooComplexForGreedyLoops; 2682 } 2683 length += node_length; 2684 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 2685 node = seq_node->on_success(); 2686 } 2687 return length; 2688 } 2689 2690 2691 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 2692 ASSERT_EQ(loop_node_, NULL); 2693 AddAlternative(alt); 2694 loop_node_ = alt.node(); 2695 } 2696 2697 2698 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 2699 ASSERT_EQ(continue_node_, NULL); 2700 AddAlternative(alt); 2701 continue_node_ = alt.node(); 2702 } 2703 2704 2705 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2706 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2707 if (trace->stop_node() == this) { 2708 int text_length = 2709 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 2710 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 2711 // Update the counter-based backtracking info on the stack. This is an 2712 // optimization for greedy loops (see below). 2713 ASSERT(trace->cp_offset() == text_length); 2714 macro_assembler->AdvanceCurrentPosition(text_length); 2715 macro_assembler->GoTo(trace->loop_label()); 2716 return; 2717 } 2718 ASSERT(trace->stop_node() == NULL); 2719 if (!trace->is_trivial()) { 2720 trace->Flush(compiler, this); 2721 return; 2722 } 2723 ChoiceNode::Emit(compiler, trace); 2724 } 2725 2726 2727 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 2728 bool not_at_start) { 2729 int preload_characters = EatsAtLeast(4, 0, not_at_start); 2730 if (compiler->macro_assembler()->CanReadUnaligned()) { 2731 bool ascii = compiler->ascii(); 2732 if (ascii) { 2733 if (preload_characters > 4) preload_characters = 4; 2734 // We can't preload 3 characters because there is no machine instruction 2735 // to do that. We can't just load 4 because we could be reading 2736 // beyond the end of the string, which could cause a memory fault. 2737 if (preload_characters == 3) preload_characters = 2; 2738 } else { 2739 if (preload_characters > 2) preload_characters = 2; 2740 } 2741 } else { 2742 if (preload_characters > 1) preload_characters = 1; 2743 } 2744 return preload_characters; 2745 } 2746 2747 2748 // This class is used when generating the alternatives in a choice node. It 2749 // records the way the alternative is being code generated. 2750 class AlternativeGeneration: public Malloced { 2751 public: 2752 AlternativeGeneration() 2753 : possible_success(), 2754 expects_preload(false), 2755 after(), 2756 quick_check_details() { } 2757 Label possible_success; 2758 bool expects_preload; 2759 Label after; 2760 QuickCheckDetails quick_check_details; 2761 }; 2762 2763 2764 // Creates a list of AlternativeGenerations. If the list has a reasonable 2765 // size then it is on the stack, otherwise the excess is on the heap. 2766 class AlternativeGenerationList { 2767 public: 2768 explicit AlternativeGenerationList(int count) 2769 : alt_gens_(count) { 2770 for (int i = 0; i < count && i < kAFew; i++) { 2771 alt_gens_.Add(a_few_alt_gens_ + i); 2772 } 2773 for (int i = kAFew; i < count; i++) { 2774 alt_gens_.Add(new AlternativeGeneration()); 2775 } 2776 } 2777 ~AlternativeGenerationList() { 2778 for (int i = kAFew; i < alt_gens_.length(); i++) { 2779 delete alt_gens_[i]; 2780 alt_gens_[i] = NULL; 2781 } 2782 } 2783 2784 AlternativeGeneration* at(int i) { 2785 return alt_gens_[i]; 2786 } 2787 2788 private: 2789 static const int kAFew = 10; 2790 ZoneList<AlternativeGeneration*> alt_gens_; 2791 AlternativeGeneration a_few_alt_gens_[kAFew]; 2792 }; 2793 2794 2795 /* Code generation for choice nodes. 2796 * 2797 * We generate quick checks that do a mask and compare to eliminate a 2798 * choice. If the quick check succeeds then it jumps to the continuation to 2799 * do slow checks and check subsequent nodes. If it fails (the common case) 2800 * it falls through to the next choice. 2801 * 2802 * Here is the desired flow graph. Nodes directly below each other imply 2803 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 2804 * 3 doesn't have a quick check so we have to call the slow check. 2805 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 2806 * regexp continuation is generated directly after the Sn node, up to the 2807 * next GoTo if we decide to reuse some already generated code. Some 2808 * nodes expect preload_characters to be preloaded into the current 2809 * character register. R nodes do this preloading. Vertices are marked 2810 * F for failures and S for success (possible success in the case of quick 2811 * nodes). L, V, < and > are used as arrow heads. 2812 * 2813 * ----------> R 2814 * | 2815 * V 2816 * Q1 -----> S1 2817 * | S / 2818 * F| / 2819 * | F/ 2820 * | / 2821 * | R 2822 * | / 2823 * V L 2824 * Q2 -----> S2 2825 * | S / 2826 * F| / 2827 * | F/ 2828 * | / 2829 * | R 2830 * | / 2831 * V L 2832 * S3 2833 * | 2834 * F| 2835 * | 2836 * R 2837 * | 2838 * backtrack V 2839 * <----------Q4 2840 * \ F | 2841 * \ |S 2842 * \ F V 2843 * \-----S4 2844 * 2845 * For greedy loops we reverse our expectation and expect to match rather 2846 * than fail. Therefore we want the loop code to look like this (U is the 2847 * unwind code that steps back in the greedy loop). The following alternatives 2848 * look the same as above. 2849 * _____ 2850 * / \ 2851 * V | 2852 * ----------> S1 | 2853 * /| | 2854 * / |S | 2855 * F/ \_____/ 2856 * / 2857 * |<----------- 2858 * | \ 2859 * V \ 2860 * Q2 ---> S2 \ 2861 * | S / | 2862 * F| / | 2863 * | F/ | 2864 * | / | 2865 * | R | 2866 * | / | 2867 * F VL | 2868 * <------U | 2869 * back |S | 2870 * \______________/ 2871 */ 2872 2873 2874 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2875 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2876 int choice_count = alternatives_->length(); 2877 #ifdef DEBUG 2878 for (int i = 0; i < choice_count - 1; i++) { 2879 GuardedAlternative alternative = alternatives_->at(i); 2880 ZoneList<Guard*>* guards = alternative.guards(); 2881 int guard_count = (guards == NULL) ? 0 : guards->length(); 2882 for (int j = 0; j < guard_count; j++) { 2883 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 2884 } 2885 } 2886 #endif 2887 2888 LimitResult limit_result = LimitVersions(compiler, trace); 2889 if (limit_result == DONE) return; 2890 ASSERT(limit_result == CONTINUE); 2891 2892 int new_flush_budget = trace->flush_budget() / choice_count; 2893 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 2894 trace->Flush(compiler, this); 2895 return; 2896 } 2897 2898 RecursionCheck rc(compiler); 2899 2900 Trace* current_trace = trace; 2901 2902 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 2903 bool greedy_loop = false; 2904 Label greedy_loop_label; 2905 Trace counter_backtrack_trace; 2906 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 2907 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 2908 2909 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 2910 // Here we have special handling for greedy loops containing only text nodes 2911 // and other simple nodes. These are handled by pushing the current 2912 // position on the stack and then incrementing the current position each 2913 // time around the switch. On backtrack we decrement the current position 2914 // and check it against the pushed value. This avoids pushing backtrack 2915 // information for each iteration of the loop, which could take up a lot of 2916 // space. 2917 greedy_loop = true; 2918 ASSERT(trace->stop_node() == NULL); 2919 macro_assembler->PushCurrentPosition(); 2920 current_trace = &counter_backtrack_trace; 2921 Label greedy_match_failed; 2922 Trace greedy_match_trace; 2923 if (not_at_start()) greedy_match_trace.set_at_start(false); 2924 greedy_match_trace.set_backtrack(&greedy_match_failed); 2925 Label loop_label; 2926 macro_assembler->Bind(&loop_label); 2927 greedy_match_trace.set_stop_node(this); 2928 greedy_match_trace.set_loop_label(&loop_label); 2929 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 2930 macro_assembler->Bind(&greedy_match_failed); 2931 } 2932 2933 Label second_choice; // For use in greedy matches. 2934 macro_assembler->Bind(&second_choice); 2935 2936 int first_normal_choice = greedy_loop ? 1 : 0; 2937 2938 int preload_characters = 2939 CalculatePreloadCharacters(compiler, 2940 current_trace->at_start() == Trace::FALSE); 2941 bool preload_is_current = 2942 (current_trace->characters_preloaded() == preload_characters); 2943 bool preload_has_checked_bounds = preload_is_current; 2944 2945 AlternativeGenerationList alt_gens(choice_count); 2946 2947 // For now we just call all choices one after the other. The idea ultimately 2948 // is to use the Dispatch table to try only the relevant ones. 2949 for (int i = first_normal_choice; i < choice_count; i++) { 2950 GuardedAlternative alternative = alternatives_->at(i); 2951 AlternativeGeneration* alt_gen = alt_gens.at(i); 2952 alt_gen->quick_check_details.set_characters(preload_characters); 2953 ZoneList<Guard*>* guards = alternative.guards(); 2954 int guard_count = (guards == NULL) ? 0 : guards->length(); 2955 Trace new_trace(*current_trace); 2956 new_trace.set_characters_preloaded(preload_is_current ? 2957 preload_characters : 2958 0); 2959 if (preload_has_checked_bounds) { 2960 new_trace.set_bound_checked_up_to(preload_characters); 2961 } 2962 new_trace.quick_check_performed()->Clear(); 2963 if (not_at_start_) new_trace.set_at_start(Trace::FALSE); 2964 alt_gen->expects_preload = preload_is_current; 2965 bool generate_full_check_inline = false; 2966 if (FLAG_regexp_optimization && 2967 try_to_emit_quick_check_for_alternative(i) && 2968 alternative.node()->EmitQuickCheck(compiler, 2969 &new_trace, 2970 preload_has_checked_bounds, 2971 &alt_gen->possible_success, 2972 &alt_gen->quick_check_details, 2973 i < choice_count - 1)) { 2974 // Quick check was generated for this choice. 2975 preload_is_current = true; 2976 preload_has_checked_bounds = true; 2977 // On the last choice in the ChoiceNode we generated the quick 2978 // check to fall through on possible success. So now we need to 2979 // generate the full check inline. 2980 if (i == choice_count - 1) { 2981 macro_assembler->Bind(&alt_gen->possible_success); 2982 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 2983 new_trace.set_characters_preloaded(preload_characters); 2984 new_trace.set_bound_checked_up_to(preload_characters); 2985 generate_full_check_inline = true; 2986 } 2987 } else if (alt_gen->quick_check_details.cannot_match()) { 2988 if (i == choice_count - 1 && !greedy_loop) { 2989 macro_assembler->GoTo(trace->backtrack()); 2990 } 2991 continue; 2992 } else { 2993 // No quick check was generated. Put the full code here. 2994 // If this is not the first choice then there could be slow checks from 2995 // previous cases that go here when they fail. There's no reason to 2996 // insist that they preload characters since the slow check we are about 2997 // to generate probably can't use it. 2998 if (i != first_normal_choice) { 2999 alt_gen->expects_preload = false; 3000 new_trace.InvalidateCurrentCharacter(); 3001 } 3002 if (i < choice_count - 1) { 3003 new_trace.set_backtrack(&alt_gen->after); 3004 } 3005 generate_full_check_inline = true; 3006 } 3007 if (generate_full_check_inline) { 3008 if (new_trace.actions() != NULL) { 3009 new_trace.set_flush_budget(new_flush_budget); 3010 } 3011 for (int j = 0; j < guard_count; j++) { 3012 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 3013 } 3014 alternative.node()->Emit(compiler, &new_trace); 3015 preload_is_current = false; 3016 } 3017 macro_assembler->Bind(&alt_gen->after); 3018 } 3019 if (greedy_loop) { 3020 macro_assembler->Bind(&greedy_loop_label); 3021 // If we have unwound to the bottom then backtrack. 3022 macro_assembler->CheckGreedyLoop(trace->backtrack()); 3023 // Otherwise try the second priority at an earlier position. 3024 macro_assembler->AdvanceCurrentPosition(-text_length); 3025 macro_assembler->GoTo(&second_choice); 3026 } 3027 3028 // At this point we need to generate slow checks for the alternatives where 3029 // the quick check was inlined. We can recognize these because the associated 3030 // label was bound. 3031 for (int i = first_normal_choice; i < choice_count - 1; i++) { 3032 AlternativeGeneration* alt_gen = alt_gens.at(i); 3033 Trace new_trace(*current_trace); 3034 // If there are actions to be flushed we have to limit how many times 3035 // they are flushed. Take the budget of the parent trace and distribute 3036 // it fairly amongst the children. 3037 if (new_trace.actions() != NULL) { 3038 new_trace.set_flush_budget(new_flush_budget); 3039 } 3040 EmitOutOfLineContinuation(compiler, 3041 &new_trace, 3042 alternatives_->at(i), 3043 alt_gen, 3044 preload_characters, 3045 alt_gens.at(i + 1)->expects_preload); 3046 } 3047 } 3048 3049 3050 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 3051 Trace* trace, 3052 GuardedAlternative alternative, 3053 AlternativeGeneration* alt_gen, 3054 int preload_characters, 3055 bool next_expects_preload) { 3056 if (!alt_gen->possible_success.is_linked()) return; 3057 3058 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3059 macro_assembler->Bind(&alt_gen->possible_success); 3060 Trace out_of_line_trace(*trace); 3061 out_of_line_trace.set_characters_preloaded(preload_characters); 3062 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3063 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE); 3064 ZoneList<Guard*>* guards = alternative.guards(); 3065 int guard_count = (guards == NULL) ? 0 : guards->length(); 3066 if (next_expects_preload) { 3067 Label reload_current_char; 3068 out_of_line_trace.set_backtrack(&reload_current_char); 3069 for (int j = 0; j < guard_count; j++) { 3070 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3071 } 3072 alternative.node()->Emit(compiler, &out_of_line_trace); 3073 macro_assembler->Bind(&reload_current_char); 3074 // Reload the current character, since the next quick check expects that. 3075 // We don't need to check bounds here because we only get into this 3076 // code through a quick check which already did the checked load. 3077 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 3078 NULL, 3079 false, 3080 preload_characters); 3081 macro_assembler->GoTo(&(alt_gen->after)); 3082 } else { 3083 out_of_line_trace.set_backtrack(&(alt_gen->after)); 3084 for (int j = 0; j < guard_count; j++) { 3085 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3086 } 3087 alternative.node()->Emit(compiler, &out_of_line_trace); 3088 } 3089 } 3090 3091 3092 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3093 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3094 LimitResult limit_result = LimitVersions(compiler, trace); 3095 if (limit_result == DONE) return; 3096 ASSERT(limit_result == CONTINUE); 3097 3098 RecursionCheck rc(compiler); 3099 3100 switch (type_) { 3101 case STORE_POSITION: { 3102 Trace::DeferredCapture 3103 new_capture(data_.u_position_register.reg, 3104 data_.u_position_register.is_capture, 3105 trace); 3106 Trace new_trace = *trace; 3107 new_trace.add_action(&new_capture); 3108 on_success()->Emit(compiler, &new_trace); 3109 break; 3110 } 3111 case INCREMENT_REGISTER: { 3112 Trace::DeferredIncrementRegister 3113 new_increment(data_.u_increment_register.reg); 3114 Trace new_trace = *trace; 3115 new_trace.add_action(&new_increment); 3116 on_success()->Emit(compiler, &new_trace); 3117 break; 3118 } 3119 case SET_REGISTER: { 3120 Trace::DeferredSetRegister 3121 new_set(data_.u_store_register.reg, data_.u_store_register.value); 3122 Trace new_trace = *trace; 3123 new_trace.add_action(&new_set); 3124 on_success()->Emit(compiler, &new_trace); 3125 break; 3126 } 3127 case CLEAR_CAPTURES: { 3128 Trace::DeferredClearCaptures 3129 new_capture(Interval(data_.u_clear_captures.range_from, 3130 data_.u_clear_captures.range_to)); 3131 Trace new_trace = *trace; 3132 new_trace.add_action(&new_capture); 3133 on_success()->Emit(compiler, &new_trace); 3134 break; 3135 } 3136 case BEGIN_SUBMATCH: 3137 if (!trace->is_trivial()) { 3138 trace->Flush(compiler, this); 3139 } else { 3140 assembler->WriteCurrentPositionToRegister( 3141 data_.u_submatch.current_position_register, 0); 3142 assembler->WriteStackPointerToRegister( 3143 data_.u_submatch.stack_pointer_register); 3144 on_success()->Emit(compiler, trace); 3145 } 3146 break; 3147 case EMPTY_MATCH_CHECK: { 3148 int start_pos_reg = data_.u_empty_match_check.start_register; 3149 int stored_pos = 0; 3150 int rep_reg = data_.u_empty_match_check.repetition_register; 3151 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 3152 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 3153 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 3154 // If we know we haven't advanced and there is no minimum we 3155 // can just backtrack immediately. 3156 assembler->GoTo(trace->backtrack()); 3157 } else if (know_dist && stored_pos < trace->cp_offset()) { 3158 // If we know we've advanced we can generate the continuation 3159 // immediately. 3160 on_success()->Emit(compiler, trace); 3161 } else if (!trace->is_trivial()) { 3162 trace->Flush(compiler, this); 3163 } else { 3164 Label skip_empty_check; 3165 // If we have a minimum number of repetitions we check the current 3166 // number first and skip the empty check if it's not enough. 3167 if (has_minimum) { 3168 int limit = data_.u_empty_match_check.repetition_limit; 3169 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 3170 } 3171 // If the match is empty we bail out, otherwise we fall through 3172 // to the on-success continuation. 3173 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 3174 trace->backtrack()); 3175 assembler->Bind(&skip_empty_check); 3176 on_success()->Emit(compiler, trace); 3177 } 3178 break; 3179 } 3180 case POSITIVE_SUBMATCH_SUCCESS: { 3181 if (!trace->is_trivial()) { 3182 trace->Flush(compiler, this); 3183 return; 3184 } 3185 assembler->ReadCurrentPositionFromRegister( 3186 data_.u_submatch.current_position_register); 3187 assembler->ReadStackPointerFromRegister( 3188 data_.u_submatch.stack_pointer_register); 3189 int clear_register_count = data_.u_submatch.clear_register_count; 3190 if (clear_register_count == 0) { 3191 on_success()->Emit(compiler, trace); 3192 return; 3193 } 3194 int clear_registers_from = data_.u_submatch.clear_register_from; 3195 Label clear_registers_backtrack; 3196 Trace new_trace = *trace; 3197 new_trace.set_backtrack(&clear_registers_backtrack); 3198 on_success()->Emit(compiler, &new_trace); 3199 3200 assembler->Bind(&clear_registers_backtrack); 3201 int clear_registers_to = clear_registers_from + clear_register_count - 1; 3202 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 3203 3204 ASSERT(trace->backtrack() == NULL); 3205 assembler->Backtrack(); 3206 return; 3207 } 3208 default: 3209 UNREACHABLE(); 3210 } 3211 } 3212 3213 3214 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3215 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3216 if (!trace->is_trivial()) { 3217 trace->Flush(compiler, this); 3218 return; 3219 } 3220 3221 LimitResult limit_result = LimitVersions(compiler, trace); 3222 if (limit_result == DONE) return; 3223 ASSERT(limit_result == CONTINUE); 3224 3225 RecursionCheck rc(compiler); 3226 3227 ASSERT_EQ(start_reg_ + 1, end_reg_); 3228 if (compiler->ignore_case()) { 3229 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 3230 trace->backtrack()); 3231 } else { 3232 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 3233 } 3234 on_success()->Emit(compiler, trace); 3235 } 3236 3237 3238 // ------------------------------------------------------------------- 3239 // Dot/dotty output 3240 3241 3242 #ifdef DEBUG 3243 3244 3245 class DotPrinter: public NodeVisitor { 3246 public: 3247 explicit DotPrinter(bool ignore_case) 3248 : ignore_case_(ignore_case), 3249 stream_(&alloc_) { } 3250 void PrintNode(const char* label, RegExpNode* node); 3251 void Visit(RegExpNode* node); 3252 void PrintAttributes(RegExpNode* from); 3253 StringStream* stream() { return &stream_; } 3254 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 3255 #define DECLARE_VISIT(Type) \ 3256 virtual void Visit##Type(Type##Node* that); 3257 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 3258 #undef DECLARE_VISIT 3259 private: 3260 bool ignore_case_; 3261 HeapStringAllocator alloc_; 3262 StringStream stream_; 3263 }; 3264 3265 3266 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 3267 stream()->Add("digraph G {\n graph [label=\""); 3268 for (int i = 0; label[i]; i++) { 3269 switch (label[i]) { 3270 case '\\': 3271 stream()->Add("\\\\"); 3272 break; 3273 case '"': 3274 stream()->Add("\""); 3275 break; 3276 default: 3277 stream()->Put(label[i]); 3278 break; 3279 } 3280 } 3281 stream()->Add("\"];\n"); 3282 Visit(node); 3283 stream()->Add("}\n"); 3284 printf("%s", *(stream()->ToCString())); 3285 } 3286 3287 3288 void DotPrinter::Visit(RegExpNode* node) { 3289 if (node->info()->visited) return; 3290 node->info()->visited = true; 3291 node->Accept(this); 3292 } 3293 3294 3295 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 3296 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 3297 Visit(on_failure); 3298 } 3299 3300 3301 class TableEntryBodyPrinter { 3302 public: 3303 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 3304 : stream_(stream), choice_(choice) { } 3305 void Call(uc16 from, DispatchTable::Entry entry) { 3306 OutSet* out_set = entry.out_set(); 3307 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3308 if (out_set->Get(i)) { 3309 stream()->Add(" n%p:s%io%i -> n%p;\n", 3310 choice(), 3311 from, 3312 i, 3313 choice()->alternatives()->at(i).node()); 3314 } 3315 } 3316 } 3317 private: 3318 StringStream* stream() { return stream_; } 3319 ChoiceNode* choice() { return choice_; } 3320 StringStream* stream_; 3321 ChoiceNode* choice_; 3322 }; 3323 3324 3325 class TableEntryHeaderPrinter { 3326 public: 3327 explicit TableEntryHeaderPrinter(StringStream* stream) 3328 : first_(true), stream_(stream) { } 3329 void Call(uc16 from, DispatchTable::Entry entry) { 3330 if (first_) { 3331 first_ = false; 3332 } else { 3333 stream()->Add("|"); 3334 } 3335 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 3336 OutSet* out_set = entry.out_set(); 3337 int priority = 0; 3338 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3339 if (out_set->Get(i)) { 3340 if (priority > 0) stream()->Add("|"); 3341 stream()->Add("<s%io%i> %i", from, i, priority); 3342 priority++; 3343 } 3344 } 3345 stream()->Add("}}"); 3346 } 3347 3348 private: 3349 bool first_; 3350 StringStream* stream() { return stream_; } 3351 StringStream* stream_; 3352 }; 3353 3354 3355 class AttributePrinter { 3356 public: 3357 explicit AttributePrinter(DotPrinter* out) 3358 : out_(out), first_(true) { } 3359 void PrintSeparator() { 3360 if (first_) { 3361 first_ = false; 3362 } else { 3363 out_->stream()->Add("|"); 3364 } 3365 } 3366 void PrintBit(const char* name, bool value) { 3367 if (!value) return; 3368 PrintSeparator(); 3369 out_->stream()->Add("{%s}", name); 3370 } 3371 void PrintPositive(const char* name, int value) { 3372 if (value < 0) return; 3373 PrintSeparator(); 3374 out_->stream()->Add("{%s|%x}", name, value); 3375 } 3376 private: 3377 DotPrinter* out_; 3378 bool first_; 3379 }; 3380 3381 3382 void DotPrinter::PrintAttributes(RegExpNode* that) { 3383 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 3384 "margin=0.1, fontsize=10, label=\"{", 3385 that); 3386 AttributePrinter printer(this); 3387 NodeInfo* info = that->info(); 3388 printer.PrintBit("NI", info->follows_newline_interest); 3389 printer.PrintBit("WI", info->follows_word_interest); 3390 printer.PrintBit("SI", info->follows_start_interest); 3391 Label* label = that->label(); 3392 if (label->is_bound()) 3393 printer.PrintPositive("@", label->pos()); 3394 stream()->Add("}\"];\n"); 3395 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 3396 "arrowhead=none];\n", that, that); 3397 } 3398 3399 3400 static const bool kPrintDispatchTable = false; 3401 void DotPrinter::VisitChoice(ChoiceNode* that) { 3402 if (kPrintDispatchTable) { 3403 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 3404 TableEntryHeaderPrinter header_printer(stream()); 3405 that->GetTable(ignore_case_)->ForEach(&header_printer); 3406 stream()->Add("\"]\n", that); 3407 PrintAttributes(that); 3408 TableEntryBodyPrinter body_printer(stream(), that); 3409 that->GetTable(ignore_case_)->ForEach(&body_printer); 3410 } else { 3411 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 3412 for (int i = 0; i < that->alternatives()->length(); i++) { 3413 GuardedAlternative alt = that->alternatives()->at(i); 3414 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 3415 } 3416 } 3417 for (int i = 0; i < that->alternatives()->length(); i++) { 3418 GuardedAlternative alt = that->alternatives()->at(i); 3419 alt.node()->Accept(this); 3420 } 3421 } 3422 3423 3424 void DotPrinter::VisitText(TextNode* that) { 3425 stream()->Add(" n%p [label=\"", that); 3426 for (int i = 0; i < that->elements()->length(); i++) { 3427 if (i > 0) stream()->Add(" "); 3428 TextElement elm = that->elements()->at(i); 3429 switch (elm.type) { 3430 case TextElement::ATOM: { 3431 stream()->Add("'%w'", elm.data.u_atom->data()); 3432 break; 3433 } 3434 case TextElement::CHAR_CLASS: { 3435 RegExpCharacterClass* node = elm.data.u_char_class; 3436 stream()->Add("["); 3437 if (node->is_negated()) 3438 stream()->Add("^"); 3439 for (int j = 0; j < node->ranges()->length(); j++) { 3440 CharacterRange range = node->ranges()->at(j); 3441 stream()->Add("%k-%k", range.from(), range.to()); 3442 } 3443 stream()->Add("]"); 3444 break; 3445 } 3446 default: 3447 UNREACHABLE(); 3448 } 3449 } 3450 stream()->Add("\", shape=box, peripheries=2];\n"); 3451 PrintAttributes(that); 3452 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3453 Visit(that->on_success()); 3454 } 3455 3456 3457 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 3458 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 3459 that, 3460 that->start_register(), 3461 that->end_register()); 3462 PrintAttributes(that); 3463 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3464 Visit(that->on_success()); 3465 } 3466 3467 3468 void DotPrinter::VisitEnd(EndNode* that) { 3469 stream()->Add(" n%p [style=bold, shape=point];\n", that); 3470 PrintAttributes(that); 3471 } 3472 3473 3474 void DotPrinter::VisitAssertion(AssertionNode* that) { 3475 stream()->Add(" n%p [", that); 3476 switch (that->type()) { 3477 case AssertionNode::AT_END: 3478 stream()->Add("label=\"$\", shape=septagon"); 3479 break; 3480 case AssertionNode::AT_START: 3481 stream()->Add("label=\"^\", shape=septagon"); 3482 break; 3483 case AssertionNode::AT_BOUNDARY: 3484 stream()->Add("label=\"\\b\", shape=septagon"); 3485 break; 3486 case AssertionNode::AT_NON_BOUNDARY: 3487 stream()->Add("label=\"\\B\", shape=septagon"); 3488 break; 3489 case AssertionNode::AFTER_NEWLINE: 3490 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 3491 break; 3492 case AssertionNode::AFTER_WORD_CHARACTER: 3493 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); 3494 break; 3495 case AssertionNode::AFTER_NONWORD_CHARACTER: 3496 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); 3497 break; 3498 } 3499 stream()->Add("];\n"); 3500 PrintAttributes(that); 3501 RegExpNode* successor = that->on_success(); 3502 stream()->Add(" n%p -> n%p;\n", that, successor); 3503 Visit(successor); 3504 } 3505 3506 3507 void DotPrinter::VisitAction(ActionNode* that) { 3508 stream()->Add(" n%p [", that); 3509 switch (that->type_) { 3510 case ActionNode::SET_REGISTER: 3511 stream()->Add("label=\"$%i:=%i\", shape=octagon", 3512 that->data_.u_store_register.reg, 3513 that->data_.u_store_register.value); 3514 break; 3515 case ActionNode::INCREMENT_REGISTER: 3516 stream()->Add("label=\"$%i++\", shape=octagon", 3517 that->data_.u_increment_register.reg); 3518 break; 3519 case ActionNode::STORE_POSITION: 3520 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 3521 that->data_.u_position_register.reg); 3522 break; 3523 case ActionNode::BEGIN_SUBMATCH: 3524 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 3525 that->data_.u_submatch.current_position_register); 3526 break; 3527 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 3528 stream()->Add("label=\"escape\", shape=septagon"); 3529 break; 3530 case ActionNode::EMPTY_MATCH_CHECK: 3531 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 3532 that->data_.u_empty_match_check.start_register, 3533 that->data_.u_empty_match_check.repetition_register, 3534 that->data_.u_empty_match_check.repetition_limit); 3535 break; 3536 case ActionNode::CLEAR_CAPTURES: { 3537 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 3538 that->data_.u_clear_captures.range_from, 3539 that->data_.u_clear_captures.range_to); 3540 break; 3541 } 3542 } 3543 stream()->Add("];\n"); 3544 PrintAttributes(that); 3545 RegExpNode* successor = that->on_success(); 3546 stream()->Add(" n%p -> n%p;\n", that, successor); 3547 Visit(successor); 3548 } 3549 3550 3551 class DispatchTableDumper { 3552 public: 3553 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 3554 void Call(uc16 key, DispatchTable::Entry entry); 3555 StringStream* stream() { return stream_; } 3556 private: 3557 StringStream* stream_; 3558 }; 3559 3560 3561 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 3562 stream()->Add("[%k-%k]: {", key, entry.to()); 3563 OutSet* set = entry.out_set(); 3564 bool first = true; 3565 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3566 if (set->Get(i)) { 3567 if (first) { 3568 first = false; 3569 } else { 3570 stream()->Add(", "); 3571 } 3572 stream()->Add("%i", i); 3573 } 3574 } 3575 stream()->Add("}\n"); 3576 } 3577 3578 3579 void DispatchTable::Dump() { 3580 HeapStringAllocator alloc; 3581 StringStream stream(&alloc); 3582 DispatchTableDumper dumper(&stream); 3583 tree()->ForEach(&dumper); 3584 OS::PrintError("%s", *stream.ToCString()); 3585 } 3586 3587 3588 void RegExpEngine::DotPrint(const char* label, 3589 RegExpNode* node, 3590 bool ignore_case) { 3591 DotPrinter printer(ignore_case); 3592 printer.PrintNode(label, node); 3593 } 3594 3595 3596 #endif // DEBUG 3597 3598 3599 // ------------------------------------------------------------------- 3600 // Tree to graph conversion 3601 3602 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 3603 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, 3604 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF }; 3605 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); 3606 3607 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' }; 3608 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); 3609 3610 static const uc16 kDigitRanges[] = { '0', '9' }; 3611 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); 3612 3613 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D, 3614 0x2028, 0x2029 }; 3615 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); 3616 3617 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 3618 RegExpNode* on_success) { 3619 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 3620 elms->Add(TextElement::Atom(this)); 3621 return new TextNode(elms, on_success); 3622 } 3623 3624 3625 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 3626 RegExpNode* on_success) { 3627 return new TextNode(elements(), on_success); 3628 } 3629 3630 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 3631 const uc16* special_class, 3632 int length) { 3633 ASSERT(ranges->length() != 0); 3634 ASSERT(length != 0); 3635 ASSERT(special_class[0] != 0); 3636 if (ranges->length() != (length >> 1) + 1) { 3637 return false; 3638 } 3639 CharacterRange range = ranges->at(0); 3640 if (range.from() != 0) { 3641 return false; 3642 } 3643 for (int i =