1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "compiler.h" 32 #include "execution.h" 33 #include "factory.h" 34 #include "jsregexp.h" 35 #include "platform.h" 36 #include "string-search.h" 37 #include "runtime.h" 38 #include "compilation-cache.h" 39 #include "string-stream.h" 40 #include "parser.h" 41 #include "regexp-macro-assembler.h" 42 #include "regexp-macro-assembler-tracer.h" 43 #include "regexp-macro-assembler-irregexp.h" 44 #include "regexp-stack.h" 45 46 #ifndef V8_INTERPRETED_REGEXP 47 #if V8_TARGET_ARCH_IA32 48 #include "ia32/regexp-macro-assembler-ia32.h" 49 #elif V8_TARGET_ARCH_X64 50 #include "x64/regexp-macro-assembler-x64.h" 51 #elif V8_TARGET_ARCH_ARM 52 #include "arm/regexp-macro-assembler-arm.h" 53 #elif V8_TARGET_ARCH_MIPS 54 #include "mips/regexp-macro-assembler-mips.h" 55 #else 56 #error Unsupported target architecture. 57 #endif 58 #endif 59 60 #include "interpreter-irregexp.h" 61 62 63 namespace v8 { 64 namespace internal { 65 66 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor, 67 Handle<String> pattern, 68 Handle<String> flags, 69 bool* has_pending_exception) { 70 // Call the construct code with 2 arguments. 71 Handle<Object> argv[] = { pattern, flags }; 72 return Execution::New(constructor, ARRAY_SIZE(argv), argv, 73 has_pending_exception); 74 } 75 76 77 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 78 int flags = JSRegExp::NONE; 79 for (int i = 0; i < str->length(); i++) { 80 switch (str->Get(i)) { 81 case 'i': 82 flags |= JSRegExp::IGNORE_CASE; 83 break; 84 case 'g': 85 flags |= JSRegExp::GLOBAL; 86 break; 87 case 'm': 88 flags |= JSRegExp::MULTILINE; 89 break; 90 } 91 } 92 return JSRegExp::Flags(flags); 93 } 94 95 96 static inline void ThrowRegExpException(Handle<JSRegExp> re, 97 Handle<String> pattern, 98 Handle<String> error_text, 99 const char* message) { 100 Isolate* isolate = re->GetIsolate(); 101 Factory* factory = isolate->factory(); 102 Handle<FixedArray> elements = factory->NewFixedArray(2); 103 elements->set(0, *pattern); 104 elements->set(1, *error_text); 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 107 isolate->Throw(*regexp_err); 108 } 109 110 111 // Generic RegExp methods. Dispatches to implementation specific methods. 112 113 114 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 115 Handle<String> pattern, 116 Handle<String> flag_str) { 117 Isolate* isolate = re->GetIsolate(); 118 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 119 CompilationCache* compilation_cache = isolate->compilation_cache(); 120 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags); 121 bool in_cache = !cached.is_null(); 122 LOG(isolate, RegExpCompileEvent(re, in_cache)); 123 124 Handle<Object> result; 125 if (in_cache) { 126 re->set_data(*cached); 127 return re; 128 } 129 pattern = FlattenGetString(pattern); 130 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); 131 PostponeInterruptsScope postpone(isolate); 132 RegExpCompileData parse_result; 133 FlatStringReader reader(isolate, pattern); 134 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 135 &parse_result)) { 136 // Throw an exception if we fail to parse the pattern. 137 ThrowRegExpException(re, 138 pattern, 139 parse_result.error, 140 "malformed_regexp"); 141 return Handle<Object>::null(); 142 } 143 144 if (parse_result.simple && !flags.is_ignore_case()) { 145 // Parse-tree is a single atom that is equal to the pattern. 146 AtomCompile(re, pattern, flags, pattern); 147 } else if (parse_result.tree->IsAtom() && 148 !flags.is_ignore_case() && 149 parse_result.capture_count == 0) { 150 RegExpAtom* atom = parse_result.tree->AsAtom(); 151 Vector<const uc16> atom_pattern = atom->data(); 152 Handle<String> atom_string = 153 isolate->factory()->NewStringFromTwoByte(atom_pattern); 154 AtomCompile(re, pattern, flags, atom_string); 155 } else { 156 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 157 } 158 ASSERT(re->data()->IsFixedArray()); 159 // Compilation succeeded so the data is set on the regexp 160 // and we can store it in the cache. 161 Handle<FixedArray> data(FixedArray::cast(re->data())); 162 compilation_cache->PutRegExp(pattern, flags, data); 163 164 return re; 165 } 166 167 168 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 169 Handle<String> subject, 170 int index, 171 Handle<JSArray> last_match_info) { 172 switch (regexp->TypeTag()) { 173 case JSRegExp::ATOM: 174 return AtomExec(regexp, subject, index, last_match_info); 175 case JSRegExp::IRREGEXP: { 176 Handle<Object> result = 177 IrregexpExec(regexp, subject, index, last_match_info); 178 ASSERT(!result.is_null() || 179 regexp->GetIsolate()->has_pending_exception()); 180 return result; 181 } 182 default: 183 UNREACHABLE(); 184 return Handle<Object>::null(); 185 } 186 } 187 188 189 // RegExp Atom implementation: Simple string search using indexOf. 190 191 192 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 193 Handle<String> pattern, 194 JSRegExp::Flags flags, 195 Handle<String> match_pattern) { 196 re->GetIsolate()->factory()->SetRegExpAtomData(re, 197 JSRegExp::ATOM, 198 pattern, 199 flags, 200 match_pattern); 201 } 202 203 204 static void SetAtomLastCapture(FixedArray* array, 205 String* subject, 206 int from, 207 int to) { 208 NoHandleAllocation no_handles; 209 RegExpImpl::SetLastCaptureCount(array, 2); 210 RegExpImpl::SetLastSubject(array, subject); 211 RegExpImpl::SetLastInput(array, subject); 212 RegExpImpl::SetCapture(array, 0, from); 213 RegExpImpl::SetCapture(array, 1, to); 214 } 215 216 217 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 218 Handle<String> subject, 219 int index, 220 Handle<JSArray> last_match_info) { 221 Isolate* isolate = re->GetIsolate(); 222 223 ASSERT(0 <= index); 224 ASSERT(index <= subject->length()); 225 226 if (!subject->IsFlat()) FlattenString(subject); 227 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid 228 229 String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)); 230 int needle_len = needle->length(); 231 ASSERT(needle->IsFlat()); 232 233 if (needle_len != 0) { 234 if (index + needle_len > subject->length()) { 235 return isolate->factory()->null_value(); 236 } 237 238 String::FlatContent needle_content = needle->GetFlatContent(); 239 String::FlatContent subject_content = subject->GetFlatContent(); 240 ASSERT(needle_content.IsFlat()); 241 ASSERT(subject_content.IsFlat()); 242 // dispatch on type of strings 243 index = (needle_content.IsAscii() 244 ? (subject_content.IsAscii() 245 ? SearchString(isolate, 246 subject_content.ToAsciiVector(), 247 needle_content.ToAsciiVector(), 248 index) 249 : SearchString(isolate, 250 subject_content.ToUC16Vector(), 251 needle_content.ToAsciiVector(), 252 index)) 253 : (subject_content.IsAscii() 254 ? SearchString(isolate, 255 subject_content.ToAsciiVector(), 256 needle_content.ToUC16Vector(), 257 index) 258 : SearchString(isolate, 259 subject_content.ToUC16Vector(), 260 needle_content.ToUC16Vector(), 261 index))); 262 if (index == -1) return isolate->factory()->null_value(); 263 } 264 ASSERT(last_match_info->HasFastElements()); 265 266 { 267 NoHandleAllocation no_handles; 268 FixedArray* array = FixedArray::cast(last_match_info->elements()); 269 SetAtomLastCapture(array, *subject, index, index + needle_len); 270 } 271 return last_match_info; 272 } 273 274 275 // Irregexp implementation. 276 277 // Ensures that the regexp object contains a compiled version of the 278 // source for either ASCII or non-ASCII strings. 279 // If the compiled version doesn't already exist, it is compiled 280 // from the source pattern. 281 // If compilation fails, an exception is thrown and this function 282 // returns false. 283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { 284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 285 #ifdef V8_INTERPRETED_REGEXP 286 if (compiled_code->IsByteArray()) return true; 287 #else // V8_INTERPRETED_REGEXP (RegExp native code) 288 if (compiled_code->IsCode()) return true; 289 #endif 290 // We could potentially have marked this as flushable, but have kept 291 // a saved version if we did not flush it yet. 292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 293 if (saved_code->IsCode()) { 294 // Reinstate the code in the original place. 295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); 296 ASSERT(compiled_code->IsSmi()); 297 return true; 298 } 299 return CompileIrregexp(re, is_ascii); 300 } 301 302 303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 304 bool is_ascii, 305 Handle<String> error_message, 306 Isolate* isolate) { 307 Factory* factory = isolate->factory(); 308 Handle<FixedArray> elements = factory->NewFixedArray(2); 309 elements->set(0, re->Pattern()); 310 elements->set(1, *error_message); 311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 312 Handle<Object> regexp_err = 313 factory->NewSyntaxError("malformed_regexp", array); 314 isolate->Throw(*regexp_err); 315 return false; 316 } 317 318 319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { 320 // Compile the RegExp. 321 Isolate* isolate = re->GetIsolate(); 322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); 323 PostponeInterruptsScope postpone(isolate); 324 // If we had a compilation error the last time this is saved at the 325 // saved code index. 326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 327 // When arriving here entry can only be a smi, either representing an 328 // uncompiled regexp, a previous compilation error, or code that has 329 // been flushed. 330 ASSERT(entry->IsSmi()); 331 int entry_value = Smi::cast(entry)->value(); 332 ASSERT(entry_value == JSRegExp::kUninitializedValue || 333 entry_value == JSRegExp::kCompilationErrorValue || 334 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 335 336 if (entry_value == JSRegExp::kCompilationErrorValue) { 337 // A previous compilation failed and threw an error which we store in 338 // the saved code index (we store the error message, not the actual 339 // error). Recreate the error object and throw it. 340 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 341 ASSERT(error_string->IsString()); 342 Handle<String> error_message(String::cast(error_string)); 343 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 344 return false; 345 } 346 347 JSRegExp::Flags flags = re->GetFlags(); 348 349 Handle<String> pattern(re->Pattern()); 350 if (!pattern->IsFlat()) FlattenString(pattern); 351 RegExpCompileData compile_data; 352 FlatStringReader reader(isolate, pattern); 353 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 354 &compile_data)) { 355 // Throw an exception if we fail to parse the pattern. 356 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 357 ThrowRegExpException(re, 358 pattern, 359 compile_data.error, 360 "malformed_regexp"); 361 return false; 362 } 363 RegExpEngine::CompilationResult result = 364 RegExpEngine::Compile(&compile_data, 365 flags.is_ignore_case(), 366 flags.is_multiline(), 367 pattern, 368 is_ascii); 369 if (result.error_message != NULL) { 370 // Unable to compile regexp. 371 Handle<String> error_message = 372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); 373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 374 return false; 375 } 376 377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 378 data->set(JSRegExp::code_index(is_ascii), result.code); 379 int register_max = IrregexpMaxRegisterCount(*data); 380 if (result.num_registers > register_max) { 381 SetIrregexpMaxRegisterCount(*data, result.num_registers); 382 } 383 384 return true; 385 } 386 387 388 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 389 return Smi::cast( 390 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 391 } 392 393 394 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 395 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 396 } 397 398 399 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 400 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 401 } 402 403 404 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 405 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 406 } 407 408 409 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 410 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 411 } 412 413 414 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 415 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 416 } 417 418 419 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 420 Handle<String> pattern, 421 JSRegExp::Flags flags, 422 int capture_count) { 423 // Initialize compiled code entries to null. 424 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 425 JSRegExp::IRREGEXP, 426 pattern, 427 flags, 428 capture_count); 429 } 430 431 432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 433 Handle<String> subject) { 434 if (!subject->IsFlat()) FlattenString(subject); 435 436 // Check the asciiness of the underlying storage. 437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); 438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; 439 440 #ifdef V8_INTERPRETED_REGEXP 441 // Byte-code regexp needs space allocated for all its registers. 442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); 443 #else // V8_INTERPRETED_REGEXP 444 // Native regexp only needs room to output captures. Registers are handled 445 // internally. 446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 447 #endif // V8_INTERPRETED_REGEXP 448 } 449 450 451 RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce( 452 Handle<JSRegExp> regexp, 453 Handle<String> subject, 454 int index, 455 Vector<int> output) { 456 Isolate* isolate = regexp->GetIsolate(); 457 458 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 459 460 ASSERT(index >= 0); 461 ASSERT(index <= subject->length()); 462 ASSERT(subject->IsFlat()); 463 464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); 465 466 #ifndef V8_INTERPRETED_REGEXP 467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 468 do { 469 EnsureCompiledIrregexp(regexp, is_ascii); 470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 471 NativeRegExpMacroAssembler::Result res = 472 NativeRegExpMacroAssembler::Match(code, 473 subject, 474 output.start(), 475 output.length(), 476 index, 477 isolate); 478 if (res != NativeRegExpMacroAssembler::RETRY) { 479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 480 isolate->has_pending_exception()); 481 STATIC_ASSERT( 482 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 483 STATIC_ASSERT( 484 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 485 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 486 == RE_EXCEPTION); 487 return static_cast<IrregexpResult>(res); 488 } 489 // If result is RETRY, the string has changed representation, and we 490 // must restart from scratch. 491 // In this case, it means we must make sure we are prepared to handle 492 // the, potentially, different subject (the string can switch between 493 // being internal and external, and even between being ASCII and UC16, 494 // but the characters are always the same). 495 IrregexpPrepare(regexp, subject); 496 is_ascii = subject->IsAsciiRepresentationUnderneath(); 497 } while (true); 498 UNREACHABLE(); 499 return RE_EXCEPTION; 500 #else // V8_INTERPRETED_REGEXP 501 502 ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp)); 503 // We must have done EnsureCompiledIrregexp, so we can get the number of 504 // registers. 505 int* register_vector = output.start(); 506 int number_of_capture_registers = 507 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 508 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 509 register_vector[i] = -1; 510 } 511 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); 512 513 IrregexpResult result = IrregexpInterpreter::Match(isolate, 514 byte_codes, 515 subject, 516 register_vector, 517 index); 518 if (result == RE_EXCEPTION) { 519 ASSERT(!isolate->has_pending_exception()); 520 isolate->StackOverflow(); 521 } 522 return result; 523 #endif // V8_INTERPRETED_REGEXP 524 } 525 526 527 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp, 528 Handle<String> subject, 529 int previous_index, 530 Handle<JSArray> last_match_info) { 531 Isolate* isolate = jsregexp->GetIsolate(); 532 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP); 533 534 // Prepare space for the return values. 535 #ifdef V8_INTERPRETED_REGEXP 536 #ifdef DEBUG 537 if (FLAG_trace_regexp_bytecodes) { 538 String* pattern = jsregexp->Pattern(); 539 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); 540 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString())); 541 } 542 #endif 543 #endif 544 int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject); 545 if (required_registers < 0) { 546 // Compiling failed with an exception. 547 ASSERT(isolate->has_pending_exception()); 548 return Handle<Object>::null(); 549 } 550 551 OffsetsVector registers(required_registers, isolate); 552 553 IrregexpResult res = RegExpImpl::IrregexpExecOnce( 554 jsregexp, subject, previous_index, Vector<int>(registers.vector(), 555 registers.length())); 556 if (res == RE_SUCCESS) { 557 int capture_register_count = 558 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2; 559 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead); 560 AssertNoAllocation no_gc; 561 int* register_vector = registers.vector(); 562 FixedArray* array = FixedArray::cast(last_match_info->elements()); 563 for (int i = 0; i < capture_register_count; i += 2) { 564 SetCapture(array, i, register_vector[i]); 565 SetCapture(array, i + 1, register_vector[i + 1]); 566 } 567 SetLastCaptureCount(array, capture_register_count); 568 SetLastSubject(array, *subject); 569 SetLastInput(array, *subject); 570 return last_match_info; 571 } 572 if (res == RE_EXCEPTION) { 573 ASSERT(isolate->has_pending_exception()); 574 return Handle<Object>::null(); 575 } 576 ASSERT(res == RE_FAILURE); 577 return isolate->factory()->null_value(); 578 } 579 580 581 // ------------------------------------------------------------------- 582 // Implementation of the Irregexp regular expression engine. 583 // 584 // The Irregexp regular expression engine is intended to be a complete 585 // implementation of ECMAScript regular expressions. It generates either 586 // bytecodes or native code. 587 588 // The Irregexp regexp engine is structured in three steps. 589 // 1) The parser generates an abstract syntax tree. See ast.cc. 590 // 2) From the AST a node network is created. The nodes are all 591 // subclasses of RegExpNode. The nodes represent states when 592 // executing a regular expression. Several optimizations are 593 // performed on the node network. 594 // 3) From the nodes we generate either byte codes or native code 595 // that can actually execute the regular expression (perform 596 // the search). The code generation step is described in more 597 // detail below. 598 599 // Code generation. 600 // 601 // The nodes are divided into four main categories. 602 // * Choice nodes 603 // These represent places where the regular expression can 604 // match in more than one way. For example on entry to an 605 // alternation (foo|bar) or a repetition (*, +, ? or {}). 606 // * Action nodes 607 // These represent places where some action should be 608 // performed. Examples include recording the current position 609 // in the input string to a register (in order to implement 610 // captures) or other actions on register for example in order 611 // to implement the counters needed for {} repetitions. 612 // * Matching nodes 613 // These attempt to match some element part of the input string. 614 // Examples of elements include character classes, plain strings 615 // or back references. 616 // * End nodes 617 // These are used to implement the actions required on finding 618 // a successful match or failing to find a match. 619 // 620 // The code generated (whether as byte codes or native code) maintains 621 // some state as it runs. This consists of the following elements: 622 // 623 // * The capture registers. Used for string captures. 624 // * Other registers. Used for counters etc. 625 // * The current position. 626 // * The stack of backtracking information. Used when a matching node 627 // fails to find a match and needs to try an alternative. 628 // 629 // Conceptual regular expression execution model: 630 // 631 // There is a simple conceptual model of regular expression execution 632 // which will be presented first. The actual code generated is a more 633 // efficient simulation of the simple conceptual model: 634 // 635 // * Choice nodes are implemented as follows: 636 // For each choice except the last { 637 // push current position 638 // push backtrack code location 639 // <generate code to test for choice> 640 // backtrack code location: 641 // pop current position 642 // } 643 // <generate code to test for last choice> 644 // 645 // * Actions nodes are generated as follows 646 // <push affected registers on backtrack stack> 647 // <generate code to perform action> 648 // push backtrack code location 649 // <generate code to test for following nodes> 650 // backtrack code location: 651 // <pop affected registers to restore their state> 652 // <pop backtrack location from stack and go to it> 653 // 654 // * Matching nodes are generated as follows: 655 // if input string matches at current position 656 // update current position 657 // <generate code to test for following nodes> 658 // else 659 // <pop backtrack location from stack and go to it> 660 // 661 // Thus it can be seen that the current position is saved and restored 662 // by the choice nodes, whereas the registers are saved and restored by 663 // by the action nodes that manipulate them. 664 // 665 // The other interesting aspect of this model is that nodes are generated 666 // at the point where they are needed by a recursive call to Emit(). If 667 // the node has already been code generated then the Emit() call will 668 // generate a jump to the previously generated code instead. In order to 669 // limit recursion it is possible for the Emit() function to put the node 670 // on a work list for later generation and instead generate a jump. The 671 // destination of the jump is resolved later when the code is generated. 672 // 673 // Actual regular expression code generation. 674 // 675 // Code generation is actually more complicated than the above. In order 676 // to improve the efficiency of the generated code some optimizations are 677 // performed 678 // 679 // * Choice nodes have 1-character lookahead. 680 // A choice node looks at the following character and eliminates some of 681 // the choices immediately based on that character. This is not yet 682 // implemented. 683 // * Simple greedy loops store reduced backtracking information. 684 // A quantifier like /.*foo/m will greedily match the whole input. It will 685 // then need to backtrack to a point where it can match "foo". The naive 686 // implementation of this would push each character position onto the 687 // backtracking stack, then pop them off one by one. This would use space 688 // proportional to the length of the input string. However since the "." 689 // can only match in one way and always has a constant length (in this case 690 // of 1) it suffices to store the current position on the top of the stack 691 // once. Matching now becomes merely incrementing the current position and 692 // backtracking becomes decrementing the current position and checking the 693 // result against the stored current position. This is faster and saves 694 // space. 695 // * The current state is virtualized. 696 // This is used to defer expensive operations until it is clear that they 697 // are needed and to generate code for a node more than once, allowing 698 // specialized an efficient versions of the code to be created. This is 699 // explained in the section below. 700 // 701 // Execution state virtualization. 702 // 703 // Instead of emitting code, nodes that manipulate the state can record their 704 // manipulation in an object called the Trace. The Trace object can record a 705 // current position offset, an optional backtrack code location on the top of 706 // the virtualized backtrack stack and some register changes. When a node is 707 // to be emitted it can flush the Trace or update it. Flushing the Trace 708 // will emit code to bring the actual state into line with the virtual state. 709 // Avoiding flushing the state can postpone some work (e.g. updates of capture 710 // registers). Postponing work can save time when executing the regular 711 // expression since it may be found that the work never has to be done as a 712 // failure to match can occur. In addition it is much faster to jump to a 713 // known backtrack code location than it is to pop an unknown backtrack 714 // location from the stack and jump there. 715 // 716 // The virtual state found in the Trace affects code generation. For example 717 // the virtual state contains the difference between the actual current 718 // position and the virtual current position, and matching code needs to use 719 // this offset to attempt a match in the correct location of the input 720 // string. Therefore code generated for a non-trivial trace is specialized 721 // to that trace. The code generator therefore has the ability to generate 722 // code for each node several times. In order to limit the size of the 723 // generated code there is an arbitrary limit on how many specialized sets of 724 // code may be generated for a given node. If the limit is reached, the 725 // trace is flushed and a generic version of the code for a node is emitted. 726 // This is subsequently used for that node. The code emitted for non-generic 727 // trace is not recorded in the node and so it cannot currently be reused in 728 // the event that code generation is requested for an identical trace. 729 730 731 void RegExpTree::AppendToText(RegExpText* text) { 732 UNREACHABLE(); 733 } 734 735 736 void RegExpAtom::AppendToText(RegExpText* text) { 737 text->AddElement(TextElement::Atom(this)); 738 } 739 740 741 void RegExpCharacterClass::AppendToText(RegExpText* text) { 742 text->AddElement(TextElement::CharClass(this)); 743 } 744 745 746 void RegExpText::AppendToText(RegExpText* text) { 747 for (int i = 0; i < elements()->length(); i++) 748 text->AddElement(elements()->at(i)); 749 } 750 751 752 TextElement TextElement::Atom(RegExpAtom* atom) { 753 TextElement result = TextElement(ATOM); 754 result.data.u_atom = atom; 755 return result; 756 } 757 758 759 TextElement TextElement::CharClass( 760 RegExpCharacterClass* char_class) { 761 TextElement result = TextElement(CHAR_CLASS); 762 result.data.u_char_class = char_class; 763 return result; 764 } 765 766 767 int TextElement::length() { 768 if (type == ATOM) { 769 return data.u_atom->length(); 770 } else { 771 ASSERT(type == CHAR_CLASS); 772 return 1; 773 } 774 } 775 776 777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 778 if (table_ == NULL) { 779 table_ = new DispatchTable(); 780 DispatchTableConstructor cons(table_, ignore_case); 781 cons.BuildTable(this); 782 } 783 return table_; 784 } 785 786 787 class RegExpCompiler { 788 public: 789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); 790 791 int AllocateRegister() { 792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 793 reg_exp_too_big_ = true; 794 return next_register_; 795 } 796 return next_register_++; 797 } 798 799 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 800 RegExpNode* start, 801 int capture_count, 802 Handle<String> pattern); 803 804 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 805 806 static const int kImplementationOffset = 0; 807 static const int kNumberOfRegistersOffset = 0; 808 static const int kCodeOffset = 1; 809 810 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 811 EndNode* accept() { return accept_; } 812 813 static const int kMaxRecursion = 100; 814 inline int recursion_depth() { return recursion_depth_; } 815 inline void IncrementRecursionDepth() { recursion_depth_++; } 816 inline void DecrementRecursionDepth() { recursion_depth_--; } 817 818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 819 820 inline bool ignore_case() { return ignore_case_; } 821 inline bool ascii() { return ascii_; } 822 823 int current_expansion_factor() { return current_expansion_factor_; } 824 void set_current_expansion_factor(int value) { 825 current_expansion_factor_ = value; 826 } 827 828 static const int kNoRegister = -1; 829 830 private: 831 EndNode* accept_; 832 int next_register_; 833 List<RegExpNode*>* work_list_; 834 int recursion_depth_; 835 RegExpMacroAssembler* macro_assembler_; 836 bool ignore_case_; 837 bool ascii_; 838 bool reg_exp_too_big_; 839 int current_expansion_factor_; 840 }; 841 842 843 class RecursionCheck { 844 public: 845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 846 compiler->IncrementRecursionDepth(); 847 } 848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 849 private: 850 RegExpCompiler* compiler_; 851 }; 852 853 854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 855 return RegExpEngine::CompilationResult("RegExp too big"); 856 } 857 858 859 // Attempts to compile the regexp using an Irregexp code generator. Returns 860 // a fixed array or a null handle depending on whether it succeeded. 861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) 862 : next_register_(2 * (capture_count + 1)), 863 work_list_(NULL), 864 recursion_depth_(0), 865 ignore_case_(ignore_case), 866 ascii_(ascii), 867 reg_exp_too_big_(false), 868 current_expansion_factor_(1) { 869 accept_ = new EndNode(EndNode::ACCEPT); 870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 871 } 872 873 874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 875 RegExpMacroAssembler* macro_assembler, 876 RegExpNode* start, 877 int capture_count, 878 Handle<String> pattern) { 879 Heap* heap = pattern->GetHeap(); 880 881 bool use_slow_safe_regexp_compiler = false; 882 if (heap->total_regexp_code_generated() > 883 RegExpImpl::kRegWxpCompiledLimit && 884 heap->isolate()->memory_allocator()->SizeExecutable() > 885 RegExpImpl::kRegExpExecutableMemoryLimit) { 886 use_slow_safe_regexp_compiler = true; 887 } 888 889 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 890 891 #ifdef DEBUG 892 if (FLAG_trace_regexp_assembler) 893 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 894 else 895 #endif 896 macro_assembler_ = macro_assembler; 897 898 List <RegExpNode*> work_list(0); 899 work_list_ = &work_list; 900 Label fail; 901 macro_assembler_->PushBacktrack(&fail); 902 Trace new_trace; 903 start->Emit(this, &new_trace); 904 macro_assembler_->Bind(&fail); 905 macro_assembler_->Fail(); 906 while (!work_list.is_empty()) { 907 work_list.RemoveLast()->Emit(this, &new_trace); 908 } 909 if (reg_exp_too_big_) return IrregexpRegExpTooBig(); 910 911 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 912 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 913 work_list_ = NULL; 914 #ifdef DEBUG 915 if (FLAG_print_code) { 916 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString()); 917 } 918 if (FLAG_trace_regexp_assembler) { 919 delete macro_assembler_; 920 } 921 #endif 922 return RegExpEngine::CompilationResult(*code, next_register_); 923 } 924 925 926 bool Trace::DeferredAction::Mentions(int that) { 927 if (type() == ActionNode::CLEAR_CAPTURES) { 928 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 929 return range.Contains(that); 930 } else { 931 return reg() == that; 932 } 933 } 934 935 936 bool Trace::mentions_reg(int reg) { 937 for (DeferredAction* action = actions_; 938 action != NULL; 939 action = action->next()) { 940 if (action->Mentions(reg)) 941 return true; 942 } 943 return false; 944 } 945 946 947 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 948 ASSERT_EQ(0, *cp_offset); 949 for (DeferredAction* action = actions_; 950 action != NULL; 951 action = action->next()) { 952 if (action->Mentions(reg)) { 953 if (action->type() == ActionNode::STORE_POSITION) { 954 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 955 return true; 956 } else { 957 return false; 958 } 959 } 960 } 961 return false; 962 } 963 964 965 int Trace::FindAffectedRegisters(OutSet* affected_registers) { 966 int max_register = RegExpCompiler::kNoRegister; 967 for (DeferredAction* action = actions_; 968 action != NULL; 969 action = action->next()) { 970 if (action->type() == ActionNode::CLEAR_CAPTURES) { 971 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 972 for (int i = range.from(); i <= range.to(); i++) 973 affected_registers->Set(i); 974 if (range.to() > max_register) max_register = range.to(); 975 } else { 976 affected_registers->Set(action->reg()); 977 if (action->reg() > max_register) max_register = action->reg(); 978 } 979 } 980 return max_register; 981 } 982 983 984 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 985 int max_register, 986 OutSet& registers_to_pop, 987 OutSet& registers_to_clear) { 988 for (int reg = max_register; reg >= 0; reg--) { 989 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg); 990 else if (registers_to_clear.Get(reg)) { 991 int clear_to = reg; 992 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 993 reg--; 994 } 995 assembler->ClearRegisters(reg, clear_to); 996 } 997 } 998 } 999 1000 1001 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1002 int max_register, 1003 OutSet& affected_registers, 1004 OutSet* registers_to_pop, 1005 OutSet* registers_to_clear) { 1006 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1007 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1008 1009 // Count pushes performed to force a stack limit check occasionally. 1010 int pushes = 0; 1011 1012 for (int reg = 0; reg <= max_register; reg++) { 1013 if (!affected_registers.Get(reg)) { 1014 continue; 1015 } 1016 1017 // The chronologically first deferred action in the trace 1018 // is used to infer the action needed to restore a register 1019 // to its previous state (or not, if it's safe to ignore it). 1020 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1021 DeferredActionUndoType undo_action = IGNORE; 1022 1023 int value = 0; 1024 bool absolute = false; 1025 bool clear = false; 1026 int store_position = -1; 1027 // This is a little tricky because we are scanning the actions in reverse 1028 // historical order (newest first). 1029 for (DeferredAction* action = actions_; 1030 action != NULL; 1031 action = action->next()) { 1032 if (action->Mentions(reg)) { 1033 switch (action->type()) { 1034 case ActionNode::SET_REGISTER: { 1035 Trace::DeferredSetRegister* psr = 1036 static_cast<Trace::DeferredSetRegister*>(action); 1037 if (!absolute) { 1038 value += psr->value(); 1039 absolute = true; 1040 } 1041 // SET_REGISTER is currently only used for newly introduced loop 1042 // counters. They can have a significant previous value if they 1043 // occour in a loop. TODO(lrn): Propagate this information, so 1044 // we can set undo_action to IGNORE if we know there is no value to 1045 // restore. 1046 undo_action = RESTORE; 1047 ASSERT_EQ(store_position, -1); 1048 ASSERT(!clear); 1049 break; 1050 } 1051 case ActionNode::INCREMENT_REGISTER: 1052 if (!absolute) { 1053 value++; 1054 } 1055 ASSERT_EQ(store_position, -1); 1056 ASSERT(!clear); 1057 undo_action = RESTORE; 1058 break; 1059 case ActionNode::STORE_POSITION: { 1060 Trace::DeferredCapture* pc = 1061 static_cast<Trace::DeferredCapture*>(action); 1062 if (!clear && store_position == -1) { 1063 store_position = pc->cp_offset(); 1064 } 1065 1066 // For captures we know that stores and clears alternate. 1067 // Other register, are never cleared, and if the occur 1068 // inside a loop, they might be assigned more than once. 1069 if (reg <= 1) { 1070 // Registers zero and one, aka "capture zero", is 1071 // always set correctly if we succeed. There is no 1072 // need to undo a setting on backtrack, because we 1073 // will set it again or fail. 1074 undo_action = IGNORE; 1075 } else { 1076 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1077 } 1078 ASSERT(!absolute); 1079 ASSERT_EQ(value, 0); 1080 break; 1081 } 1082 case ActionNode::CLEAR_CAPTURES: { 1083 // Since we're scanning in reverse order, if we've already 1084 // set the position we have to ignore historically earlier 1085 // clearing operations. 1086 if (store_position == -1) { 1087 clear = true; 1088 } 1089 undo_action = RESTORE; 1090 ASSERT(!absolute); 1091 ASSERT_EQ(value, 0); 1092 break; 1093 } 1094 default: 1095 UNREACHABLE(); 1096 break; 1097 } 1098 } 1099 } 1100 // Prepare for the undo-action (e.g., push if it's going to be popped). 1101 if (undo_action == RESTORE) { 1102 pushes++; 1103 RegExpMacroAssembler::StackCheckFlag stack_check = 1104 RegExpMacroAssembler::kNoStackLimitCheck; 1105 if (pushes == push_limit) { 1106 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1107 pushes = 0; 1108 } 1109 1110 assembler->PushRegister(reg, stack_check); 1111 registers_to_pop->Set(reg); 1112 } else if (undo_action == CLEAR) { 1113 registers_to_clear->Set(reg); 1114 } 1115 // Perform the chronologically last action (or accumulated increment) 1116 // for the register. 1117 if (store_position != -1) { 1118 assembler->WriteCurrentPositionToRegister(reg, store_position); 1119 } else if (clear) { 1120 assembler->ClearRegisters(reg, reg); 1121 } else if (absolute) { 1122 assembler->SetRegister(reg, value); 1123 } else if (value != 0) { 1124 assembler->AdvanceRegister(reg, value); 1125 } 1126 } 1127 } 1128 1129 1130 // This is called as we come into a loop choice node and some other tricky 1131 // nodes. It normalizes the state of the code generator to ensure we can 1132 // generate generic code. 1133 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1134 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1135 1136 ASSERT(!is_trivial()); 1137 1138 if (actions_ == NULL && backtrack() == NULL) { 1139 // Here we just have some deferred cp advances to fix and we are back to 1140 // a normal situation. We may also have to forget some information gained 1141 // through a quick check that was already performed. 1142 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1143 // Create a new trivial state and generate the node with that. 1144 Trace new_state; 1145 successor->Emit(compiler, &new_state); 1146 return; 1147 } 1148 1149 // Generate deferred actions here along with code to undo them again. 1150 OutSet affected_registers; 1151 1152 if (backtrack() != NULL) { 1153 // Here we have a concrete backtrack location. These are set up by choice 1154 // nodes and so they indicate that we have a deferred save of the current 1155 // position which we may need to emit here. 1156 assembler->PushCurrentPosition(); 1157 } 1158 1159 int max_register = FindAffectedRegisters(&affected_registers); 1160 OutSet registers_to_pop; 1161 OutSet registers_to_clear; 1162 PerformDeferredActions(assembler, 1163 max_register, 1164 affected_registers, 1165 ®isters_to_pop, 1166 ®isters_to_clear); 1167 if (cp_offset_ != 0) { 1168 assembler->AdvanceCurrentPosition(cp_offset_); 1169 } 1170 1171 // Create a new trivial state and generate the node with that. 1172 Label undo; 1173 assembler->PushBacktrack(&undo); 1174 Trace new_state; 1175 successor->Emit(compiler, &new_state); 1176 1177 // On backtrack we need to restore state. 1178 assembler->Bind(&undo); 1179 RestoreAffectedRegisters(assembler, 1180 max_register, 1181 registers_to_pop, 1182 registers_to_clear); 1183 if (backtrack() == NULL) { 1184 assembler->Backtrack(); 1185 } else { 1186 assembler->PopCurrentPosition(); 1187 assembler->GoTo(backtrack()); 1188 } 1189 } 1190 1191 1192 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1193 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1194 1195 // Omit flushing the trace. We discard the entire stack frame anyway. 1196 1197 if (!label()->is_bound()) { 1198 // We are completely independent of the trace, since we ignore it, 1199 // so this code can be used as the generic version. 1200 assembler->Bind(label()); 1201 } 1202 1203 // Throw away everything on the backtrack stack since the start 1204 // of the negative submatch and restore the character position. 1205 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1206 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1207 if (clear_capture_count_ > 0) { 1208 // Clear any captures that might have been performed during the success 1209 // of the body of the negative look-ahead. 1210 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1211 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1212 } 1213 // Now that we have unwound the stack we find at the top of the stack the 1214 // backtrack that the BeginSubmatch node got. 1215 assembler->Backtrack(); 1216 } 1217 1218 1219 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1220 if (!trace->is_trivial()) { 1221 trace->Flush(compiler, this); 1222 return; 1223 } 1224 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1225 if (!label()->is_bound()) { 1226 assembler->Bind(label()); 1227 } 1228 switch (action_) { 1229 case ACCEPT: 1230 assembler->Succeed(); 1231 return; 1232 case BACKTRACK: 1233 assembler->GoTo(trace->backtrack()); 1234 return; 1235 case NEGATIVE_SUBMATCH_SUCCESS: 1236 // This case is handled in a different virtual method. 1237 UNREACHABLE(); 1238 } 1239 UNIMPLEMENTED(); 1240 } 1241 1242 1243 void GuardedAlternative::AddGuard(Guard* guard) { 1244 if (guards_ == NULL) 1245 guards_ = new ZoneList<Guard*>(1); 1246 guards_->Add(guard); 1247 } 1248 1249 1250 ActionNode* ActionNode::SetRegister(int reg, 1251 int val, 1252 RegExpNode* on_success) { 1253 ActionNode* result = new ActionNode(SET_REGISTER, on_success); 1254 result->data_.u_store_register.reg = reg; 1255 result->data_.u_store_register.value = val; 1256 return result; 1257 } 1258 1259 1260 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1261 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); 1262 result->data_.u_increment_register.reg = reg; 1263 return result; 1264 } 1265 1266 1267 ActionNode* ActionNode::StorePosition(int reg, 1268 bool is_capture, 1269 RegExpNode* on_success) { 1270 ActionNode* result = new ActionNode(STORE_POSITION, on_success); 1271 result->data_.u_position_register.reg = reg; 1272 result->data_.u_position_register.is_capture = is_capture; 1273 return result; 1274 } 1275 1276 1277 ActionNode* ActionNode::ClearCaptures(Interval range, 1278 RegExpNode* on_success) { 1279 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success); 1280 result->data_.u_clear_captures.range_from = range.from(); 1281 result->data_.u_clear_captures.range_to = range.to(); 1282 return result; 1283 } 1284 1285 1286 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1287 int position_reg, 1288 RegExpNode* on_success) { 1289 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); 1290 result->data_.u_submatch.stack_pointer_register = stack_reg; 1291 result->data_.u_submatch.current_position_register = position_reg; 1292 return result; 1293 } 1294 1295 1296 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1297 int position_reg, 1298 int clear_register_count, 1299 int clear_register_from, 1300 RegExpNode* on_success) { 1301 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1302 result->data_.u_submatch.stack_pointer_register = stack_reg; 1303 result->data_.u_submatch.current_position_register = position_reg; 1304 result->data_.u_submatch.clear_register_count = clear_register_count; 1305 result->data_.u_submatch.clear_register_from = clear_register_from; 1306 return result; 1307 } 1308 1309 1310 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1311 int repetition_register, 1312 int repetition_limit, 1313 RegExpNode* on_success) { 1314 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success); 1315 result->data_.u_empty_match_check.start_register = start_register; 1316 result->data_.u_empty_match_check.repetition_register = repetition_register; 1317 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1318 return result; 1319 } 1320 1321 1322 #define DEFINE_ACCEPT(Type) \ 1323 void Type##Node::Accept(NodeVisitor* visitor) { \ 1324 visitor->Visit##Type(this); \ 1325 } 1326 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1327 #undef DEFINE_ACCEPT 1328 1329 1330 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1331 visitor->VisitLoopChoice(this); 1332 } 1333 1334 1335 // ------------------------------------------------------------------- 1336 // Emit code. 1337 1338 1339 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1340 Guard* guard, 1341 Trace* trace) { 1342 switch (guard->op()) { 1343 case Guard::LT: 1344 ASSERT(!trace->mentions_reg(guard->reg())); 1345 macro_assembler->IfRegisterGE(guard->reg(), 1346 guard->value(), 1347 trace->backtrack()); 1348 break; 1349 case Guard::GEQ: 1350 ASSERT(!trace->mentions_reg(guard->reg())); 1351 macro_assembler->IfRegisterLT(guard->reg(), 1352 guard->value(), 1353 trace->backtrack()); 1354 break; 1355 } 1356 } 1357 1358 1359 // Returns the number of characters in the equivalence class, omitting those 1360 // that cannot occur in the source string because it is ASCII. 1361 static int GetCaseIndependentLetters(Isolate* isolate, 1362 uc16 character, 1363 bool ascii_subject, 1364 unibrow::uchar* letters) { 1365 int length = 1366 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1367 // Unibrow returns 0 or 1 for characters where case independence is 1368 // trivial. 1369 if (length == 0) { 1370 letters[0] = character; 1371 length = 1; 1372 } 1373 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { 1374 return length; 1375 } 1376 // The standard requires that non-ASCII characters cannot have ASCII 1377 // character codes in their equivalence class. 1378 return 0; 1379 } 1380 1381 1382 static inline bool EmitSimpleCharacter(Isolate* isolate, 1383 RegExpCompiler* compiler, 1384 uc16 c, 1385 Label* on_failure, 1386 int cp_offset, 1387 bool check, 1388 bool preloaded) { 1389 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1390 bool bound_checked = false; 1391 if (!preloaded) { 1392 assembler->LoadCurrentCharacter( 1393 cp_offset, 1394 on_failure, 1395 check); 1396 bound_checked = true; 1397 } 1398 assembler->CheckNotCharacter(c, on_failure); 1399 return bound_checked; 1400 } 1401 1402 1403 // Only emits non-letters (things that don't have case). Only used for case 1404 // independent matches. 1405 static inline bool EmitAtomNonLetter(Isolate* isolate, 1406 RegExpCompiler* compiler, 1407 uc16 c, 1408 Label* on_failure, 1409 int cp_offset, 1410 bool check, 1411 bool preloaded) { 1412 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1413 bool ascii = compiler->ascii(); 1414 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1415 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1416 if (length < 1) { 1417 // This can't match. Must be an ASCII subject and a non-ASCII character. 1418 // We do not need to do anything since the ASCII pass already handled this. 1419 return false; // Bounds not checked. 1420 } 1421 bool checked = false; 1422 // We handle the length > 1 case in a later pass. 1423 if (length == 1) { 1424 if (ascii && c > String::kMaxAsciiCharCodeU) { 1425 // Can't match - see above. 1426 return false; // Bounds not checked. 1427 } 1428 if (!preloaded) { 1429 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1430 checked = check; 1431 } 1432 macro_assembler->CheckNotCharacter(c, on_failure); 1433 } 1434 return checked; 1435 } 1436 1437 1438 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1439 bool ascii, 1440 uc16 c1, 1441 uc16 c2, 1442 Label* on_failure) { 1443 uc16 char_mask; 1444 if (ascii) { 1445 char_mask = String::kMaxAsciiCharCode; 1446 } else { 1447 char_mask = String::kMaxUtf16CodeUnit; 1448 } 1449 uc16 exor = c1 ^ c2; 1450 // Check whether exor has only one bit set. 1451 if (((exor - 1) & exor) == 0) { 1452 // If c1 and c2 differ only by one bit. 1453 // Ecma262UnCanonicalize always gives the highest number last. 1454 ASSERT(c2 > c1); 1455 uc16 mask = char_mask ^ exor; 1456 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1457 return true; 1458 } 1459 ASSERT(c2 > c1); 1460 uc16 diff = c2 - c1; 1461 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1462 // If the characters differ by 2^n but don't differ by one bit then 1463 // subtract the difference from the found character, then do the or 1464 // trick. We avoid the theoretical case where negative numbers are 1465 // involved in order to simplify code generation. 1466 uc16 mask = char_mask ^ diff; 1467 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1468 diff, 1469 mask, 1470 on_failure); 1471 return true; 1472 } 1473 return false; 1474 } 1475 1476 1477 typedef bool EmitCharacterFunction(Isolate* isolate, 1478 RegExpCompiler* compiler, 1479 uc16 c, 1480 Label* on_failure, 1481 int cp_offset, 1482 bool check, 1483 bool preloaded); 1484 1485 // Only emits letters (things that have case). Only used for case independent 1486 // matches. 1487 static inline bool EmitAtomLetter(Isolate* isolate, 1488 RegExpCompiler* compiler, 1489 uc16 c, 1490 Label* on_failure, 1491 int cp_offset, 1492 bool check, 1493 bool preloaded) { 1494 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1495 bool ascii = compiler->ascii(); 1496 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1497 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1498 if (length <= 1) return false; 1499 // We may not need to check against the end of the input string 1500 // if this character lies before a character that matched. 1501 if (!preloaded) { 1502 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1503 } 1504 Label ok; 1505 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1506 switch (length) { 1507 case 2: { 1508 if (ShortCutEmitCharacterPair(macro_assembler, 1509 ascii, 1510 chars[0], 1511 chars[1], 1512 on_failure)) { 1513 } else { 1514 macro_assembler->CheckCharacter(chars[0], &ok); 1515 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1516 macro_assembler->Bind(&ok); 1517 } 1518 break; 1519 } 1520 case 4: 1521 macro_assembler->CheckCharacter(chars[3], &ok); 1522 // Fall through! 1523 case 3: 1524 macro_assembler->CheckCharacter(chars[0], &ok); 1525 macro_assembler->CheckCharacter(chars[1], &ok); 1526 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1527 macro_assembler->Bind(&ok); 1528 break; 1529 default: 1530 UNREACHABLE(); 1531 break; 1532 } 1533 return true; 1534 } 1535 1536 1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1538 RegExpCharacterClass* cc, 1539 bool ascii, 1540 Label* on_failure, 1541 int cp_offset, 1542 bool check_offset, 1543 bool preloaded) { 1544 ZoneList<CharacterRange>* ranges = cc->ranges(); 1545 int max_char; 1546 if (ascii) { 1547 max_char = String::kMaxAsciiCharCode; 1548 } else { 1549 max_char = String::kMaxUtf16CodeUnit; 1550 } 1551 1552 Label success; 1553 1554 Label* char_is_in_class = 1555 cc->is_negated() ? on_failure : &success; 1556 1557 int range_count = ranges->length(); 1558 1559 int last_valid_range = range_count - 1; 1560 while (last_valid_range >= 0) { 1561 CharacterRange& range = ranges->at(last_valid_range); 1562 if (range.from() <= max_char) { 1563 break; 1564 } 1565 last_valid_range--; 1566 } 1567 1568 if (last_valid_range < 0) { 1569 if (!cc->is_negated()) { 1570 // TODO(plesner): We can remove this when the node level does our 1571 // ASCII optimizations for us. 1572 macro_assembler->GoTo(on_failure); 1573 } 1574 if (check_offset) { 1575 macro_assembler->CheckPosition(cp_offset, on_failure); 1576 } 1577 return; 1578 } 1579 1580 if (last_valid_range == 0 && 1581 !cc->is_negated() && 1582 ranges->at(0).IsEverything(max_char)) { 1583 // This is a common case hit by non-anchored expressions. 1584 if (check_offset) { 1585 macro_assembler->CheckPosition(cp_offset, on_failure); 1586 } 1587 return; 1588 } 1589 1590 if (!preloaded) { 1591 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1592 } 1593 1594 if (cc->is_standard() && 1595 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1596 on_failure)) { 1597 return; 1598 } 1599 1600 for (int i = 0; i < last_valid_range; i++) { 1601 CharacterRange& range = ranges->at(i); 1602 Label next_range; 1603 uc16 from = range.from(); 1604 uc16 to = range.to(); 1605 if (from > max_char) { 1606 continue; 1607 } 1608 if (to > max_char) to = max_char; 1609 if (to == from) { 1610 macro_assembler->CheckCharacter(to, char_is_in_class); 1611 } else { 1612 if (from != 0) { 1613 macro_assembler->CheckCharacterLT(from, &next_range); 1614 } 1615 if (to != max_char) { 1616 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); 1617 } else { 1618 macro_assembler->GoTo(char_is_in_class); 1619 } 1620 } 1621 macro_assembler->Bind(&next_range); 1622 } 1623 1624 CharacterRange& range = ranges->at(last_valid_range); 1625 uc16 from = range.from(); 1626 uc16 to = range.to(); 1627 1628 if (to > max_char) to = max_char; 1629 ASSERT(to >= from); 1630 1631 if (to == from) { 1632 if (cc->is_negated()) { 1633 macro_assembler->CheckCharacter(to, on_failure); 1634 } else { 1635 macro_assembler->CheckNotCharacter(to, on_failure); 1636 } 1637 } else { 1638 if (from != 0) { 1639 if (cc->is_negated()) { 1640 macro_assembler->CheckCharacterLT(from, &success); 1641 } else { 1642 macro_assembler->CheckCharacterLT(from, on_failure); 1643 } 1644 } 1645 if (to != String::kMaxUtf16CodeUnit) { 1646 if (cc->is_negated()) { 1647 macro_assembler->CheckCharacterLT(to + 1, on_failure); 1648 } else { 1649 macro_assembler->CheckCharacterGT(to, on_failure); 1650 } 1651 } else { 1652 if (cc->is_negated()) { 1653 macro_assembler->GoTo(on_failure); 1654 } 1655 } 1656 } 1657 macro_assembler->Bind(&success); 1658 } 1659 1660 1661 RegExpNode::~RegExpNode() { 1662 } 1663 1664 1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1666 Trace* trace) { 1667 // If we are generating a greedy loop then don't stop and don't reuse code. 1668 if (trace->stop_node() != NULL) { 1669 return CONTINUE; 1670 } 1671 1672 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1673 if (trace->is_trivial()) { 1674 if (label_.is_bound()) { 1675 // We are being asked to generate a generic version, but that's already 1676 // been done so just go to it. 1677 macro_assembler->GoTo(&label_); 1678 return DONE; 1679 } 1680 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 1681 // To avoid too deep recursion we push the node to the work queue and just 1682 // generate a goto here. 1683 compiler->AddWork(this); 1684 macro_assembler->GoTo(&label_); 1685 return DONE; 1686 } 1687 // Generate generic version of the node and bind the label for later use. 1688 macro_assembler->Bind(&label_); 1689 return CONTINUE; 1690 } 1691 1692 // We are being asked to make a non-generic version. Keep track of how many 1693 // non-generic versions we generate so as not to overdo it. 1694 trace_count_++; 1695 if (FLAG_regexp_optimization && 1696 trace_count_ < kMaxCopiesCodeGenerated && 1697 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 1698 return CONTINUE; 1699 } 1700 1701 // If we get here code has been generated for this node too many times or 1702 // recursion is too deep. Time to switch to a generic version. The code for 1703 // generic versions above can handle deep recursion properly. 1704 trace->Flush(compiler, this); 1705 return DONE; 1706 } 1707 1708 1709 int ActionNode::EatsAtLeast(int still_to_find, 1710 int recursion_depth, 1711 bool not_at_start) { 1712 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1713 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 1714 return on_success()->EatsAtLeast(still_to_find, 1715 recursion_depth + 1, 1716 not_at_start); 1717 } 1718 1719 1720 int AssertionNode::EatsAtLeast(int still_to_find, 1721 int recursion_depth, 1722 bool not_at_start) { 1723 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1724 // If we know we are not at the start and we are asked "how many characters 1725 // will you match if you succeed?" then we can answer anything since false 1726 // implies false. So lets just return the max answer (still_to_find) since 1727 // that won't prevent us from preloading a lot of characters for the other 1728 // branches in the node graph. 1729 if (type() == AT_START && not_at_start) return still_to_find; 1730 return on_success()->EatsAtLeast(still_to_find, 1731 recursion_depth + 1, 1732 not_at_start); 1733 } 1734 1735 1736 int BackReferenceNode::EatsAtLeast(int still_to_find, 1737 int recursion_depth, 1738 bool not_at_start) { 1739 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1740 return on_success()->EatsAtLeast(still_to_find, 1741 recursion_depth + 1, 1742 not_at_start); 1743 } 1744 1745 1746 int TextNode::EatsAtLeast(int still_to_find, 1747 int recursion_depth, 1748 bool not_at_start) { 1749 int answer = Length(); 1750 if (answer >= still_to_find) return answer; 1751 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer; 1752 // We are not at start after this node so we set the last argument to 'true'. 1753 return answer + on_success()->EatsAtLeast(still_to_find - answer, 1754 recursion_depth + 1, 1755 true); 1756 } 1757 1758 1759 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 1760 int recursion_depth, 1761 bool not_at_start) { 1762 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1763 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1764 // afterwards. 1765 RegExpNode* node = alternatives_->at(1).node(); 1766 return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start); 1767 } 1768 1769 1770 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 1771 QuickCheckDetails* details, 1772 RegExpCompiler* compiler, 1773 int filled_in, 1774 bool not_at_start) { 1775 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1776 // afterwards. 1777 RegExpNode* node = alternatives_->at(1).node(); 1778 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 1779 } 1780 1781 1782 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 1783 int recursion_depth, 1784 RegExpNode* ignore_this_node, 1785 bool not_at_start) { 1786 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1787 int min = 100; 1788 int choice_count = alternatives_->length(); 1789 for (int i = 0; i < choice_count; i++) { 1790 RegExpNode* node = alternatives_->at(i).node(); 1791 if (node == ignore_this_node) continue; 1792 int node_eats_at_least = node->EatsAtLeast(still_to_find, 1793 recursion_depth + 1, 1794 not_at_start); 1795 if (node_eats_at_least < min) min = node_eats_at_least; 1796 } 1797 return min; 1798 } 1799 1800 1801 int LoopChoiceNode::EatsAtLeast(int still_to_find, 1802 int recursion_depth, 1803 bool not_at_start) { 1804 return EatsAtLeastHelper(still_to_find, 1805 recursion_depth, 1806 loop_node_, 1807 not_at_start); 1808 } 1809 1810 1811 int ChoiceNode::EatsAtLeast(int still_to_find, 1812 int recursion_depth, 1813 bool not_at_start) { 1814 return EatsAtLeastHelper(still_to_find, 1815 recursion_depth, 1816 NULL, 1817 not_at_start); 1818 } 1819 1820 1821 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 1822 static inline uint32_t SmearBitsRight(uint32_t v) { 1823 v |= v >> 1; 1824 v |= v >> 2; 1825 v |= v >> 4; 1826 v |= v >> 8; 1827 v |= v >> 16; 1828 return v; 1829 } 1830 1831 1832 bool QuickCheckDetails::Rationalize(bool asc) { 1833 bool found_useful_op = false; 1834 uint32_t char_mask; 1835 if (asc) { 1836 char_mask = String::kMaxAsciiCharCode; 1837 } else { 1838 char_mask = String::kMaxUtf16CodeUnit; 1839 } 1840 mask_ = 0; 1841 value_ = 0; 1842 int char_shift = 0; 1843 for (int i = 0; i < characters_; i++) { 1844 Position* pos = &positions_[i]; 1845 if ((pos->mask & String::kMaxAsciiCharCode) != 0) { 1846 found_useful_op = true; 1847 } 1848 mask_ |= (pos->mask & char_mask) << char_shift; 1849 value_ |= (pos->value & char_mask) << char_shift; 1850 char_shift += asc ? 8 : 16; 1851 } 1852 return found_useful_op; 1853 } 1854 1855 1856 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 1857 Trace* trace, 1858 bool preload_has_checked_bounds, 1859 Label* on_possible_success, 1860 QuickCheckDetails* details, 1861 bool fall_through_on_failure) { 1862 if (details->characters() == 0) return false; 1863 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE); 1864 if (details->cannot_match()) return false; 1865 if (!details->Rationalize(compiler->ascii())) return false; 1866 ASSERT(details->characters() == 1 || 1867 compiler->macro_assembler()->CanReadUnaligned()); 1868 uint32_t mask = details->mask(); 1869 uint32_t value = details->value(); 1870 1871 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1872 1873 if (trace->characters_preloaded() != details->characters()) { 1874 assembler->LoadCurrentCharacter(trace->cp_offset(), 1875 trace->backtrack(), 1876 !preload_has_checked_bounds, 1877 details->characters()); 1878 } 1879 1880 1881 bool need_mask = true; 1882 1883 if (details->characters() == 1) { 1884 // If number of characters preloaded is 1 then we used a byte or 16 bit 1885 // load so the value is already masked down. 1886 uint32_t char_mask; 1887 if (compiler->ascii()) { 1888 char_mask = String::kMaxAsciiCharCode; 1889 } else { 1890 char_mask = String::kMaxUtf16CodeUnit; 1891 } 1892 if ((mask & char_mask) == char_mask) need_mask = false; 1893 mask &= char_mask; 1894 } else { 1895 // For 2-character preloads in ASCII mode or 1-character preloads in 1896 // TWO_BYTE mode we also use a 16 bit load with zero extend. 1897 if (details->characters() == 2 && compiler->ascii()) { 1898 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; 1899 } else if (details->characters() == 1 && !compiler->ascii()) { 1900 if ((mask & 0xffff) == 0xffff) need_mask = false; 1901 } else { 1902 if (mask == 0xffffffff) need_mask = false; 1903 } 1904 } 1905 1906 if (fall_through_on_failure) { 1907 if (need_mask) { 1908 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 1909 } else { 1910 assembler->CheckCharacter(value, on_possible_success); 1911 } 1912 } else { 1913 if (need_mask) { 1914 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 1915 } else { 1916 assembler->CheckNotCharacter(value, trace->backtrack()); 1917 } 1918 } 1919 return true; 1920 } 1921 1922 1923 // Here is the meat of GetQuickCheckDetails (see also the comment on the 1924 // super-class in the .h file). 1925 // 1926 // We iterate along the text object, building up for each character a 1927 // mask and value that can be used to test for a quick failure to match. 1928 // The masks and values for the positions will be combined into a single 1929 // machine word for the current character width in order to be used in 1930 // generating a quick check. 1931 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 1932 RegExpCompiler* compiler, 1933 int characters_filled_in, 1934 bool not_at_start) { 1935 Isolate* isolate = Isolate::Current(); 1936 ASSERT(characters_filled_in < details->characters()); 1937 int characters = details->characters(); 1938 int char_mask; 1939 if (compiler->ascii()) { 1940 char_mask = String::kMaxAsciiCharCode; 1941 } else { 1942 char_mask = String::kMaxUtf16CodeUnit; 1943 } 1944 for (int k = 0; k < elms_->length(); k++) { 1945 TextElement elm = elms_->at(k); 1946 if (elm.type == TextElement::ATOM) { 1947 Vector<const uc16> quarks = elm.data.u_atom->data(); 1948 for (int i = 0; i < characters && i < quarks.length(); i++) { 1949 QuickCheckDetails::Position* pos = 1950 details->positions(characters_filled_in); 1951 uc16 c = quarks[i]; 1952 if (c > char_mask) { 1953 // If we expect a non-ASCII character from an ASCII string, 1954 // there is no way we can match. Not even case independent 1955 // matching can turn an ASCII character into non-ASCII or 1956 // vice versa. 1957 details->set_cannot_match(); 1958 pos->determines_perfectly = false; 1959 return; 1960 } 1961 if (compiler->ignore_case()) { 1962 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1963 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 1964 chars); 1965 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 1966 if (length == 1) { 1967 // This letter has no case equivalents, so it's nice and simple 1968 // and the mask-compare will determine definitely whether we have 1969 // a match at this character position. 1970 pos->mask = char_mask; 1971 pos->value = c; 1972 pos->determines_perfectly = true; 1973 } else { 1974 uint32_t common_bits = char_mask; 1975 uint32_t bits = chars[0]; 1976 for (int j = 1; j < length; j++) { 1977 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 1978 common_bits ^= differing_bits; 1979 bits &= common_bits; 1980 } 1981 // If length is 2 and common bits has only one zero in it then 1982 // our mask and compare instruction will determine definitely 1983 // whether we have a match at this character position. Otherwise 1984 // it can only be an approximate check. 1985 uint32_t one_zero = (common_bits | ~char_mask); 1986 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 1987 pos->determines_perfectly = true; 1988 } 1989 pos->mask = common_bits; 1990 pos->value = bits; 1991 } 1992 } else { 1993 // Don't ignore case. Nice simple case where the mask-compare will 1994 // determine definitely whether we have a match at this character 1995 // position. 1996 pos->mask = char_mask; 1997 pos->value = c; 1998 pos->determines_perfectly = true; 1999 } 2000 characters_filled_in++; 2001 ASSERT(characters_filled_in <= details->characters()); 2002 if (characters_filled_in == details->characters()) { 2003 return; 2004 } 2005 } 2006 } else { 2007 QuickCheckDetails::Position* pos = 2008 details->positions(characters_filled_in); 2009 RegExpCharacterClass* tree = elm.data.u_char_class; 2010 ZoneList<CharacterRange>* ranges = tree->ranges(); 2011 if (tree->is_negated()) { 2012 // A quick check uses multi-character mask and compare. There is no 2013 // useful way to incorporate a negative char class into this scheme 2014 // so we just conservatively create a mask and value that will always 2015 // succeed. 2016 pos->mask = 0; 2017 pos->value = 0; 2018 } else { 2019 int first_range = 0; 2020 while (ranges->at(first_range).from() > char_mask) { 2021 first_range++; 2022 if (first_range == ranges->length()) { 2023 details->set_cannot_match(); 2024 pos->determines_perfectly = false; 2025 return; 2026 } 2027 } 2028 CharacterRange range = ranges->at(first_range); 2029 uc16 from = range.from(); 2030 uc16 to = range.to(); 2031 if (to > char_mask) { 2032 to = char_mask; 2033 } 2034 uint32_t differing_bits = (from ^ to); 2035 // A mask and compare is only perfect if the differing bits form a 2036 // number like 00011111 with one single block of trailing 1s. 2037 if ((differing_bits & (differing_bits + 1)) == 0 && 2038 from + differing_bits == to) { 2039 pos->determines_perfectly = true; 2040 } 2041 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2042 uint32_t bits = (from & common_bits); 2043 for (int i = first_range + 1; i < ranges->length(); i++) { 2044 CharacterRange range = ranges->at(i); 2045 uc16 from = range.from(); 2046 uc16 to = range.to(); 2047 if (from > char_mask) continue; 2048 if (to > char_mask) to = char_mask; 2049 // Here we are combining more ranges into the mask and compare 2050 // value. With each new range the mask becomes more sparse and 2051 // so the chances of a false positive rise. A character class 2052 // with multiple ranges is assumed never to be equivalent to a 2053 // mask and compare operation. 2054 pos->determines_perfectly = false; 2055 uint32_t new_common_bits = (from ^ to); 2056 new_common_bits = ~SmearBitsRight(new_common_bits); 2057 common_bits &= new_common_bits; 2058 bits &= new_common_bits; 2059 uint32_t differing_bits = (from & common_bits) ^ bits; 2060 common_bits ^= differing_bits; 2061 bits &= common_bits; 2062 } 2063 pos->mask = common_bits; 2064 pos->value = bits; 2065 } 2066 characters_filled_in++; 2067 ASSERT(characters_filled_in <= details->characters()); 2068 if (characters_filled_in == details->characters()) { 2069 return; 2070 } 2071 } 2072 } 2073 ASSERT(characters_filled_in != details->characters()); 2074 on_success()-> GetQuickCheckDetails(details, 2075 compiler, 2076 characters_filled_in, 2077 true); 2078 } 2079 2080 2081 void QuickCheckDetails::Clear() { 2082 for (int i = 0; i < characters_; i++) { 2083 positions_[i].mask = 0; 2084 positions_[i].value = 0; 2085 positions_[i].determines_perfectly = false; 2086 } 2087 characters_ = 0; 2088 } 2089 2090 2091 void QuickCheckDetails::Advance(int by, bool ascii) { 2092 ASSERT(by >= 0); 2093 if (by >= characters_) { 2094 Clear(); 2095 return; 2096 } 2097 for (int i = 0; i < characters_ - by; i++) { 2098 positions_[i] = positions_[by + i]; 2099 } 2100 for (int i = characters_ - by; i < characters_; i++) { 2101 positions_[i].mask = 0; 2102 positions_[i].value = 0; 2103 positions_[i].determines_perfectly = false; 2104 } 2105 characters_ -= by; 2106 // We could change mask_ and value_ here but we would never advance unless 2107 // they had already been used in a check and they won't be used again because 2108 // it would gain us nothing. So there's no point. 2109 } 2110 2111 2112 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2113 ASSERT(characters_ == other->characters_); 2114 if (other->cannot_match_) { 2115 return; 2116 } 2117 if (cannot_match_) { 2118 *this = *other; 2119 return; 2120 } 2121 for (int i = from_index; i < characters_; i++) { 2122 QuickCheckDetails::Position* pos = positions(i); 2123 QuickCheckDetails::Position* other_pos = other->positions(i); 2124 if (pos->mask != other_pos->mask || 2125 pos->value != other_pos->value || 2126 !other_pos->determines_perfectly) { 2127 // Our mask-compare operation will be approximate unless we have the 2128 // exact same operation on both sides of the alternation. 2129 pos->determines_perfectly = false; 2130 } 2131 pos->mask &= other_pos->mask; 2132 pos->value &= pos->mask; 2133 other_pos->value &= pos->mask; 2134 uc16 differing_bits = (pos->value ^ other_pos->value); 2135 pos->mask &= ~differing_bits; 2136 pos->value &= pos->mask; 2137 } 2138 } 2139 2140 2141 class VisitMarker { 2142 public: 2143 explicit VisitMarker(NodeInfo* info) : info_(info) { 2144 ASSERT(!info->visited); 2145 info->visited = true; 2146 } 2147 ~VisitMarker() { 2148 info_->visited = false; 2149 } 2150 private: 2151 NodeInfo* info_; 2152 }; 2153 2154 2155 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2156 RegExpCompiler* compiler, 2157 int characters_filled_in, 2158 bool not_at_start) { 2159 if (body_can_be_zero_length_ || info()->visited) return; 2160 VisitMarker marker(info()); 2161 return ChoiceNode::GetQuickCheckDetails(details, 2162 compiler, 2163 characters_filled_in, 2164 not_at_start); 2165 } 2166 2167 2168 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2169 RegExpCompiler* compiler, 2170 int characters_filled_in, 2171 bool not_at_start) { 2172 not_at_start = (not_at_start || not_at_start_); 2173 int choice_count = alternatives_->length(); 2174 ASSERT(choice_count > 0); 2175 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2176 compiler, 2177 characters_filled_in, 2178 not_at_start); 2179 for (int i = 1; i < choice_count; i++) { 2180 QuickCheckDetails new_details(details->characters()); 2181 RegExpNode* node = alternatives_->at(i).node(); 2182 node->GetQuickCheckDetails(&new_details, compiler, 2183 characters_filled_in, 2184 not_at_start); 2185 // Here we merge the quick match details of the two branches. 2186 details->Merge(&new_details, characters_filled_in); 2187 } 2188 } 2189 2190 2191 // Check for [0-9A-Z_a-z]. 2192 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2193 Label* word, 2194 Label* non_word, 2195 bool fall_through_on_word) { 2196 if (assembler->CheckSpecialCharacterClass( 2197 fall_through_on_word ? 'w' : 'W', 2198 fall_through_on_word ? non_word : word)) { 2199 // Optimized implementation available. 2200 return; 2201 } 2202 assembler->CheckCharacterGT('z', non_word); 2203 assembler->CheckCharacterLT('0', non_word); 2204 assembler->CheckCharacterGT('a' - 1, word); 2205 assembler->CheckCharacterLT('9' + 1, word); 2206 assembler->CheckCharacterLT('A', non_word); 2207 assembler->CheckCharacterLT('Z' + 1, word); 2208 if (fall_through_on_word) { 2209 assembler->CheckNotCharacter('_', non_word); 2210 } else { 2211 assembler->CheckCharacter('_', word); 2212 } 2213 } 2214 2215 2216 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2217 // that matches newline or the start of input). 2218 static void EmitHat(RegExpCompiler* compiler, 2219 RegExpNode* on_success, 2220 Trace* trace) { 2221 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2222 // We will be loading the previous character into the current character 2223 // register. 2224 Trace new_trace(*trace); 2225 new_trace.InvalidateCurrentCharacter(); 2226 2227 Label ok; 2228 if (new_trace.cp_offset() == 0) { 2229 // The start of input counts as a newline in this context, so skip to 2230 // ok if we are at the start. 2231 assembler->CheckAtStart(&ok); 2232 } 2233 // We already checked that we are not at the start of input so it must be 2234 // OK to load the previous character. 2235 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 2236 new_trace.backtrack(), 2237 false); 2238 if (!assembler->CheckSpecialCharacterClass('n', 2239 new_trace.backtrack())) { 2240 // Newline means \n, \r, 0x2028 or 0x2029. 2241 if (!compiler->ascii()) { 2242 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2243 } 2244 assembler->CheckCharacter('\n', &ok); 2245 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2246 } 2247 assembler->Bind(&ok); 2248 on_success->Emit(compiler, &new_trace); 2249 } 2250 2251 2252 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) 2253 // when we know whether the next character must be a word character or not. 2254 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, 2255 RegExpCompiler* compiler, 2256 RegExpNode* on_success, 2257 Trace* trace) { 2258 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2259 Label done; 2260 2261 Trace new_trace(*trace); 2262 2263 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); 2264 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); 2265 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; 2266 2267 // Check whether previous character was a word character. 2268 switch (trace->at_start()) { 2269 case Trace::TRUE: 2270 if (expect_word_character) { 2271 assembler->GoTo(on_non_word); 2272 } 2273 break; 2274 case Trace::UNKNOWN: 2275 ASSERT_EQ(0, trace->cp_offset()); 2276 assembler->CheckAtStart(on_non_word); 2277 // Fall through. 2278 case Trace::FALSE: 2279 int prev_char_offset = trace->cp_offset() - 1; 2280 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); 2281 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); 2282 // We may or may not have loaded the previous character. 2283 new_trace.InvalidateCurrentCharacter(); 2284 } 2285 2286 assembler->Bind(&done); 2287 2288 on_success->Emit(compiler, &new_trace); 2289 } 2290 2291 2292 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2293 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2294 RegExpCompiler* compiler, 2295 RegExpNode* on_success, 2296 Trace* trace) { 2297 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2298 Label before_non_word; 2299 Label before_word; 2300 if (trace->characters_preloaded() != 1) { 2301 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2302 } 2303 // Fall through on non-word. 2304 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2305 2306 // We will be loading the previous character into the current character 2307 // register. 2308 Trace new_trace(*trace); 2309 new_trace.InvalidateCurrentCharacter(); 2310 2311 Label ok; 2312 Label* boundary; 2313 Label* not_boundary; 2314 if (type == AssertionNode::AT_BOUNDARY) { 2315 boundary = &ok; 2316 not_boundary = new_trace.backtrack(); 2317 } else { 2318 not_boundary = &ok; 2319 boundary = new_trace.backtrack(); 2320 } 2321 2322 // Next character is not a word character. 2323 assembler->Bind(&before_non_word); 2324 if (new_trace.cp_offset() == 0) { 2325 // The start of input counts as a non-word character, so the question is 2326 // decided if we are at the start. 2327 assembler->CheckAtStart(not_boundary); 2328 } 2329 // We already checked that we are not at the start of input so it must be 2330 // OK to load the previous character. 2331 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2332 &ok, // Unused dummy label in this call. 2333 false); 2334 // Fall through on non-word. 2335 EmitWordCheck(assembler, boundary, not_boundary, false); 2336 assembler->GoTo(not_boundary); 2337 2338 // Next character is a word character. 2339 assembler->Bind(&before_word); 2340 if (new_trace.cp_offset() == 0) { 2341 // The start of input counts as a non-word character, so the question is 2342 // decided if we are at the start. 2343 assembler->CheckAtStart(boundary); 2344 } 2345 // We already checked that we are not at the start of input so it must be 2346 // OK to load the previous character. 2347 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2348 &ok, // Unused dummy label in this call. 2349 false); 2350 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); 2351 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); 2352 2353 assembler->Bind(&ok); 2354 2355 on_success->Emit(compiler, &new_trace); 2356 } 2357 2358 2359 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2360 RegExpCompiler* compiler, 2361 int filled_in, 2362 bool not_at_start) { 2363 if (type_ == AT_START && not_at_start) { 2364 details->set_cannot_match(); 2365 return; 2366 } 2367 return on_success()->GetQuickCheckDetails(details, 2368 compiler, 2369 filled_in, 2370 not_at_start); 2371 } 2372 2373 2374 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2375 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2376 switch (type_) { 2377 case AT_END: { 2378 Label ok; 2379 assembler->CheckPosition(trace->cp_offset(), &ok); 2380 assembler->GoTo(trace->backtrack()); 2381 assembler->Bind(&ok); 2382 break; 2383 } 2384 case AT_START: { 2385 if (trace->at_start() == Trace::FALSE) { 2386 assembler->GoTo(trace->backtrack()); 2387 return; 2388 } 2389 if (trace->at_start() == Trace::UNKNOWN) { 2390 assembler->CheckNotAtStart(trace->backtrack()); 2391 Trace at_start_trace = *trace; 2392 at_start_trace.set_at_start(true); 2393 on_success()->Emit(compiler, &at_start_trace); 2394 return; 2395 } 2396 } 2397 break; 2398 case AFTER_NEWLINE: 2399 EmitHat(compiler, on_success(), trace); 2400 return; 2401 case AT_BOUNDARY: 2402 case AT_NON_BOUNDARY: { 2403 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2404 return; 2405 } 2406 case AFTER_WORD_CHARACTER: 2407 case AFTER_NONWORD_CHARACTER: { 2408 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); 2409 } 2410 } 2411 on_success()->Emit(compiler, trace); 2412 } 2413 2414 2415 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2416 if (quick_check == NULL) return false; 2417 if (offset >= quick_check->characters()) return false; 2418 return quick_check->positions(offset)->determines_perfectly; 2419 } 2420 2421 2422 static void UpdateBoundsCheck(int index, int* checked_up_to) { 2423 if (index > *checked_up_to) { 2424 *checked_up_to = index; 2425 } 2426 } 2427 2428 2429 // We call this repeatedly to generate code for each pass over the text node. 2430 // The passes are in increasing order of difficulty because we hope one 2431 // of the first passes will fail in which case we are saved the work of the 2432 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2433 // we will check the '%' in the first pass, the case independent 'a' in the 2434 // second pass and the character class in the last pass. 2435 // 2436 // The passes are done from right to left, so for example to test for /bar/ 2437 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2438 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2439 // bounds check most of the time. In the example we only need to check for 2440 // end-of-input when loading the putative 'r'. 2441 // 2442 // A slight complication involves the fact that the first character may already 2443 // be fetched into a register by the previous node. In this case we want to 2444 // do the test for that character first. We do this in separate passes. The 2445 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 2446 // pass has been performed then subsequent passes will have true in 2447 // first_element_checked to indicate that that character does not need to be 2448 // checked again. 2449 // 2450 // In addition to all this we are passed a Trace, which can 2451 // contain an AlternativeGeneration object. In this AlternativeGeneration 2452 // object we can see details of any quick check that was already passed in 2453 // order to get to the code we are now generating. The quick check can involve 2454 // loading characters, which means we do not need to recheck the bounds 2455 // up to the limit the quick check already checked. In addition the quick 2456 // check can have involved a mask and compare operation which may simplify 2457 // or obviate the need for further checks at some character positions. 2458 void TextNode::TextEmitPass(RegExpCompiler* compiler, 2459 TextEmitPassType pass, 2460 bool preloaded, 2461 Trace* trace, 2462 bool first_element_checked, 2463 int* checked_up_to) { 2464 Isolate* isolate = Isolate::Current(); 2465 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2466 bool ascii = compiler->ascii(); 2467 Label* backtrack = trace->backtrack(); 2468 QuickCheckDetails* quick_check = trace->quick_check_performed(); 2469 int element_count = elms_->length(); 2470 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2471 TextElement elm = elms_->at(i); 2472 int cp_offset = trace->cp_offset() + elm.cp_offset; 2473 if (elm.type == TextElement::ATOM) { 2474 Vector<const uc16> quarks = elm.data.u_atom->data(); 2475 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2476 if (first_element_checked && i == 0 && j == 0) continue; 2477 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; 2478 EmitCharacterFunction* emit_function = NULL; 2479 switch (pass) { 2480 case NON_ASCII_MATCH: 2481 ASSERT(ascii); 2482 if (quarks[j] > String::kMaxAsciiCharCode) { 2483 assembler->GoTo(backtrack); 2484 return; 2485 } 2486 break; 2487 case NON_LETTER_CHARACTER_MATCH: 2488 emit_function = &EmitAtomNonLetter; 2489 break; 2490 case SIMPLE_CHARACTER_MATCH: 2491 emit_function = &EmitSimpleCharacter; 2492 break; 2493 case CASE_CHARACTER_MATCH: 2494 emit_function = &EmitAtomLetter; 2495 break; 2496 default: 2497 break; 2498 } 2499 if (emit_function != NULL) { 2500 bool bound_checked = emit_function(isolate, 2501 compiler, 2502 quarks[j], 2503 backtrack, 2504 cp_offset + j, 2505 *checked_up_to < cp_offset + j, 2506 preloaded); 2507 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 2508 } 2509 } 2510 } else { 2511 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); 2512 if (pass == CHARACTER_CLASS_MATCH) { 2513 if (first_element_checked && i == 0) continue; 2514 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; 2515 RegExpCharacterClass* cc = elm.data.u_char_class; 2516 EmitCharClass(assembler, 2517 cc, 2518 ascii, 2519 backtrack, 2520 cp_offset, 2521 *checked_up_to < cp_offset, 2522 preloaded); 2523 UpdateBoundsCheck(cp_offset, checked_up_to); 2524 } 2525 } 2526 } 2527 } 2528 2529 2530 int TextNode::Length() { 2531 TextElement elm = elms_->last(); 2532 ASSERT(elm.cp_offset >= 0); 2533 if (elm.type == TextElement::ATOM) { 2534 return elm.cp_offset + elm.data.u_atom->data().length(); 2535 } else { 2536 return elm.cp_offset + 1; 2537 } 2538 } 2539 2540 2541 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 2542 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 2543 if (ignore_case) { 2544 return pass == SIMPLE_CHARACTER_MATCH; 2545 } else { 2546 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 2547 } 2548 } 2549 2550 2551 // This generates the code to match a text node. A text node can contain 2552 // straight character sequences (possibly to be matched in a case-independent 2553 // way) and character classes. For efficiency we do not do this in a single 2554 // pass from left to right. Instead we pass over the text node several times, 2555 // emitting code for some character positions every time. See the comment on 2556 // TextEmitPass for details. 2557 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2558 LimitResult limit_result = LimitVersions(compiler, trace); 2559 if (limit_result == DONE) return; 2560 ASSERT(limit_result == CONTINUE); 2561 2562 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2563 compiler->SetRegExpTooBig(); 2564 return; 2565 } 2566 2567 if (compiler->ascii()) { 2568 int dummy = 0; 2569 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 2570 } 2571 2572 bool first_elt_done = false; 2573 int bound_checked_to = trace->cp_offset() - 1; 2574 bound_checked_to += trace->bound_checked_up_to(); 2575 2576 // If a character is preloaded into the current character register then 2577 // check that now. 2578 if (trace->characters_preloaded() == 1) { 2579 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2580 if (!SkipPass(pass, compiler->ignore_case())) { 2581 TextEmitPass(compiler, 2582 static_cast<TextEmitPassType>(pass), 2583 true, 2584 trace, 2585 false, 2586 &bound_checked_to); 2587 } 2588 } 2589 first_elt_done = true; 2590 } 2591 2592 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2593 if (!SkipPass(pass, compiler->ignore_case())) { 2594 TextEmitPass(compiler, 2595 static_cast<TextEmitPassType>(pass), 2596 false, 2597 trace, 2598 first_elt_done, 2599 &bound_checked_to); 2600 } 2601 } 2602 2603 Trace successor_trace(*trace); 2604 successor_trace.set_at_start(false); 2605 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 2606 RecursionCheck rc(compiler); 2607 on_success()->Emit(compiler, &successor_trace); 2608 } 2609 2610 2611 void Trace::InvalidateCurrentCharacter() { 2612 characters_preloaded_ = 0; 2613 } 2614 2615 2616 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 2617 ASSERT(by > 0); 2618 // We don't have an instruction for shifting the current character register 2619 // down or for using a shifted value for anything so lets just forget that 2620 // we preloaded any characters into it. 2621 characters_preloaded_ = 0; 2622 // Adjust the offsets of the quick check performed information. This 2623 // information is used to find out what we already determined about the 2624 // characters by means of mask and compare. 2625 quick_check_performed_.Advance(by, compiler->ascii()); 2626 cp_offset_ += by; 2627 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2628 compiler->SetRegExpTooBig(); 2629 cp_offset_ = 0; 2630 } 2631 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 2632 } 2633 2634 2635 void TextNode::MakeCaseIndependent(bool is_ascii) { 2636 int element_count = elms_->length(); 2637 for (int i = 0; i < element_count; i++) { 2638 TextElement elm = elms_->at(i); 2639 if (elm.type == TextElement::CHAR_CLASS) { 2640 RegExpCharacterClass* cc = elm.data.u_char_class; 2641 // None of the standard character classes is different in the case 2642 // independent case and it slows us down if we don't know that. 2643 if (cc->is_standard()) continue; 2644 ZoneList<CharacterRange>* ranges = cc->ranges(); 2645 int range_count = ranges->length(); 2646 for (int j = 0; j < range_count; j++) { 2647 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); 2648 } 2649 } 2650 } 2651 } 2652 2653 2654 int TextNode::GreedyLoopTextLength() { 2655 TextElement elm = elms_->at(elms_->length() - 1); 2656 if (elm.type == TextElement::CHAR_CLASS) { 2657 return elm.cp_offset + 1; 2658 } else { 2659 return elm.cp_offset + elm.data.u_atom->data().length(); 2660 } 2661 } 2662 2663 2664 // Finds the fixed match length of a sequence of nodes that goes from 2665 // this alternative and back to this choice node. If there are variable 2666 // length nodes or other complications in the way then return a sentinel 2667 // value indicating that a greedy loop cannot be constructed. 2668 int ChoiceNode::GreedyLoopTextLengthForAlternative( 2669 GuardedAlternative* alternative) { 2670 int length = 0; 2671 RegExpNode* node = alternative->node(); 2672 // Later we will generate code for all these text nodes using recursion 2673 // so we have to limit the max number. 2674 int recursion_depth = 0; 2675 while (node != this) { 2676 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 2677 return kNodeIsTooComplexForGreedyLoops; 2678 } 2679 int node_length = node->GreedyLoopTextLength(); 2680 if (node_length == kNodeIsTooComplexForGreedyLoops) { 2681 return kNodeIsTooComplexForGreedyLoops; 2682 } 2683 length += node_length; 2684 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 2685 node = seq_node->on_success(); 2686 } 2687 return length; 2688 } 2689 2690 2691 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 2692 ASSERT_EQ(loop_node_, NULL); 2693 AddAlternative(alt); 2694 loop_node_ = alt.node(); 2695 } 2696 2697 2698 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 2699 ASSERT_EQ(continue_node_, NULL); 2700 AddAlternative(alt); 2701 continue_node_ = alt.node(); 2702 } 2703 2704 2705 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2706 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2707 if (trace->stop_node() == this) { 2708 int text_length = 2709 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 2710 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 2711 // Update the counter-based backtracking info on the stack. This is an 2712 // optimization for greedy loops (see below). 2713 ASSERT(trace->cp_offset() == text_length); 2714 macro_assembler->AdvanceCurrentPosition(text_length); 2715 macro_assembler->GoTo(trace->loop_label()); 2716 return; 2717 } 2718 ASSERT(trace->stop_node() == NULL); 2719 if (!trace->is_trivial()) { 2720 trace->Flush(compiler, this); 2721 return; 2722 } 2723 ChoiceNode::Emit(compiler, trace); 2724 } 2725 2726 2727 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 2728 bool not_at_start) { 2729 int preload_characters = EatsAtLeast(4, 0, not_at_start); 2730 if (compiler->macro_assembler()->CanReadUnaligned()) { 2731 bool ascii = compiler->ascii(); 2732 if (ascii) { 2733 if (preload_characters > 4) preload_characters = 4; 2734 // We can't preload 3 characters because there is no machine instruction 2735 // to do that. We can't just load 4 because we could be reading 2736 // beyond the end of the string, which could cause a memory fault. 2737 if (preload_characters == 3) preload_characters = 2; 2738 } else { 2739 if (preload_characters > 2) preload_characters = 2; 2740 } 2741 } else { 2742 if (preload_characters > 1) preload_characters = 1; 2743 } 2744 return preload_characters; 2745 } 2746 2747 2748 // This class is used when generating the alternatives in a choice node. It 2749 // records the way the alternative is being code generated. 2750 class AlternativeGeneration: public Malloced { 2751 public: 2752 AlternativeGeneration() 2753 : possible_success(), 2754 expects_preload(false), 2755 after(), 2756 quick_check_details() { } 2757 Label possible_success; 2758 bool expects_preload; 2759 Label after; 2760 QuickCheckDetails quick_check_details; 2761 }; 2762 2763 2764 // Creates a list of AlternativeGenerations. If the list has a reasonable 2765 // size then it is on the stack, otherwise the excess is on the heap. 2766 class AlternativeGenerationList { 2767 public: 2768 explicit AlternativeGenerationList(int count) 2769 : alt_gens_(count) { 2770 for (int i = 0; i < count && i < kAFew; i++) { 2771 alt_gens_.Add(a_few_alt_gens_ + i); 2772 } 2773 for (int i = kAFew; i < count; i++) { 2774 alt_gens_.Add(new AlternativeGeneration()); 2775 } 2776 } 2777 ~AlternativeGenerationList() { 2778 for (int i = kAFew; i < alt_gens_.length(); i++) { 2779 delete alt_gens_[i]; 2780 alt_gens_[i] = NULL; 2781 } 2782 } 2783 2784 AlternativeGeneration* at(int i) { 2785 return alt_gens_[i]; 2786 } 2787 2788 private: 2789 static const int kAFew = 10; 2790 ZoneList<AlternativeGeneration*> alt_gens_; 2791 AlternativeGeneration a_few_alt_gens_[kAFew]; 2792 }; 2793 2794 2795 /* Code generation for choice nodes. 2796 * 2797 * We generate quick checks that do a mask and compare to eliminate a 2798 * choice. If the quick check succeeds then it jumps to the continuation to 2799 * do slow checks and check subsequent nodes. If it fails (the common case) 2800 * it falls through to the next choice. 2801 * 2802 * Here is the desired flow graph. Nodes directly below each other imply 2803 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 2804 * 3 doesn't have a quick check so we have to call the slow check. 2805 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 2806 * regexp continuation is generated directly after the Sn node, up to the 2807 * next GoTo if we decide to reuse some already generated code. Some 2808 * nodes expect preload_characters to be preloaded into the current 2809 * character register. R nodes do this preloading. Vertices are marked 2810 * F for failures and S for success (possible success in the case of quick 2811 * nodes). L, V, < and > are used as arrow heads. 2812 * 2813 * ----------> R 2814 * | 2815 * V 2816 * Q1 -----> S1 2817 * | S / 2818 * F| / 2819 * | F/ 2820 * | / 2821 * | R 2822 * | / 2823 * V L 2824 * Q2 -----> S2 2825 * | S / 2826 * F| / 2827 * | F/ 2828 * | / 2829 * | R 2830 * | / 2831 * V L 2832 * S3 2833 * | 2834 * F| 2835 * | 2836 * R 2837 * | 2838 * backtrack V 2839 * <----------Q4 2840 * \ F | 2841 * \ |S 2842 * \ F V 2843 * \-----S4 2844 * 2845 * For greedy loops we reverse our expectation and expect to match rather 2846 * than fail. Therefore we want the loop code to look like this (U is the 2847 * unwind code that steps back in the greedy loop). The following alternatives 2848 * look the same as above. 2849 * _____ 2850 * / \ 2851 * V | 2852 * ----------> S1 | 2853 * /| | 2854 * / |S | 2855 * F/ \_____/ 2856 * / 2857 * |<----------- 2858 * | \ 2859 * V \ 2860 * Q2 ---> S2 \ 2861 * | S / | 2862 * F| / | 2863 * | F/ | 2864 * | / | 2865 * | R | 2866 * | / | 2867 * F VL | 2868 * <------U | 2869 * back |S | 2870 * \______________/ 2871 */ 2872 2873 2874 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2875 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2876 int choice_count = alternatives_->length(); 2877 #ifdef DEBUG 2878 for (int i = 0; i < choice_count - 1; i++) { 2879 GuardedAlternative alternative = alternatives_->at(i); 2880 ZoneList<Guard*>* guards = alternative.guards(); 2881 int guard_count = (guards == NULL) ? 0 : guards->length(); 2882 for (int j = 0; j < guard_count; j++) { 2883 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 2884 } 2885 } 2886 #endif 2887 2888 LimitResult limit_result = LimitVersions(compiler, trace); 2889 if (limit_result == DONE) return; 2890 ASSERT(limit_result == CONTINUE); 2891 2892 int new_flush_budget = trace->flush_budget() / choice_count; 2893 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 2894 trace->Flush(compiler, this); 2895 return; 2896 } 2897 2898 RecursionCheck rc(compiler); 2899 2900 Trace* current_trace = trace; 2901 2902 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 2903 bool greedy_loop = false; 2904 Label greedy_loop_label; 2905 Trace counter_backtrack_trace; 2906 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 2907 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 2908 2909 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 2910 // Here we have special handling for greedy loops containing only text nodes 2911 // and other simple nodes. These are handled by pushing the current 2912 // position on the stack and then incrementing the current position each 2913 // time around the switch. On backtrack we decrement the current position 2914 // and check it against the pushed value. This avoids pushing backtrack 2915 // information for each iteration of the loop, which could take up a lot of 2916 // space. 2917 greedy_loop = true; 2918 ASSERT(trace->stop_node() == NULL); 2919 macro_assembler->PushCurrentPosition(); 2920 current_trace = &counter_backtrack_trace; 2921 Label greedy_match_failed; 2922 Trace greedy_match_trace; 2923 if (not_at_start()) greedy_match_trace.set_at_start(false); 2924 greedy_match_trace.set_backtrack(&greedy_match_failed); 2925 Label loop_label; 2926 macro_assembler->Bind(&loop_label); 2927 greedy_match_trace.set_stop_node(this); 2928 greedy_match_trace.set_loop_label(&loop_label); 2929 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 2930 macro_assembler->Bind(&greedy_match_failed); 2931 } 2932 2933 Label second_choice; // For use in greedy matches. 2934 macro_assembler->Bind(&second_choice); 2935 2936 int first_normal_choice = greedy_loop ? 1 : 0; 2937 2938 int preload_characters = 2939 CalculatePreloadCharacters(compiler, 2940 current_trace->at_start() == Trace::FALSE); 2941 bool preload_is_current = 2942 (current_trace->characters_preloaded() == preload_characters); 2943 bool preload_has_checked_bounds = preload_is_current; 2944 2945 AlternativeGenerationList alt_gens(choice_count); 2946 2947 // For now we just call all choices one after the other. The idea ultimately 2948 // is to use the Dispatch table to try only the relevant ones. 2949 for (int i = first_normal_choice; i < choice_count; i++) { 2950 GuardedAlternative alternative = alternatives_->at(i); 2951 AlternativeGeneration* alt_gen = alt_gens.at(i); 2952 alt_gen->quick_check_details.set_characters(preload_characters); 2953 ZoneList<Guard*>* guards = alternative.guards(); 2954 int guard_count = (guards == NULL) ? 0 : guards->length(); 2955 Trace new_trace(*current_trace); 2956 new_trace.set_characters_preloaded(preload_is_current ? 2957 preload_characters : 2958 0); 2959 if (preload_has_checked_bounds) { 2960 new_trace.set_bound_checked_up_to(preload_characters); 2961 } 2962 new_trace.quick_check_performed()->Clear(); 2963 if (not_at_start_) new_trace.set_at_start(Trace::FALSE); 2964 alt_gen->expects_preload = preload_is_current; 2965 bool generate_full_check_inline = false; 2966 if (FLAG_regexp_optimization && 2967 try_to_emit_quick_check_for_alternative(i) && 2968 alternative.node()->EmitQuickCheck(compiler, 2969 &new_trace, 2970 preload_has_checked_bounds, 2971 &alt_gen->possible_success, 2972 &alt_gen->quick_check_details, 2973 i < choice_count - 1)) { 2974 // Quick check was generated for this choice. 2975 preload_is_current = true; 2976 preload_has_checked_bounds = true; 2977 // On the last choice in the ChoiceNode we generated the quick 2978 // check to fall through on possible success. So now we need to 2979 // generate the full check inline. 2980 if (i == choice_count - 1) { 2981 macro_assembler->Bind(&alt_gen->possible_success); 2982 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 2983 new_trace.set_characters_preloaded(preload_characters); 2984 new_trace.set_bound_checked_up_to(preload_characters); 2985 generate_full_check_inline = true; 2986 } 2987 } else if (alt_gen->quick_check_details.cannot_match()) { 2988 if (i == choice_count - 1 && !greedy_loop) { 2989 macro_assembler->GoTo(trace->backtrack()); 2990 } 2991 continue; 2992 } else { 2993 // No quick check was generated. Put the full code here. 2994 // If this is not the first choice then there could be slow checks from 2995 // previous cases that go here when they fail. There's no reason to 2996 // insist that they preload characters since the slow check we are about 2997 // to generate probably can't use it. 2998 if (i != first_normal_choice) { 2999 alt_gen->expects_preload = false; 3000 new_trace.InvalidateCurrentCharacter(); 3001 } 3002 if (i < choice_count - 1) { 3003 new_trace.set_backtrack(&alt_gen->after); 3004 } 3005 generate_full_check_inline = true; 3006 } 3007 if (generate_full_check_inline) { 3008 if (new_trace.actions() != NULL) { 3009 new_trace.set_flush_budget(new_flush_budget); 3010 } 3011 for (int j = 0; j < guard_count; j++) { 3012 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 3013 } 3014 alternative.node()->Emit(compiler, &new_trace); 3015 preload_is_current = false; 3016 } 3017 macro_assembler->Bind(&alt_gen->after); 3018 } 3019 if (greedy_loop) { 3020 macro_assembler->Bind(&greedy_loop_label); 3021 // If we have unwound to the bottom then backtrack. 3022 macro_assembler->CheckGreedyLoop(trace->backtrack()); 3023 // Otherwise try the second priority at an earlier position. 3024 macro_assembler->AdvanceCurrentPosition(-text_length); 3025 macro_assembler->GoTo(&second_choice); 3026 } 3027 3028 // At this point we need to generate slow checks for the alternatives where 3029 // the quick check was inlined. We can recognize these because the associated 3030 // label was bound. 3031 for (int i = first_normal_choice; i < choice_count - 1; i++) { 3032 AlternativeGeneration* alt_gen = alt_gens.at(i); 3033 Trace new_trace(*current_trace); 3034 // If there are actions to be flushed we have to limit how many times 3035 // they are flushed. Take the budget of the parent trace and distribute 3036 // it fairly amongst the children. 3037 if (new_trace.actions() != NULL) { 3038 new_trace.set_flush_budget(new_flush_budget); 3039 } 3040 EmitOutOfLineContinuation(compiler, 3041 &new_trace, 3042 alternatives_->at(i), 3043 alt_gen, 3044 preload_characters, 3045 alt_gens.at(i + 1)->expects_preload); 3046 } 3047 } 3048 3049 3050 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 3051 Trace* trace, 3052 GuardedAlternative alternative, 3053 AlternativeGeneration* alt_gen, 3054 int preload_characters, 3055 bool next_expects_preload) { 3056 if (!alt_gen->possible_success.is_linked()) return; 3057 3058 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3059 macro_assembler->Bind(&alt_gen->possible_success); 3060 Trace out_of_line_trace(*trace); 3061 out_of_line_trace.set_characters_preloaded(preload_characters); 3062 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3063 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE); 3064 ZoneList<Guard*>* guards = alternative.guards(); 3065 int guard_count = (guards == NULL) ? 0 : guards->length(); 3066 if (next_expects_preload) { 3067 Label reload_current_char; 3068 out_of_line_trace.set_backtrack(&reload_current_char); 3069 for (int j = 0; j < guard_count; j++) { 3070 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3071 } 3072 alternative.node()->Emit(compiler, &out_of_line_trace); 3073 macro_assembler->Bind(&reload_current_char); 3074 // Reload the current character, since the next quick check expects that. 3075 // We don't need to check bounds here because we only get into this 3076 // code through a quick check which already did the checked load. 3077 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 3078 NULL, 3079 false, 3080 preload_characters); 3081 macro_assembler->GoTo(&(alt_gen->after)); 3082 } else { 3083 out_of_line_trace.set_backtrack(&(alt_gen->after)); 3084 for (int j = 0; j < guard_count; j++) { 3085 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3086 } 3087 alternative.node()->Emit(compiler, &out_of_line_trace); 3088 } 3089 } 3090 3091 3092 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3093 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3094 LimitResult limit_result = LimitVersions(compiler, trace); 3095 if (limit_result == DONE) return; 3096 ASSERT(limit_result == CONTINUE); 3097 3098 RecursionCheck rc(compiler); 3099 3100 switch (type_) { 3101 case STORE_POSITION: { 3102 Trace::DeferredCapture 3103 new_capture(data_.u_position_register.reg, 3104 data_.u_position_register.is_capture, 3105 trace); 3106 Trace new_trace = *trace; 3107 new_trace.add_action(&new_capture); 3108 on_success()->Emit(compiler, &new_trace); 3109 break; 3110 } 3111 case INCREMENT_REGISTER: { 3112 Trace::DeferredIncrementRegister 3113 new_increment(data_.u_increment_register.reg); 3114 Trace new_trace = *trace; 3115 new_trace.add_action(&new_increment); 3116 on_success()->Emit(compiler, &new_trace); 3117 break; 3118 } 3119 case SET_REGISTER: { 3120 Trace::DeferredSetRegister 3121 new_set(data_.u_store_register.reg, data_.u_store_register.value); 3122 Trace new_trace = *trace; 3123 new_trace.add_action(&new_set); 3124 on_success()->Emit(compiler, &new_trace); 3125 break; 3126 } 3127 case CLEAR_CAPTURES: { 3128 Trace::DeferredClearCaptures 3129 new_capture(Interval(data_.u_clear_captures.range_from, 3130 data_.u_clear_captures.range_to)); 3131 Trace new_trace = *trace; 3132 new_trace.add_action(&new_capture); 3133 on_success()->Emit(compiler, &new_trace); 3134 break; 3135 } 3136 case BEGIN_SUBMATCH: 3137 if (!trace->is_trivial()) { 3138 trace->Flush(compiler, this); 3139 } else { 3140 assembler->WriteCurrentPositionToRegister( 3141 data_.u_submatch.current_position_register, 0); 3142 assembler->WriteStackPointerToRegister( 3143 data_.u_submatch.stack_pointer_register); 3144 on_success()->Emit(compiler, trace); 3145 } 3146 break; 3147 case EMPTY_MATCH_CHECK: { 3148 int start_pos_reg = data_.u_empty_match_check.start_register; 3149 int stored_pos = 0; 3150 int rep_reg = data_.u_empty_match_check.repetition_register; 3151 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 3152 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 3153 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 3154 // If we know we haven't advanced and there is no minimum we 3155 // can just backtrack immediately. 3156 assembler->GoTo(trace->backtrack()); 3157 } else if (know_dist && stored_pos < trace->cp_offset()) { 3158 // If we know we've advanced we can generate the continuation 3159 // immediately. 3160 on_success()->Emit(compiler, trace); 3161 } else if (!trace->is_trivial()) { 3162 trace->Flush(compiler, this); 3163 } else { 3164 Label skip_empty_check; 3165 // If we have a minimum number of repetitions we check the current 3166 // number first and skip the empty check if it's not enough. 3167 if (has_minimum) { 3168 int limit = data_.u_empty_match_check.repetition_limit; 3169 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 3170 } 3171 // If the match is empty we bail out, otherwise we fall through 3172 // to the on-success continuation. 3173 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 3174 trace->backtrack()); 3175 assembler->Bind(&skip_empty_check); 3176 on_success()->Emit(compiler, trace); 3177 } 3178 break; 3179 } 3180 case POSITIVE_SUBMATCH_SUCCESS: { 3181 if (!trace->is_trivial()) { 3182 trace->Flush(compiler, this); 3183 return; 3184 } 3185 assembler->ReadCurrentPositionFromRegister( 3186 data_.u_submatch.current_position_register); 3187 assembler->ReadStackPointerFromRegister( 3188 data_.u_submatch.stack_pointer_register); 3189 int clear_register_count = data_.u_submatch.clear_register_count; 3190 if (clear_register_count == 0) { 3191 on_success()->Emit(compiler, trace); 3192 return; 3193 } 3194 int clear_registers_from = data_.u_submatch.clear_register_from; 3195 Label clear_registers_backtrack; 3196 Trace new_trace = *trace; 3197 new_trace.set_backtrack(&clear_registers_backtrack); 3198 on_success()->Emit(compiler, &new_trace); 3199 3200 assembler->Bind(&clear_registers_backtrack); 3201 int clear_registers_to = clear_registers_from + clear_register_count - 1; 3202 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 3203 3204 ASSERT(trace->backtrack() == NULL); 3205 assembler->Backtrack(); 3206 return; 3207 } 3208 default: 3209 UNREACHABLE(); 3210 } 3211 } 3212 3213 3214 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3215 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3216 if (!trace->is_trivial()) { 3217 trace->Flush(compiler, this); 3218 return; 3219 } 3220 3221 LimitResult limit_result = LimitVersions(compiler, trace); 3222 if (limit_result == DONE) return; 3223 ASSERT(limit_result == CONTINUE); 3224 3225 RecursionCheck rc(compiler); 3226 3227 ASSERT_EQ(start_reg_ + 1, end_reg_); 3228 if (compiler->ignore_case()) { 3229 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 3230 trace->backtrack()); 3231 } else { 3232 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 3233 } 3234 on_success()->Emit(compiler, trace); 3235 } 3236 3237 3238 // ------------------------------------------------------------------- 3239 // Dot/dotty output 3240 3241 3242 #ifdef DEBUG 3243 3244 3245 class DotPrinter: public NodeVisitor { 3246 public: 3247 explicit DotPrinter(bool ignore_case) 3248 : ignore_case_(ignore_case), 3249 stream_(&alloc_) { } 3250 void PrintNode(const char* label, RegExpNode* node); 3251 void Visit(RegExpNode* node); 3252 void PrintAttributes(RegExpNode* from); 3253 StringStream* stream() { return &stream_; } 3254 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 3255 #define DECLARE_VISIT(Type) \ 3256 virtual void Visit##Type(Type##Node* that); 3257 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 3258 #undef DECLARE_VISIT 3259 private: 3260 bool ignore_case_; 3261 HeapStringAllocator alloc_; 3262 StringStream stream_; 3263 }; 3264 3265 3266 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 3267 stream()->Add("digraph G {\n graph [label=\""); 3268 for (int i = 0; label[i]; i++) { 3269 switch (label[i]) { 3270 case '\\': 3271 stream()->Add("\\\\"); 3272 break; 3273 case '"': 3274 stream()->Add("\""); 3275 break; 3276 default: 3277 stream()->Put(label[i]); 3278 break; 3279 } 3280 } 3281 stream()->Add("\"];\n"); 3282 Visit(node); 3283 stream()->Add("}\n"); 3284 printf("%s", *(stream()->ToCString())); 3285 } 3286 3287 3288 void DotPrinter::Visit(RegExpNode* node) { 3289 if (node->info()->visited) return; 3290 node->info()->visited = true; 3291 node->Accept(this); 3292 } 3293 3294 3295 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 3296 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 3297 Visit(on_failure); 3298 } 3299 3300 3301 class TableEntryBodyPrinter { 3302 public: 3303 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 3304 : stream_(stream), choice_(choice) { } 3305 void Call(uc16 from, DispatchTable::Entry entry) { 3306 OutSet* out_set = entry.out_set(); 3307 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3308 if (out_set->Get(i)) { 3309 stream()->Add(" n%p:s%io%i -> n%p;\n", 3310 choice(), 3311 from, 3312 i, 3313 choice()->alternatives()->at(i).node()); 3314 } 3315 } 3316 } 3317 private: 3318 StringStream* stream() { return stream_; } 3319 ChoiceNode* choice() { return choice_; } 3320 StringStream* stream_; 3321 ChoiceNode* choice_; 3322 }; 3323 3324 3325 class TableEntryHeaderPrinter { 3326 public: 3327 explicit TableEntryHeaderPrinter(StringStream* stream) 3328 : first_(true), stream_(stream) { } 3329 void Call(uc16 from, DispatchTable::Entry entry) { 3330 if (first_) { 3331 first_ = false; 3332 } else { 3333 stream()->Add("|"); 3334 } 3335 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 3336 OutSet* out_set = entry.out_set(); 3337 int priority = 0; 3338 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3339 if (out_set->Get(i)) { 3340 if (priority > 0) stream()->Add("|"); 3341 stream()->Add("<s%io%i> %i", from, i, priority); 3342 priority++; 3343 } 3344 } 3345 stream()->Add("}}"); 3346 } 3347 3348 private: 3349 bool first_; 3350 StringStream* stream() { return stream_; } 3351 StringStream* stream_; 3352 }; 3353 3354 3355 class AttributePrinter { 3356 public: 3357 explicit AttributePrinter(DotPrinter* out) 3358 : out_(out), first_(true) { } 3359 void PrintSeparator() { 3360 if (first_) { 3361 first_ = false; 3362 } else { 3363 out_->stream()->Add("|"); 3364 } 3365 } 3366 void PrintBit(const char* name, bool value) { 3367 if (!value) return; 3368 PrintSeparator(); 3369 out_->stream()->Add("{%s}", name); 3370 } 3371 void PrintPositive(const char* name, int value) { 3372 if (value < 0) return; 3373 PrintSeparator(); 3374 out_->stream()->Add("{%s|%x}", name, value); 3375 } 3376 private: 3377 DotPrinter* out_; 3378 bool first_; 3379 }; 3380 3381 3382 void DotPrinter::PrintAttributes(RegExpNode* that) { 3383 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 3384 "margin=0.1, fontsize=10, label=\"{", 3385 that); 3386 AttributePrinter printer(this); 3387 NodeInfo* info = that->info(); 3388 printer.PrintBit("NI", info->follows_newline_interest); 3389 printer.PrintBit("WI", info->follows_word_interest); 3390 printer.PrintBit("SI", info->follows_start_interest); 3391 Label* label = that->label(); 3392 if (label->is_bound()) 3393 printer.PrintPositive("@", label->pos()); 3394 stream()->Add("}\"];\n"); 3395 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 3396 "arrowhead=none];\n", that, that); 3397 } 3398 3399 3400 static const bool kPrintDispatchTable = false; 3401 void DotPrinter::VisitChoice(ChoiceNode* that) { 3402 if (kPrintDispatchTable) { 3403 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 3404 TableEntryHeaderPrinter header_printer(stream()); 3405 that->GetTable(ignore_case_)->ForEach(&header_printer); 3406 stream()->Add("\"]\n", that); 3407 PrintAttributes(that); 3408 TableEntryBodyPrinter body_printer(stream(), that); 3409 that->GetTable(ignore_case_)->ForEach(&body_printer); 3410 } else { 3411 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 3412 for (int i = 0; i < that->alternatives()->length(); i++) { 3413 GuardedAlternative alt = that->alternatives()->at(i); 3414 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 3415 } 3416 } 3417 for (int i = 0; i < that->alternatives()->length(); i++) { 3418 GuardedAlternative alt = that->alternatives()->at(i); 3419 alt.node()->Accept(this); 3420 } 3421 } 3422 3423 3424 void DotPrinter::VisitText(TextNode* that) { 3425 stream()->Add(" n%p [label=\"", that); 3426 for (int i = 0; i < that->elements()->length(); i++) { 3427 if (i > 0) stream()->Add(" "); 3428 TextElement elm = that->elements()->at(i); 3429 switch (elm.type) { 3430 case TextElement::ATOM: { 3431 stream()->Add("'%w'", elm.data.u_atom->data()); 3432 break; 3433 } 3434 case TextElement::CHAR_CLASS: { 3435 RegExpCharacterClass* node = elm.data.u_char_class; 3436 stream()->Add("["); 3437 if (node->is_negated()) 3438 stream()->Add("^"); 3439 for (int j = 0; j < node->ranges()->length(); j++) { 3440 CharacterRange range = node->ranges()->at(j); 3441 stream()->Add("%k-%k", range.from(), range.to()); 3442 } 3443 stream()->Add("]"); 3444 break; 3445 } 3446 default: 3447 UNREACHABLE(); 3448 } 3449 } 3450 stream()->Add("\", shape=box, peripheries=2];\n"); 3451 PrintAttributes(that); 3452 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3453 Visit(that->on_success()); 3454 } 3455 3456 3457 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 3458 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 3459 that, 3460 that->start_register(), 3461 that->end_register()); 3462 PrintAttributes(that); 3463 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3464 Visit(that->on_success()); 3465 } 3466 3467 3468 void DotPrinter::VisitEnd(EndNode* that) { 3469 stream()->Add(" n%p [style=bold, shape=point];\n", that); 3470 PrintAttributes(that); 3471 } 3472 3473 3474 void DotPrinter::VisitAssertion(AssertionNode* that) { 3475 stream()->Add(" n%p [", that); 3476 switch (that->type()) { 3477 case AssertionNode::AT_END: 3478 stream()->Add("label=\"$\", shape=septagon"); 3479 break; 3480 case AssertionNode::AT_START: 3481 stream()->Add("label=\"^\", shape=septagon"); 3482 break; 3483 case AssertionNode::AT_BOUNDARY: 3484 stream()->Add("label=\"\\b\", shape=septagon"); 3485 break; 3486 case AssertionNode::AT_NON_BOUNDARY: 3487 stream()->Add("label=\"\\B\", shape=septagon"); 3488 break; 3489 case AssertionNode::AFTER_NEWLINE: 3490 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 3491 break; 3492 case AssertionNode::AFTER_WORD_CHARACTER: 3493 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); 3494 break; 3495 case AssertionNode::AFTER_NONWORD_CHARACTER: 3496 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); 3497 break; 3498 } 3499 stream()->Add("];\n"); 3500 PrintAttributes(that); 3501 RegExpNode* successor = that->on_success(); 3502 stream()->Add(" n%p -> n%p;\n", that, successor); 3503 Visit(successor); 3504 } 3505 3506 3507 void DotPrinter::VisitAction(ActionNode* that) { 3508 stream()->Add(" n%p [", that); 3509 switch (that->type_) { 3510 case ActionNode::SET_REGISTER: 3511 stream()->Add("label=\"$%i:=%i\", shape=octagon", 3512 that->data_.u_store_register.reg, 3513 that->data_.u_store_register.value); 3514 break; 3515 case ActionNode::INCREMENT_REGISTER: 3516 stream()->Add("label=\"$%i++\", shape=octagon", 3517 that->data_.u_increment_register.reg); 3518 break; 3519 case ActionNode::STORE_POSITION: 3520 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 3521 that->data_.u_position_register.reg); 3522 break; 3523 case ActionNode::BEGIN_SUBMATCH: 3524 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 3525 that->data_.u_submatch.current_position_register); 3526 break; 3527 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 3528 stream()->Add("label=\"escape\", shape=septagon"); 3529 break; 3530 case ActionNode::EMPTY_MATCH_CHECK: 3531 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 3532 that->data_.u_empty_match_check.start_register, 3533 that->data_.u_empty_match_check.repetition_register, 3534 that->data_.u_empty_match_check.repetition_limit); 3535 break; 3536 case ActionNode::CLEAR_CAPTURES: { 3537 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 3538 that->data_.u_clear_captures.range_from, 3539 that->data_.u_clear_captures.range_to); 3540 break; 3541 } 3542 } 3543 stream()->Add("];\n"); 3544 PrintAttributes(that); 3545 RegExpNode* successor = that->on_success(); 3546 stream()->Add(" n%p -> n%p;\n", that, successor); 3547 Visit(successor); 3548 } 3549 3550 3551 class DispatchTableDumper { 3552 public: 3553 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 3554 void Call(uc16 key, DispatchTable::Entry entry); 3555 StringStream* stream() { return stream_; } 3556 private: 3557 StringStream* stream_; 3558 }; 3559 3560 3561 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 3562 stream()->Add("[%k-%k]: {", key, entry.to()); 3563 OutSet* set = entry.out_set(); 3564 bool first = true; 3565 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3566 if (set->Get(i)) { 3567 if (first) { 3568 first = false; 3569 } else { 3570 stream()->Add(", "); 3571 } 3572 stream()->Add("%i", i); 3573 } 3574 } 3575 stream()->Add("}\n"); 3576 } 3577 3578 3579 void DispatchTable::Dump() { 3580 HeapStringAllocator alloc; 3581 StringStream stream(&alloc); 3582 DispatchTableDumper dumper(&stream); 3583 tree()->ForEach(&dumper); 3584 OS::PrintError("%s", *stream.ToCString()); 3585 } 3586 3587 3588 void RegExpEngine::DotPrint(const char* label, 3589 RegExpNode* node, 3590 bool ignore_case) { 3591 DotPrinter printer(ignore_case); 3592 printer.PrintNode(label, node); 3593 } 3594 3595 3596 #endif // DEBUG 3597 3598 3599 // ------------------------------------------------------------------- 3600 // Tree to graph conversion 3601 3602 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 3603 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, 3604 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF }; 3605 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); 3606 3607 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' }; 3608 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); 3609 3610 static const uc16 kDigitRanges[] = { '0', '9' }; 3611 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); 3612 3613 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D, 3614 0x2028, 0x2029 }; 3615 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); 3616 3617 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 3618 RegExpNode* on_success) { 3619 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 3620 elms->Add(TextElement::Atom(this)); 3621 return new TextNode(elms, on_success); 3622 } 3623 3624 3625 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 3626 RegExpNode* on_success) { 3627 return new TextNode(elements(), on_success); 3628 } 3629 3630 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 3631 const uc16* special_class, 3632 int length) { 3633 ASSERT(ranges->length() != 0); 3634 ASSERT(length != 0); 3635 ASSERT(special_class[0] != 0); 3636 if (ranges->length() != (length >> 1) + 1) { 3637 return false; 3638 } 3639 CharacterRange range = ranges->at(0); 3640 if (range.from() != 0) { 3641 return false; 3642 } 3643 for (int i = 0; i < length; i += 2) { 3644 if (special_class[i] != (range.to() + 1)) { 3645 return false; 3646 } 3647 range = ranges->at((i >> 1) + 1); 3648 if (special_class[i+1] != range.from() - 1) { 3649 return false; 3650 } 3651 } 3652 if (range.to() != 0xffff) { 3653 return false; 3654 } 3655 return true; 3656 } 3657 3658 3659 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 3660 const uc16* special_class, 3661 int length) { 3662 if (ranges->length() * 2 != length) { 3663 return false; 3664 } 3665 for (int i = 0; i < length; i += 2) { 3666 CharacterRange range = ranges->at(i >> 1); 3667 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { 3668 return false; 3669 } 3670 } 3671 return true; 3672 } 3673 3674 3675 bool RegExpCharacterClass::is_standard() { 3676 // TODO(lrn): Remove need for this function, by not throwing away information 3677 // along the way. 3678 if (is_negated_) { 3679 return false; 3680 } 3681 if (set_.is_standard()) { 3682 return true; 3683 } 3684 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3685 set_.set_standard_set_type('s'); 3686 return true; 3687 } 3688 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3689 set_.set_standard_set_type('S'); 3690 return true; 3691 } 3692 if (CompareInverseRanges(set_.ranges(), 3693 kLineTerminatorRanges, 3694 kLineTerminatorRangeCount)) { 3695 set_.set_standard_set_type('.'); 3696 return true; 3697 } 3698 if (CompareRanges(set_.ranges(), 3699 kLineTerminatorRanges, 3700 kLineTerminatorRangeCount)) { 3701 set_.set_standard_set_type('n'); 3702 return true; 3703 } 3704 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 3705 set_.set_standard_set_type('w'); 3706 return true; 3707 } 3708 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 3709 set_.set_standard_set_type('W'); 3710 return true; 3711 } 3712 return false; 3713 } 3714 3715 3716 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 3717 RegExpNode* on_success) { 3718 return new TextNode(this, on_success); 3719 } 3720 3721 3722 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 3723 RegExpNode* on_success) { 3724 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 3725 int length = alternatives->length(); 3726 ChoiceNode* result = new ChoiceNode(length); 3727 for (int i = 0; i < length; i++) { 3728 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 3729 on_success)); 3730 result->AddAlternative(alternative); 3731 } 3732 return result; 3733 } 3734 3735 3736 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 3737 RegExpNode* on_success) { 3738 return ToNode(min(), 3739 max(), 3740 is_greedy(), 3741 body(), 3742 compiler, 3743 on_success); 3744 } 3745 3746 3747 // Scoped object to keep track of how much we unroll quantifier loops in the 3748 // regexp graph generator. 3749 class RegExpExpansionLimiter { 3750 public: 3751 static const int kMaxExpansionFactor = 6; 3752 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 3753 : compiler_(compiler), 3754 saved_expansion_factor_(compiler->current_expansion_factor()), 3755 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 3756 ASSERT(factor > 0); 3757 if (ok_to_expand_) { 3758 if (factor > kMaxExpansionFactor) { 3759 // Avoid integer overflow of the current expansion factor. 3760 ok_to_expand_ = false; 3761 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 3762 } else { 3763 int new_factor = saved_expansion_factor_ * factor; 3764 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 3765 compiler->set_current_expansion_factor(new_factor); 3766 } 3767 } 3768 } 3769 3770 ~RegExpExpansionLimiter() { 3771 compiler_->set_current_expansion_factor(saved_expansion_factor_); 3772 } 3773 3774 bool ok_to_expand() { return ok_to_expand_; } 3775 3776 private: 3777 RegExpCompiler* compiler_; 3778 int saved_expansion_factor_; 3779 bool ok_to_expand_; 3780 3781 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 3782 }; 3783 3784 3785 RegExpNode* RegExpQuantifier::ToNode(int min, 3786 int max, 3787 bool is_greedy, 3788 RegExpTree* body, 3789 RegExpCompiler* compiler, 3790 RegExpNode* on_success, 3791 bool not_at_start) { 3792 // x{f, t} becomes this: 3793 // 3794 // (r++)<-. 3795 // | ` 3796 // | (x) 3797 // v ^ 3798 // (r=0)-->(?)---/ [if r < t] 3799 // | 3800 // [if r >= f] \----> ... 3801 // 3802 3803 // 15.10.2.5 RepeatMatcher algorithm. 3804 // The parser has already eliminated the case where max is 0. In the case 3805 // where max_match is zero the parser has removed the quantifier if min was 3806 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 3807 3808 // If we know that we cannot match zero length then things are a little 3809 // simpler since we don't need to make the special zero length match check 3810 // from step 2.1. If the min and max are small we can unroll a little in 3811 // this case. 3812 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 3813 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 3814 if (max == 0) return on_success; // This can happen due to recursion. 3815 bool body_can_be_empty = (body->min_match() == 0); 3816 int body_start_reg = RegExpCompiler::kNoRegister; 3817 Interval capture_registers = body->CaptureRegisters(); 3818 bool needs_capture_clearing = !capture_registers.is_empty(); 3819 if (body_can_be_empty) { 3820 body_start_reg = compiler->AllocateRegister(); 3821 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 3822 // Only unroll if there are no captures and the body can't be 3823 // empty. 3824 { 3825 RegExpExpansionLimiter limiter( 3826 compiler, min + ((max != min) ? 1 : 0)); 3827 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 3828 int new_max = (max == kInfinity) ? max : max - min; 3829 // Recurse once to get the loop or optional matches after the fixed 3830 // ones. 3831 RegExpNode* answer = ToNode( 3832 0, new_max, is_greedy, body, compiler, on_success, true); 3833 // Unroll the forced matches from 0 to min. This can cause chains of 3834 // TextNodes (which the parser does not generate). These should be 3835 // combined if it turns out they hinder good code generation. 3836 for (int i = 0; i < min; i++) { 3837 answer = body->ToNode(compiler, answer); 3838 } 3839 return answer; 3840 } 3841 } 3842 if (max <= kMaxUnrolledMaxMatches && min == 0) { 3843 ASSERT(max > 0); // Due to the 'if' above. 3844 RegExpExpansionLimiter limiter(compiler, max); 3845 if (limiter.ok_to_expand()) { 3846 // Unroll the optional matches up to max. 3847 RegExpNode* answer = on_success; 3848 for (int i = 0; i < max; i++) { 3849 ChoiceNode* alternation = new ChoiceNode(2); 3850 if (is_greedy) { 3851 alternation->AddAlternative( 3852 GuardedAlternative(body->ToNode(compiler, answer))); 3853 alternation->AddAlternative(GuardedAlternative(on_success)); 3854 } else { 3855 alternation->AddAlternative(GuardedAlternative(on_success)); 3856 alternation->AddAlternative( 3857 GuardedAlternative(body->ToNode(compiler, answer))); 3858 } 3859 answer = alternation; 3860 if (not_at_start) alternation->set_not_at_start(); 3861 } 3862 return answer; 3863 } 3864 } 3865 } 3866 bool has_min = min > 0; 3867 bool has_max = max < RegExpTree::kInfinity; 3868 bool needs_counter = has_min || has_max; 3869 int reg_ctr = needs_counter 3870 ? compiler->AllocateRegister() 3871 : RegExpCompiler::kNoRegister; 3872 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0); 3873 if (not_at_start) center->set_not_at_start(); 3874 RegExpNode* loop_return = needs_counter 3875 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 3876 : static_cast<RegExpNode*>(center); 3877 if (body_can_be_empty) { 3878 // If the body can be empty we need to check if it was and then 3879 // backtrack. 3880 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 3881 reg_ctr, 3882 min, 3883 loop_return); 3884 } 3885 RegExpNode* body_node = body->ToNode(compiler, loop_return); 3886 if (body_can_be_empty) { 3887 // If the body can be empty we need to store the start position 3888 // so we can bail out if it was empty. 3889 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 3890 } 3891 if (needs_capture_clearing) { 3892 // Before entering the body of this loop we need to clear captures. 3893 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 3894 } 3895 GuardedAlternative body_alt(body_node); 3896 if (has_max) { 3897 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); 3898 body_alt.AddGuard(body_guard); 3899 } 3900 GuardedAlternative rest_alt(on_success); 3901 if (has_min) { 3902 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min); 3903 rest_alt.AddGuard(rest_guard); 3904 } 3905 if (is_greedy) { 3906 center->AddLoopAlternative(body_alt); 3907 center->AddContinueAlternative(rest_alt); 3908 } else { 3909 center->AddContinueAlternative(rest_alt); 3910 center->AddLoopAlternative(body_alt); 3911 } 3912 if (needs_counter) { 3913 return ActionNode::SetRegister(reg_ctr, 0, center); 3914 } else { 3915 return center; 3916 } 3917 } 3918 3919 3920 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 3921 RegExpNode* on_success) { 3922 NodeInfo info; 3923 switch (type()) { 3924 case START_OF_LINE: 3925 return AssertionNode::AfterNewline(on_success); 3926 case START_OF_INPUT: 3927 return AssertionNode::AtStart(on_success); 3928 case BOUNDARY: 3929 return AssertionNode::AtBoundary(on_success); 3930 case NON_BOUNDARY: 3931 return AssertionNode::AtNonBoundary(on_success); 3932 case END_OF_INPUT: 3933 return AssertionNode::AtEnd(on_success); 3934 case END_OF_LINE: { 3935 // Compile $ in multiline regexps as an alternation with a positive 3936 // lookahead in one side and an end-of-input on the other side. 3937 // We need two registers for the lookahead. 3938 int stack_pointer_register = compiler->AllocateRegister(); 3939 int position_register = compiler->AllocateRegister(); 3940 // The ChoiceNode to distinguish between a newline and end-of-input. 3941 ChoiceNode* result = new ChoiceNode(2); 3942 // Create a newline atom. 3943 ZoneList<CharacterRange>* newline_ranges = 3944 new ZoneList<CharacterRange>(3); 3945 CharacterRange::AddClassEscape('n', newline_ranges); 3946 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n'); 3947 TextNode* newline_matcher = new TextNode( 3948 newline_atom, 3949 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 3950 position_register, 3951 0, // No captures inside. 3952 -1, // Ignored if no captures. 3953 on_success)); 3954 // Create an end-of-input matcher. 3955 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 3956 stack_pointer_register, 3957 position_register, 3958 newline_matcher); 3959 // Add the two alternatives to the ChoiceNode. 3960 GuardedAlternative eol_alternative(end_of_line); 3961 result->AddAlternative(eol_alternative); 3962 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 3963 result->AddAlternative(end_alternative); 3964 return result; 3965 } 3966 default: 3967 UNREACHABLE(); 3968 } 3969 return on_success; 3970 } 3971 3972 3973 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 3974 RegExpNode* on_success) { 3975 return new BackReferenceNode(RegExpCapture::StartRegister(index()), 3976 RegExpCapture::EndRegister(index()), 3977 on_success); 3978 } 3979 3980 3981 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 3982 RegExpNode* on_success) { 3983 return on_success; 3984 } 3985 3986 3987 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 3988 RegExpNode* on_success) { 3989 int stack_pointer_register = compiler->AllocateRegister(); 3990 int position_register = compiler->AllocateRegister(); 3991 3992 const int registers_per_capture = 2; 3993 const int register_of_first_capture = 2; 3994 int register_count = capture_count_ * registers_per_capture; 3995 int register_start = 3996 register_of_first_capture + capture_from_ * registers_per_capture; 3997 3998 RegExpNode* success; 3999 if (is_positive()) { 4000 RegExpNode* node = ActionNode::BeginSubmatch( 4001 stack_pointer_register, 4002 position_register, 4003 body()->ToNode( 4004 compiler, 4005 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 4006 position_register, 4007 register_count, 4008 register_start, 4009 on_success))); 4010 return node; 4011 } else { 4012 // We use a ChoiceNode for a negative lookahead because it has most of 4013 // the characteristics we need. It has the body of the lookahead as its 4014 // first alternative and the expression after the lookahead of the second 4015 // alternative. If the first alternative succeeds then the 4016 // NegativeSubmatchSuccess will unwind the stack including everything the 4017 // choice node set up and backtrack. If the first alternative fails then 4018 // the second alternative is tried, which is exactly the desired result 4019 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 4020 // ChoiceNode that knows to ignore the first exit when calculating quick 4021 // checks. 4022 GuardedAlternative body_alt( 4023 body()->ToNode( 4024 compiler, 4025 success = new NegativeSubmatchSuccess(stack_pointer_register, 4026 position_register, 4027 register_count, 4028 register_start))); 4029 ChoiceNode* choice_node = 4030 new NegativeLookaheadChoiceNode(body_alt, 4031 GuardedAlternative(on_success)); 4032 return ActionNode::BeginSubmatch(stack_pointer_register, 4033 position_register, 4034 choice_node); 4035 } 4036 } 4037 4038 4039 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 4040 RegExpNode* on_success) { 4041 return ToNode(body(), index(), compiler, on_success); 4042 } 4043 4044 4045 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 4046 int index, 4047 RegExpCompiler* compiler, 4048 RegExpNode* on_success) { 4049 int start_reg = RegExpCapture::StartRegister(index); 4050 int end_reg = RegExpCapture::EndRegister(index); 4051 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 4052 RegExpNode* body_node = body->ToNode(compiler, store_end); 4053 return ActionNode::StorePosition(start_reg, true, body_node); 4054 } 4055 4056 4057 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 4058 RegExpNode* on_success) { 4059 ZoneList<RegExpTree*>* children = nodes(); 4060 RegExpNode* current = on_success; 4061 for (int i = children->length() - 1; i >= 0; i--) { 4062 current = children->at(i)->ToNode(compiler, current); 4063 } 4064 return current; 4065 } 4066 4067 4068 static void AddClass(const uc16* elmv, 4069 int elmc, 4070 ZoneList<CharacterRange>* ranges) { 4071 for (int i = 0; i < elmc; i += 2) { 4072 ASSERT(elmv[i] <= elmv[i + 1]); 4073 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 4074 } 4075 } 4076 4077 4078 static void AddClassNegated(const uc16 *elmv, 4079 int elmc, 4080 ZoneList<CharacterRange>* ranges) { 4081 ASSERT(elmv[0] != 0x0000); 4082 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 4083 uc16 last = 0x0000; 4084 for (int i = 0; i < elmc; i += 2) { 4085 ASSERT(last <= elmv[i] - 1); 4086 ASSERT(elmv[i] <= elmv[i + 1]); 4087 ranges->Add(CharacterRange(last, elmv[i] - 1)); 4088 last = elmv[i + 1] + 1; 4089 } 4090 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); 4091 } 4092 4093 4094 void CharacterRange::AddClassEscape(uc16 type, 4095 ZoneList<CharacterRange>* ranges) { 4096 switch (type) { 4097 case 's': 4098 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); 4099 break; 4100 case 'S': 4101 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); 4102 break; 4103 case 'w': 4104 AddClass(kWordRanges, kWordRangeCount, ranges); 4105 break; 4106 case 'W': 4107 AddClassNegated(kWordRanges, kWordRangeCount, ranges); 4108 break; 4109 case 'd': 4110 AddClass(kDigitRanges, kDigitRangeCount, ranges); 4111 break; 4112 case 'D': 4113 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); 4114 break; 4115 case '.': 4116 AddClassNegated(kLineTerminatorRanges, 4117 kLineTerminatorRangeCount, 4118 ranges); 4119 break; 4120 // This is not a character range as defined by the spec but a 4121 // convenient shorthand for a character class that matches any 4122 // character. 4123 case '*': 4124 ranges->Add(CharacterRange::Everything()); 4125 break; 4126 // This is the set of characters matched by the $ and ^ symbols 4127 // in multiline mode. 4128 case 'n': 4129 AddClass(kLineTerminatorRanges, 4130 kLineTerminatorRangeCount, 4131 ranges); 4132 break; 4133 default: 4134 UNREACHABLE(); 4135 } 4136 } 4137 4138 4139 Vector<const uc16> CharacterRange::GetWordBounds() { 4140 return Vector<const uc16>(kWordRanges, kWordRangeCount); 4141 } 4142 4143 4144 class CharacterRangeSplitter { 4145 public: 4146 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 4147 ZoneList<CharacterRange>** excluded) 4148 : included_(included), 4149 excluded_(excluded) { } 4150 void Call(uc16 from, DispatchTable::Entry entry); 4151 4152 static const int kInBase = 0; 4153 static const int kInOverlay = 1; 4154 4155 private: 4156 ZoneList<CharacterRange>** included_; 4157 ZoneList<CharacterRange>** excluded_; 4158 }; 4159 4160 4161 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 4162 if (!entry.out_set()->Get(kInBase)) return; 4163 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 4164 ? included_ 4165 : excluded_; 4166 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); 4167 (*target)->Add(CharacterRange(entry.from(), entry.to())); 4168 } 4169 4170 4171 void CharacterRange::Split(ZoneList<CharacterRange>* base, 4172 Vector<const uc16> overlay, 4173 ZoneList<CharacterRange>** included, 4174 ZoneList<CharacterRange>** excluded) { 4175 ASSERT_EQ(NULL, *included); 4176 ASSERT_EQ(NULL, *excluded); 4177 DispatchTable table; 4178 for (int i = 0; i < base->length(); i++) 4179 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 4180 for (int i = 0; i < overlay.length(); i += 2) { 4181 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 4182 CharacterRangeSplitter::kInOverlay); 4183 } 4184 CharacterRangeSplitter callback(included, excluded); 4185 table.ForEach(&callback); 4186 } 4187 4188 4189 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 4190 bool is_ascii) { 4191 Isolate* isolate = Isolate::Current(); 4192 uc16 bottom = from(); 4193 uc16 top = to(); 4194 if (is_ascii) { 4195 if (bottom > String::kMaxAsciiCharCode) return; 4196 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; 4197 } 4198 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4199 if (top == bottom) { 4200 // If this is a singleton we just expand the one character. 4201 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 4202 for (int i = 0; i < length; i++) { 4203 uc32 chr = chars[i]; 4204 if (chr != bottom) { 4205 ranges->Add(CharacterRange::Singleton(chars[i])); 4206 } 4207 } 4208 } else { 4209 // If this is a range we expand the characters block by block, 4210 // expanding contiguous subranges (blocks) one at a time. 4211 // The approach is as follows. For a given start character we 4212 // look up the remainder of the block that contains it (represented 4213 // by the end point), for instance we find 'z' if the character 4214 // is 'c'. A block is characterized by the property 4215 // that all characters uncanonicalize in the same way, except that 4216 // each entry in the result is incremented by the distance from the first 4217 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 4218 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 4219 // Once we've found the end point we look up its uncanonicalization 4220 // and produce a range for each element. For instance for [c-f] 4221 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 4222 // add a range if it is not already contained in the input, so [c-f] 4223 // will be skipped but [C-F] will be added. If this range is not 4224 // completely contained in a block we do this for all the blocks 4225 // covered by the range (handling characters that is not in a block 4226 // as a "singleton block"). 4227 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4228 int pos = bottom; 4229 while (pos < top) { 4230 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 4231 uc16 block_end; 4232 if (length == 0) { 4233 block_end = pos; 4234 } else { 4235 ASSERT_EQ(1, length); 4236 block_end = range[0]; 4237 } 4238 int end = (block_end > top) ? top : block_end; 4239 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 4240 for (int i = 0; i < length; i++) { 4241 uc32 c = range[i]; 4242 uc16 range_from = c - (block_end - pos); 4243 uc16 range_to = c - (block_end - end); 4244 if (!(bottom <= range_from && range_to <= top)) { 4245 ranges->Add(CharacterRange(range_from, range_to)); 4246 } 4247 } 4248 pos = end + 1; 4249 } 4250 } 4251 } 4252 4253 4254 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 4255 ASSERT_NOT_NULL(ranges); 4256 int n = ranges->length(); 4257 if (n <= 1) return true; 4258 int max = ranges->at(0).to(); 4259 for (int i = 1; i < n; i++) { 4260 CharacterRange next_range = ranges->at(i); 4261 if (next_range.from() <= max + 1) return false; 4262 max = next_range.to(); 4263 } 4264 return true; 4265 } 4266 4267 SetRelation CharacterRange::WordCharacterRelation( 4268 ZoneList<CharacterRange>* range) { 4269 ASSERT(IsCanonical(range)); 4270 int i = 0; // Word character range index. 4271 int j = 0; // Argument range index. 4272 ASSERT_NE(0, kWordRangeCount); 4273 SetRelation result; 4274 if (range->length() == 0) { 4275 result.SetElementsInSecondSet(); 4276 return result; 4277 } 4278 CharacterRange argument_range = range->at(0); 4279 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); 4280 while (i < kWordRangeCount && j < range->length()) { 4281 // Check the two ranges for the five cases: 4282 // - no overlap. 4283 // - partial overlap (there are elements in both ranges that isn't 4284 // in the other, and there are also elements that are in both). 4285 // - argument range entirely inside word range. 4286 // - word range entirely inside argument range. 4287 // - ranges are completely equal. 4288 4289 // First check for no overlap. The earlier range is not in the other set. 4290 if (argument_range.from() > word_range.to()) { 4291 // Ranges are disjoint. The earlier word range contains elements that 4292 // cannot be in the argument set. 4293 result.SetElementsInSecondSet(); 4294 } else if (word_range.from() > argument_range.to()) { 4295 // Ranges are disjoint. The earlier argument range contains elements that 4296 // cannot be in the word set. 4297 result.SetElementsInFirstSet(); 4298 } else if (word_range.from() <= argument_range.from() && 4299 word_range.to() >= argument_range.from()) { 4300 result.SetElementsInBothSets(); 4301 // argument range completely inside word range. 4302 if (word_range.from() < argument_range.from() || 4303 word_range.to() > argument_range.from()) { 4304 result.SetElementsInSecondSet(); 4305 } 4306 } else if (word_range.from() >= argument_range.from() && 4307 word_range.to() <= argument_range.from()) { 4308 result.SetElementsInBothSets(); 4309 result.SetElementsInFirstSet(); 4310 } else { 4311 // There is overlap, and neither is a subrange of the other 4312 result.SetElementsInFirstSet(); 4313 result.SetElementsInSecondSet(); 4314 result.SetElementsInBothSets(); 4315 } 4316 if (result.NonTrivialIntersection()) { 4317 // The result is as (im)precise as we can possibly make it. 4318 return result; 4319 } 4320 // Progress the range(s) with minimal to-character. 4321 uc16 word_to = word_range.to(); 4322 uc16 argument_to = argument_range.to(); 4323 if (argument_to <= word_to) { 4324 j++; 4325 if (j < range->length()) { 4326 argument_range = range->at(j); 4327 } 4328 } 4329 if (word_to <= argument_to) { 4330 i += 2; 4331 if (i < kWordRangeCount) { 4332 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); 4333 } 4334 } 4335 } 4336 // Check if anything wasn't compared in the loop. 4337 if (i < kWordRangeCount) { 4338 // word range contains something not in argument range. 4339 result.SetElementsInSecondSet(); 4340 } else if (j < range->length()) { 4341 // Argument range contains something not in word range. 4342 result.SetElementsInFirstSet(); 4343 } 4344 4345 return result; 4346 } 4347 4348 4349 ZoneList<CharacterRange>* CharacterSet::ranges() { 4350 if (ranges_ == NULL) { 4351 ranges_ = new ZoneList<CharacterRange>(2); 4352 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 4353 } 4354 return ranges_; 4355 } 4356 4357 4358 // Move a number of elements in a zonelist to another position 4359 // in the same list. Handles overlapping source and target areas. 4360 static void MoveRanges(ZoneList<CharacterRange>* list, 4361 int from, 4362 int to, 4363 int count) { 4364 // Ranges are potentially overlapping. 4365 if (from < to) { 4366 for (int i = count - 1; i >= 0; i--) { 4367 list->at(to + i) = list->at(from + i); 4368 } 4369 } else { 4370 for (int i = 0; i < count; i++) { 4371 list->at(to + i) = list->at(from + i); 4372 } 4373 } 4374 } 4375 4376 4377 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 4378 int count, 4379 CharacterRange insert) { 4380 // Inserts a range into list[0..count[, which must be sorted 4381 // by from value and non-overlapping and non-adjacent, using at most 4382 // list[0..count] for the result. Returns the number of resulting 4383 // canonicalized ranges. Inserting a range may collapse existing ranges into 4384 // fewer ranges, so the return value can be anything in the range 1..count+1. 4385 uc16 from = insert.from(); 4386 uc16 to = insert.to(); 4387 int start_pos = 0; 4388 int end_pos = count; 4389 for (int i = count - 1; i >= 0; i--) { 4390 CharacterRange current = list->at(i); 4391 if (current.from() > to + 1) { 4392 end_pos = i; 4393 } else if (current.to() + 1 < from) { 4394 start_pos = i + 1; 4395 break; 4396 } 4397 } 4398 4399 // Inserted range overlaps, or is adjacent to, ranges at positions 4400 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 4401 // not affected by the insertion. 4402 // If start_pos == end_pos, the range must be inserted before start_pos. 4403 // if start_pos < end_pos, the entire range from start_pos to end_pos 4404 // must be merged with the insert range. 4405 4406 if (start_pos == end_pos) { 4407 // Insert between existing ranges at position start_pos. 4408 if (start_pos < count) { 4409 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 4410 } 4411 list->at(start_pos) = insert; 4412 return count + 1; 4413 } 4414 if (start_pos + 1 == end_pos) { 4415 // Replace single existing range at position start_pos. 4416 CharacterRange to_replace = list->at(start_pos); 4417 int new_from = Min(to_replace.from(), from); 4418 int new_to = Max(to_replace.to(), to); 4419 list->at(start_pos) = CharacterRange(new_from, new_to); 4420 return count; 4421 } 4422 // Replace a number of existing ranges from start_pos to end_pos - 1. 4423 // Move the remaining ranges down. 4424 4425 int new_from = Min(list->at(start_pos).from(), from); 4426 int new_to = Max(list->at(end_pos - 1).to(), to); 4427 if (end_pos < count) { 4428 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 4429 } 4430 list->at(start_pos) = CharacterRange(new_from, new_to); 4431 return count - (end_pos - start_pos) + 1; 4432 } 4433 4434 4435 void CharacterSet::Canonicalize() { 4436 // Special/default classes are always considered canonical. The result 4437 // of calling ranges() will be sorted. 4438 if (ranges_ == NULL) return; 4439 CharacterRange::Canonicalize(ranges_); 4440 } 4441 4442 4443 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 4444 if (character_ranges->length() <= 1) return; 4445 // Check whether ranges are already canonical (increasing, non-overlapping, 4446 // non-adjacent). 4447 int n = character_ranges->length(); 4448 int max = character_ranges->at(0).to(); 4449 int i = 1; 4450 while (i < n) { 4451 CharacterRange current = character_ranges->at(i); 4452 if (current.from() <= max + 1) { 4453 break; 4454 } 4455 max = current.to(); 4456 i++; 4457 } 4458 // Canonical until the i'th range. If that's all of them, we are done. 4459 if (i == n) return; 4460 4461 // The ranges at index i and forward are not canonicalized. Make them so by 4462 // doing the equivalent of insertion sort (inserting each into the previous 4463 // list, in order). 4464 // Notice that inserting a range can reduce the number of ranges in the 4465 // result due to combining of adjacent and overlapping ranges. 4466 int read = i; // Range to insert. 4467 int num_canonical = i; // Length of canonicalized part of list. 4468 do { 4469 num_canonical = InsertRangeInCanonicalList(character_ranges, 4470 num_canonical, 4471 character_ranges->at(read)); 4472 read++; 4473 } while (read < n); 4474 character_ranges->Rewind(num_canonical); 4475 4476 ASSERT(CharacterRange::IsCanonical(character_ranges)); 4477 } 4478 4479 4480 // Utility function for CharacterRange::Merge. Adds a range at the end of 4481 // a canonicalized range list, if necessary merging the range with the last 4482 // range of the list. 4483 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { 4484 if (set == NULL) return; 4485 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); 4486 int n = set->length(); 4487 if (n > 0) { 4488 CharacterRange lastRange = set->at(n - 1); 4489 if (lastRange.to() == range.from() - 1) { 4490 set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); 4491 return; 4492 } 4493 } 4494 set->Add(range); 4495 } 4496 4497 4498 static void AddRangeToSelectedSet(int selector, 4499 ZoneList<CharacterRange>* first_set, 4500 ZoneList<CharacterRange>* second_set, 4501 ZoneList<CharacterRange>* intersection_set, 4502 CharacterRange range) { 4503 switch (selector) { 4504 case kInsideFirst: 4505 AddRangeToSet(first_set, range); 4506 break; 4507 case kInsideSecond: 4508 AddRangeToSet(second_set, range); 4509 break; 4510 case kInsideBoth: 4511 AddRangeToSet(intersection_set, range); 4512 break; 4513 } 4514 } 4515 4516 4517 4518 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, 4519 ZoneList<CharacterRange>* second_set, 4520 ZoneList<CharacterRange>* first_set_only_out, 4521 ZoneList<CharacterRange>* second_set_only_out, 4522 ZoneList<CharacterRange>* both_sets_out) { 4523 // Inputs are canonicalized. 4524 ASSERT(CharacterRange::IsCanonical(first_set)); 4525 ASSERT(CharacterRange::IsCanonical(second_set)); 4526 // Outputs are empty, if applicable. 4527 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); 4528 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); 4529 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); 4530 4531 // Merge sets by iterating through the lists in order of lowest "from" value, 4532 // and putting intervals into one of three sets. 4533 4534 if (first_set->length() == 0) { 4535 second_set_only_out->AddAll(*second_set); 4536 return; 4537 } 4538 if (second_set->length() == 0) { 4539 first_set_only_out->AddAll(*first_set); 4540 return; 4541 } 4542 // Indices into input lists. 4543 int i1 = 0; 4544 int i2 = 0; 4545 // Cache length of input lists. 4546 int n1 = first_set->length(); 4547 int n2 = second_set->length(); 4548 // Current range. May be invalid if state is kInsideNone. 4549 int from = 0; 4550 int to = -1; 4551 // Where current range comes from. 4552 int state = kInsideNone; 4553 4554 while (i1 < n1 || i2 < n2) { 4555 CharacterRange next_range; 4556 int range_source; 4557 if (i2 == n2 || 4558 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { 4559 // Next smallest element is in first set. 4560 next_range = first_set->at(i1++); 4561 range_source = kInsideFirst; 4562 } else { 4563 // Next smallest element is in second set. 4564 next_range = second_set->at(i2++); 4565 range_source = kInsideSecond; 4566 } 4567 if (to < next_range.from()) { 4568 // Ranges disjoint: |current| |next| 4569 AddRangeToSelectedSet(state, 4570 first_set_only_out, 4571 second_set_only_out, 4572 both_sets_out, 4573 CharacterRange(from, to)); 4574 from = next_range.from(); 4575 to = next_range.to(); 4576 state = range_source; 4577 } else { 4578 if (from < next_range.from()) { 4579 AddRangeToSelectedSet(state, 4580 first_set_only_out, 4581 second_set_only_out, 4582 both_sets_out, 4583 CharacterRange(from, next_range.from()-1)); 4584 } 4585 if (to < next_range.to()) { 4586 // Ranges overlap: |current| 4587 // |next| 4588 AddRangeToSelectedSet(state | range_source, 4589 first_set_only_out, 4590 second_set_only_out, 4591 both_sets_out, 4592 CharacterRange(next_range.from(), to)); 4593 from = to + 1; 4594 to = next_range.to(); 4595 state = range_source; 4596 } else { 4597 // Range included: |current| , possibly ending at same character. 4598 // |next| 4599 AddRangeToSelectedSet( 4600 state | range_source, 4601 first_set_only_out, 4602 second_set_only_out, 4603 both_sets_out, 4604 CharacterRange(next_range.from(), next_range.to())); 4605 from = next_range.to() + 1; 4606 // If ranges end at same character, both ranges are consumed completely. 4607 if (next_range.to() == to) state = kInsideNone; 4608 } 4609 } 4610 } 4611 AddRangeToSelectedSet(state, 4612 first_set_only_out, 4613 second_set_only_out, 4614 both_sets_out, 4615 CharacterRange(from, to)); 4616 } 4617 4618 4619 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 4620 ZoneList<CharacterRange>* negated_ranges) { 4621 ASSERT(CharacterRange::IsCanonical(ranges)); 4622 ASSERT_EQ(0, negated_ranges->length()); 4623 int range_count = ranges->length(); 4624 uc16 from = 0; 4625 int i = 0; 4626 if (range_count > 0 && ranges->at(0).from() == 0) { 4627 from = ranges->at(0).to(); 4628 i = 1; 4629 } 4630 while (i < range_count) { 4631 CharacterRange range = ranges->at(i); 4632 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1)); 4633 from = range.to(); 4634 i++; 4635 } 4636 if (from < String::kMaxUtf16CodeUnit) { 4637 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit)); 4638 } 4639 } 4640 4641 4642 4643 // ------------------------------------------------------------------- 4644 // Interest propagation 4645 4646 4647 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) { 4648 for (int i = 0; i < siblings_.length(); i++) { 4649 RegExpNode* sibling = siblings_.Get(i); 4650 if (sibling->info()->Matches(info)) 4651 return sibling; 4652 } 4653 return NULL; 4654 } 4655 4656 4657 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) { 4658 ASSERT_EQ(false, *cloned); 4659 siblings_.Ensure(this); 4660 RegExpNode* result = TryGetSibling(info); 4661 if (result != NULL) return result; 4662 result = this->Clone(); 4663 NodeInfo* new_info = result->info(); 4664 new_info->ResetCompilationState(); 4665 new_info->AddFromPreceding(info); 4666 AddSibling(result); 4667 *cloned = true; 4668 return result; 4669 } 4670 4671 4672 template <class C> 4673 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) { 4674 NodeInfo full_info(*node->info()); 4675 full_info.AddFromPreceding(info); 4676 bool cloned = false; 4677 return RegExpNode::EnsureSibling(node, &full_info, &cloned); 4678 } 4679 4680 4681 // ------------------------------------------------------------------- 4682 // Splay tree 4683 4684 4685 OutSet* OutSet::Extend(unsigned value) { 4686 if (Get(value)) 4687 return this; 4688 if (successors() != NULL) { 4689 for (int i = 0; i < successors()->length(); i++) { 4690 OutSet* successor = successors()->at(i); 4691 if (successor->Get(value)) 4692 return successor; 4693 } 4694 } else { 4695 successors_ = new ZoneList<OutSet*>(2); 4696 } 4697 OutSet* result = new OutSet(first_, remaining_); 4698 result->Set(value); 4699 successors()->Add(result); 4700 return result; 4701 } 4702 4703 4704 void OutSet::Set(unsigned value) { 4705 if (value < kFirstLimit) { 4706 first_ |= (1 << value); 4707 } else { 4708 if (remaining_ == NULL) 4709 remaining_ = new ZoneList<unsigned>(1); 4710 if (remaining_->is_empty() || !remaining_->Contains(value)) 4711 remaining_->Add(value); 4712 } 4713 } 4714 4715 4716 bool OutSet::Get(unsigned value) { 4717 if (value < kFirstLimit) { 4718 return (first_ & (1 << value)) != 0; 4719 } else if (remaining_ == NULL) { 4720 return false; 4721 } else { 4722 return remaining_->Contains(value); 4723 } 4724 } 4725 4726 4727 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 4728 4729 4730 void DispatchTable::AddRange(CharacterRange full_range, int value) { 4731 CharacterRange current = full_range; 4732 if (tree()->is_empty()) { 4733 // If this is the first range we just insert into the table. 4734 ZoneSplayTree<Config>::Locator loc; 4735 ASSERT_RESULT(tree()->Insert(current.from(), &loc)); 4736 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); 4737 return; 4738 } 4739 // First see if there is a range to the left of this one that 4740 // overlaps. 4741 ZoneSplayTree<Config>::Locator loc; 4742 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 4743 Entry* entry = &loc.value(); 4744 // If we've found a range that overlaps with this one, and it 4745 // starts strictly to the left of this one, we have to fix it 4746 // because the following code only handles ranges that start on 4747 // or after the start point of the range we're adding. 4748 if (entry->from() < current.from() && entry->to() >= current.from()) { 4749 // Snap the overlapping range in half around the start point of 4750 // the range we're adding. 4751 CharacterRange left(entry->from(), current.from() - 1); 4752 CharacterRange right(current.from(), entry->to()); 4753 // The left part of the overlapping range doesn't overlap. 4754 // Truncate the whole entry to be just the left part. 4755 entry->set_to(left.to()); 4756 // The right part is the one that overlaps. We add this part 4757 // to the map and let the next step deal with merging it with 4758 // the range we're adding. 4759 ZoneSplayTree<Config>::Locator loc; 4760 ASSERT_RESULT(tree()->Insert(right.from(), &loc)); 4761 loc.set_value(Entry(right.from(), 4762 right.to(), 4763 entry->out_set())); 4764 } 4765 } 4766 while (current.is_valid()) { 4767 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 4768 (loc.value().from() <= current.to()) && 4769 (loc.value().to() >= current.from())) { 4770 Entry* entry = &loc.value(); 4771 // We have overlap. If there is space between the start point of 4772 // the range we're adding and where the overlapping range starts 4773 // then we have to add a range covering just that space. 4774 if (current.from() < entry->from()) { 4775 ZoneSplayTree<Config>::Locator ins; 4776 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4777 ins.set_value(Entry(current.from(), 4778 entry->from() - 1, 4779 empty()->Extend(value))); 4780 current.set_from(entry->from()); 4781 } 4782 ASSERT_EQ(current.from(), entry->from()); 4783 // If the overlapping range extends beyond the one we want to add 4784 // we have to snap the right part off and add it separately. 4785 if (entry->to() > current.to()) { 4786 ZoneSplayTree<Config>::Locator ins; 4787 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); 4788 ins.set_value(Entry(current.to() + 1, 4789 entry->to(), 4790 entry->out_set())); 4791 entry->set_to(current.to()); 4792 } 4793 ASSERT(entry->to() <= current.to()); 4794 // The overlapping range is now completely contained by the range 4795 // we're adding so we can just update it and move the start point 4796 // of the range we're adding just past it. 4797 entry->AddValue(value); 4798 // Bail out if the last interval ended at 0xFFFF since otherwise 4799 // adding 1 will wrap around to 0. 4800 if (entry->to() == String::kMaxUtf16CodeUnit) 4801 break; 4802 ASSERT(entry->to() + 1 > current.from()); 4803 current.set_from(entry->to() + 1); 4804 } else { 4805 // There is no overlap so we can just add the range 4806 ZoneSplayTree<Config>::Locator ins; 4807 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4808 ins.set_value(Entry(current.from(), 4809 current.to(), 4810 empty()->Extend(value))); 4811 break; 4812 } 4813 } 4814 } 4815 4816 4817 OutSet* DispatchTable::Get(uc16 value) { 4818 ZoneSplayTree<Config>::Locator loc; 4819 if (!tree()->FindGreatestLessThan(value, &loc)) 4820 return empty(); 4821 Entry* entry = &loc.value(); 4822 if (value <= entry->to()) 4823 return entry->out_set(); 4824 else 4825 return empty(); 4826 } 4827 4828 4829 // ------------------------------------------------------------------- 4830 // Analysis 4831 4832 4833 void Analysis::EnsureAnalyzed(RegExpNode* that) { 4834 StackLimitCheck check(Isolate::Current()); 4835 if (check.HasOverflowed()) { 4836 fail("Stack overflow"); 4837 return; 4838 } 4839 if (that->info()->been_analyzed || that->info()->being_analyzed) 4840 return; 4841 that->info()->being_analyzed = true; 4842 that->Accept(this); 4843 that->info()->being_analyzed = false; 4844 that->info()->been_analyzed = true; 4845 } 4846 4847 4848 void Analysis::VisitEnd(EndNode* that) { 4849 // nothing to do 4850 } 4851 4852 4853 void TextNode::CalculateOffsets() { 4854 int element_count = elements()->length(); 4855 // Set up the offsets of the elements relative to the start. This is a fixed 4856 // quantity since a TextNode can only contain fixed-width things. 4857 int cp_offset = 0; 4858 for (int i = 0; i < element_count; i++) { 4859 TextElement& elm = elements()->at(i); 4860 elm.cp_offset = cp_offset; 4861 if (elm.type == TextElement::ATOM) { 4862 cp_offset += elm.data.u_atom->data().length(); 4863 } else { 4864 cp_offset++; 4865 } 4866 } 4867 } 4868 4869 4870 void Analysis::VisitText(TextNode* that) { 4871 if (ignore_case_) { 4872 that->MakeCaseIndependent(is_ascii_); 4873 } 4874 EnsureAnalyzed(that->on_success()); 4875 if (!has_failed()) { 4876 that->CalculateOffsets(); 4877 } 4878 } 4879 4880 4881 void Analysis::VisitAction(ActionNode* that) { 4882 RegExpNode* target = that->on_success(); 4883 EnsureAnalyzed(target); 4884 if (!has_failed()) { 4885 // If the next node is interested in what it follows then this node 4886 // has to be interested too so it can pass the information on. 4887 that->info()->AddFromFollowing(target->info()); 4888 } 4889 } 4890 4891 4892 void Analysis::VisitChoice(ChoiceNode* that) { 4893 NodeInfo* info = that->info(); 4894 for (int i = 0; i < that->alternatives()->length(); i++) { 4895 RegExpNode* node = that->alternatives()->at(i).node(); 4896 EnsureAnalyzed(node); 4897 if (has_failed()) return; 4898 // Anything the following nodes need to know has to be known by 4899 // this node also, so it can pass it on. 4900 info->AddFromFollowing(node->info()); 4901 } 4902 } 4903 4904 4905 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 4906 NodeInfo* info = that->info(); 4907 for (int i = 0; i < that->alternatives()->length(); i++) { 4908 RegExpNode* node = that->alternatives()->at(i).node(); 4909 if (node != that->loop_node()) { 4910 EnsureAnalyzed(node); 4911 if (has_failed()) return; 4912 info->AddFromFollowing(node->info()); 4913 } 4914 } 4915 // Check the loop last since it may need the value of this node 4916 // to get a correct result. 4917 EnsureAnalyzed(that->loop_node()); 4918 if (!has_failed()) { 4919 info->AddFromFollowing(that->loop_node()->info()); 4920 } 4921 } 4922 4923 4924 void Analysis::VisitBackReference(BackReferenceNode* that) { 4925 EnsureAnalyzed(that->on_success()); 4926 } 4927 4928 4929 void Analysis::VisitAssertion(AssertionNode* that) { 4930 EnsureAnalyzed(that->on_success()); 4931 AssertionNode::AssertionNodeType type = that->type(); 4932 if (type == AssertionNode::AT_BOUNDARY || 4933 type == AssertionNode::AT_NON_BOUNDARY) { 4934 // Check if the following character is known to be a word character 4935 // or known to not be a word character. 4936 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); 4937 4938 CharacterRange::Canonicalize(following_chars); 4939 4940 SetRelation word_relation = 4941 CharacterRange::WordCharacterRelation(following_chars); 4942 if (word_relation.Disjoint()) { 4943 // Includes the case where following_chars is empty (e.g., end-of-input). 4944 // Following character is definitely *not* a word character. 4945 type = (type == AssertionNode::AT_BOUNDARY) ? 4946 AssertionNode::AFTER_WORD_CHARACTER : 4947 AssertionNode::AFTER_NONWORD_CHARACTER; 4948 that->set_type(type); 4949 } else if (word_relation.ContainedIn()) { 4950 // Following character is definitely a word character. 4951 type = (type == AssertionNode::AT_BOUNDARY) ? 4952 AssertionNode::AFTER_NONWORD_CHARACTER : 4953 AssertionNode::AFTER_WORD_CHARACTER; 4954 that->set_type(type); 4955 } 4956 } 4957 } 4958 4959 4960 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { 4961 if (first_character_set_ == NULL) { 4962 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { 4963 // If we can't find an exact solution within the budget, we 4964 // set the value to the set of every character, i.e., all characters 4965 // are possible. 4966 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); 4967 all_set->Add(CharacterRange::Everything()); 4968 first_character_set_ = all_set; 4969 } 4970 } 4971 return first_character_set_; 4972 } 4973 4974 4975 int RegExpNode::ComputeFirstCharacterSet(int budget) { 4976 // Default behavior is to not be able to determine the first character. 4977 return kComputeFirstCharacterSetFail; 4978 } 4979 4980 4981 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { 4982 budget--; 4983 if (budget >= 0) { 4984 // Find loop min-iteration. It's the value of the guarded choice node 4985 // with a GEQ guard, if any. 4986 int min_repetition = 0; 4987 4988 for (int i = 0; i <= 1; i++) { 4989 GuardedAlternative alternative = alternatives()->at(i); 4990 ZoneList<Guard*>* guards = alternative.guards(); 4991 if (guards != NULL && guards->length() > 0) { 4992 Guard* guard = guards->at(0); 4993 if (guard->op() == Guard::GEQ) { 4994 min_repetition = guard->value(); 4995 break; 4996 } 4997 } 4998 } 4999 5000 budget = loop_node()->ComputeFirstCharacterSet(budget); 5001 if (budget >= 0) { 5002 ZoneList<CharacterRange>* character_set = 5003 loop_node()->first_character_set(); 5004 if (body_can_be_zero_length() || min_repetition == 0) { 5005 budget = continue_node()->ComputeFirstCharacterSet(budget); 5006 if (budget < 0) return budget; 5007 ZoneList<CharacterRange>* body_set = 5008 continue_node()->first_character_set(); 5009 ZoneList<CharacterRange>* union_set = 5010 new ZoneList<CharacterRange>(Max(character_set->length(), 5011 body_set->length())); 5012 CharacterRange::Merge(character_set, 5013 body_set, 5014 union_set, 5015 union_set, 5016 union_set); 5017 character_set = union_set; 5018 } 5019 set_first_character_set(character_set); 5020 } 5021 } 5022 return budget; 5023 } 5024 5025 5026 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { 5027 budget--; 5028 if (budget >= 0) { 5029 GuardedAlternative successor = this->alternatives()->at(1); 5030 RegExpNode* successor_node = successor.node(); 5031 budget = successor_node->ComputeFirstCharacterSet(budget); 5032 if (budget >= 0) { 5033 set_first_character_set(successor_node->first_character_set()); 5034 } 5035 } 5036 return budget; 5037 } 5038 5039 5040 // The first character set of an EndNode is unknowable. Just use the 5041 // default implementation that fails and returns all characters as possible. 5042 5043 5044 int AssertionNode::ComputeFirstCharacterSet(int budget) { 5045 budget -= 1; 5046 if (budget >= 0) { 5047 switch (type_) { 5048 case AT_END: { 5049 set_first_character_set(new ZoneList<CharacterRange>(0)); 5050 break; 5051 } 5052 case AT_START: 5053 case AT_BOUNDARY: 5054 case AT_NON_BOUNDARY: 5055 case AFTER_NEWLINE: 5056 case AFTER_NONWORD_CHARACTER: 5057 case AFTER_WORD_CHARACTER: { 5058 ASSERT_NOT_NULL(on_success()); 5059 budget = on_success()->ComputeFirstCharacterSet(budget); 5060 if (budget >= 0) { 5061 set_first_character_set(on_success()->first_character_set()); 5062 } 5063 break; 5064 } 5065 } 5066 } 5067 return budget; 5068 } 5069 5070 5071 int ActionNode::ComputeFirstCharacterSet(int budget) { 5072 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; 5073 budget--; 5074 if (budget >= 0) { 5075 ASSERT_NOT_NULL(on_success()); 5076 budget = on_success()->ComputeFirstCharacterSet(budget); 5077 if (budget >= 0) { 5078 set_first_character_set(on_success()->first_character_set()); 5079 } 5080 } 5081 return budget; 5082 } 5083 5084 5085 int BackReferenceNode::ComputeFirstCharacterSet(int budget) { 5086 // We don't know anything about the first character of a backreference 5087 // at this point. 5088 // The potential first characters are the first characters of the capture, 5089 // and the first characters of the on_success node, depending on whether the 5090 // capture can be empty and whether it is known to be participating or known 5091 // not to be. 5092 return kComputeFirstCharacterSetFail; 5093 } 5094 5095 5096 int TextNode::ComputeFirstCharacterSet(int budget) { 5097 budget--; 5098 if (budget >= 0) { 5099 ASSERT_NE(0, elements()->length()); 5100 TextElement text = elements()->at(0); 5101 if (text.type == TextElement::ATOM) { 5102 RegExpAtom* atom = text.data.u_atom; 5103 ASSERT_NE(0, atom->length()); 5104 uc16 first_char = atom->data()[0]; 5105 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); 5106 range->Add(CharacterRange(first_char, first_char)); 5107 set_first_character_set(range); 5108 } else { 5109 ASSERT(text.type == TextElement::CHAR_CLASS); 5110 RegExpCharacterClass* char_class = text.data.u_char_class; 5111 ZoneList<CharacterRange>* ranges = char_class->ranges(); 5112 // TODO(lrn): Canonicalize ranges when they are created 5113 // instead of waiting until now. 5114 CharacterRange::Canonicalize(ranges); 5115 if (char_class->is_negated()) { 5116 int length = ranges->length(); 5117 int new_length = length + 1; 5118 if (length > 0) { 5119 if (ranges->at(0).from() == 0) new_length--; 5120 if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) { 5121 new_length--; 5122 } 5123 } 5124 ZoneList<CharacterRange>* negated_ranges = 5125 new ZoneList<CharacterRange>(new_length); 5126 CharacterRange::Negate(ranges, negated_ranges); 5127 set_first_character_set(negated_ranges); 5128 } else { 5129 set_first_character_set(ranges); 5130 } 5131 } 5132 } 5133 return budget; 5134 } 5135 5136 5137 5138 // ------------------------------------------------------------------- 5139 // Dispatch table construction 5140 5141 5142 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5143 AddRange(CharacterRange::Everything()); 5144 } 5145 5146 5147 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5148 node->set_being_calculated(true); 5149 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5150 for (int i = 0; i < alternatives->length(); i++) { 5151 set_choice_index(i); 5152 alternatives->at(i).node()->Accept(this); 5153 } 5154 node->set_being_calculated(false); 5155 } 5156 5157 5158 class AddDispatchRange { 5159 public: 5160 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5161 : constructor_(constructor) { } 5162 void Call(uc32 from, DispatchTable::Entry entry); 5163 private: 5164 DispatchTableConstructor* constructor_; 5165 }; 5166 5167 5168 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5169 CharacterRange range(from, entry.to()); 5170 constructor_->AddRange(range); 5171 } 5172 5173 5174 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5175 if (node->being_calculated()) 5176 return; 5177 DispatchTable* table = node->GetTable(ignore_case_); 5178 AddDispatchRange adder(this); 5179 table->ForEach(&adder); 5180 } 5181 5182 5183 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5184 // TODO(160): Find the node that we refer back to and propagate its start 5185 // set back to here. For now we just accept anything. 5186 AddRange(CharacterRange::Everything()); 5187 } 5188 5189 5190 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5191 RegExpNode* target = that->on_success(); 5192 target->Accept(this); 5193 } 5194 5195 5196 static int CompareRangeByFrom(const CharacterRange* a, 5197 const CharacterRange* b) { 5198 return Compare<uc16>(a->from(), b->from()); 5199 } 5200 5201 5202 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5203 ranges->Sort(CompareRangeByFrom); 5204 uc16 last = 0; 5205 for (int i = 0; i < ranges->length(); i++) { 5206 CharacterRange range = ranges->at(i); 5207 if (last < range.from()) 5208 AddRange(CharacterRange(last, range.from() - 1)); 5209 if (range.to() >= last) { 5210 if (range.to() == String::kMaxUtf16CodeUnit) { 5211 return; 5212 } else { 5213 last = range.to() + 1; 5214 } 5215 } 5216 } 5217 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit)); 5218 } 5219 5220 5221 void DispatchTableConstructor::VisitText(TextNode* that) { 5222 TextElement elm = that->elements()->at(0); 5223 switch (elm.type) { 5224 case TextElement::ATOM: { 5225 uc16 c = elm.data.u_atom->data()[0]; 5226 AddRange(CharacterRange(c, c)); 5227 break; 5228 } 5229 case TextElement::CHAR_CLASS: { 5230 RegExpCharacterClass* tree = elm.data.u_char_class; 5231 ZoneList<CharacterRange>* ranges = tree->ranges(); 5232 if (tree->is_negated()) { 5233 AddInverse(ranges); 5234 } else { 5235 for (int i = 0; i < ranges->length(); i++) 5236 AddRange(ranges->at(i)); 5237 } 5238 break; 5239 } 5240 default: { 5241 UNIMPLEMENTED(); 5242 } 5243 } 5244 } 5245 5246 5247 void DispatchTableConstructor::VisitAction(ActionNode* that) { 5248 RegExpNode* target = that->on_success(); 5249 target->Accept(this); 5250 } 5251 5252 5253 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, 5254 bool ignore_case, 5255 bool is_multiline, 5256 Handle<String> pattern, 5257 bool is_ascii) { 5258 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 5259 return IrregexpRegExpTooBig(); 5260 } 5261 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); 5262 // Wrap the body of the regexp in capture #0. 5263 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 5264 0, 5265 &compiler, 5266 compiler.accept()); 5267 RegExpNode* node = captured_body; 5268 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 5269 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 5270 int max_length = data->tree->max_match(); 5271 if (!is_start_anchored) { 5272 // Add a .*? at the beginning, outside the body capture, unless 5273 // this expression is anchored at the beginning. 5274 RegExpNode* loop_node = 5275 RegExpQuantifier::ToNode(0, 5276 RegExpTree::kInfinity, 5277 false, 5278 new RegExpCharacterClass('*'), 5279 &compiler, 5280 captured_body, 5281 data->contains_anchor); 5282 5283 if (data->contains_anchor) { 5284 // Unroll loop once, to take care of the case that might start 5285 // at the start of input. 5286 ChoiceNode* first_step_node = new ChoiceNode(2); 5287 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 5288 first_step_node->AddAlternative(GuardedAlternative( 5289 new TextNode(new RegExpCharacterClass('*'), loop_node))); 5290 node = first_step_node; 5291 } else { 5292 node = loop_node; 5293 } 5294 } 5295 data->node = node; 5296 Analysis analysis(ignore_case, is_ascii); 5297 analysis.EnsureAnalyzed(node); 5298 if (analysis.has_failed()) { 5299 const char* error_message = analysis.error_message(); 5300 return CompilationResult(error_message); 5301 } 5302 5303 // Create the correct assembler for the architecture. 5304 #ifndef V8_INTERPRETED_REGEXP 5305 // Native regexp implementation. 5306 5307 NativeRegExpMacroAssembler::Mode mode = 5308 is_ascii ? NativeRegExpMacroAssembler::ASCII 5309 : NativeRegExpMacroAssembler::UC16; 5310 5311 #if V8_TARGET_ARCH_IA32 5312 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2); 5313 #elif V8_TARGET_ARCH_X64 5314 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2); 5315 #elif V8_TARGET_ARCH_ARM 5316 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2); 5317 #elif V8_TARGET_ARCH_MIPS 5318 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2); 5319 #endif 5320 5321 #else // V8_INTERPRETED_REGEXP 5322 // Interpreted regexp implementation. 5323 EmbeddedVector<byte, 1024> codes; 5324 RegExpMacroAssemblerIrregexp macro_assembler(codes); 5325 #endif // V8_INTERPRETED_REGEXP 5326 5327 // Inserted here, instead of in Assembler, because it depends on information 5328 // in the AST that isn't replicated in the Node structure. 5329 static const int kMaxBacksearchLimit = 1024; 5330 if (is_end_anchored && 5331 !is_start_anchored && 5332 max_length < kMaxBacksearchLimit) { 5333 macro_assembler.SetCurrentPositionFromEnd(max_length); 5334 } 5335 5336 return compiler.Assemble(¯o_assembler, 5337 node, 5338 data->capture_count, 5339 pattern); 5340 } 5341 5342 5343 }} // namespace v8::internal 5344