1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "compiler.h" 32 #include "execution.h" 33 #include "factory.h" 34 #include "jsregexp.h" 35 #include "platform.h" 36 #include "string-search.h" 37 #include "runtime.h" 38 #include "compilation-cache.h" 39 #include "string-stream.h" 40 #include "parser.h" 41 #include "regexp-macro-assembler.h" 42 #include "regexp-macro-assembler-tracer.h" 43 #include "regexp-macro-assembler-irregexp.h" 44 #include "regexp-stack.h" 45 46 #ifndef V8_INTERPRETED_REGEXP 47 #if V8_TARGET_ARCH_IA32 48 #include "ia32/regexp-macro-assembler-ia32.h" 49 #elif V8_TARGET_ARCH_X64 50 #include "x64/regexp-macro-assembler-x64.h" 51 #elif V8_TARGET_ARCH_ARM 52 #include "arm/regexp-macro-assembler-arm.h" 53 #elif V8_TARGET_ARCH_MIPS 54 #include "mips/regexp-macro-assembler-mips.h" 55 #else 56 #error Unsupported target architecture. 57 #endif 58 #endif 59 60 #include "interpreter-irregexp.h" 61 62 63 namespace v8 { 64 namespace internal { 65 66 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor, 67 Handle<String> pattern, 68 Handle<String> flags, 69 bool* has_pending_exception) { 70 // Call the construct code with 2 arguments. 71 Object** argv[2] = { Handle<Object>::cast(pattern).location(), 72 Handle<Object>::cast(flags).location() }; 73 return Execution::New(constructor, 2, argv, has_pending_exception); 74 } 75 76 77 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 78 int flags = JSRegExp::NONE; 79 for (int i = 0; i < str->length(); i++) { 80 switch (str->Get(i)) { 81 case 'i': 82 flags |= JSRegExp::IGNORE_CASE; 83 break; 84 case 'g': 85 flags |= JSRegExp::GLOBAL; 86 break; 87 case 'm': 88 flags |= JSRegExp::MULTILINE; 89 break; 90 } 91 } 92 return JSRegExp::Flags(flags); 93 } 94 95 96 static inline void ThrowRegExpException(Handle<JSRegExp> re, 97 Handle<String> pattern, 98 Handle<String> error_text, 99 const char* message) { 100 Isolate* isolate = re->GetIsolate(); 101 Factory* factory = isolate->factory(); 102 Handle<FixedArray> elements = factory->NewFixedArray(2); 103 elements->set(0, *pattern); 104 elements->set(1, *error_text); 105 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 106 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 107 isolate->Throw(*regexp_err); 108 } 109 110 111 // Generic RegExp methods. Dispatches to implementation specific methods. 112 113 114 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 115 Handle<String> pattern, 116 Handle<String> flag_str) { 117 Isolate* isolate = re->GetIsolate(); 118 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 119 CompilationCache* compilation_cache = isolate->compilation_cache(); 120 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags); 121 bool in_cache = !cached.is_null(); 122 LOG(isolate, RegExpCompileEvent(re, in_cache)); 123 124 Handle<Object> result; 125 if (in_cache) { 126 re->set_data(*cached); 127 return re; 128 } 129 pattern = FlattenGetString(pattern); 130 CompilationZoneScope zone_scope(DELETE_ON_EXIT); 131 PostponeInterruptsScope postpone(isolate); 132 RegExpCompileData parse_result; 133 FlatStringReader reader(isolate, pattern); 134 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 135 &parse_result)) { 136 // Throw an exception if we fail to parse the pattern. 137 ThrowRegExpException(re, 138 pattern, 139 parse_result.error, 140 "malformed_regexp"); 141 return Handle<Object>::null(); 142 } 143 144 if (parse_result.simple && !flags.is_ignore_case()) { 145 // Parse-tree is a single atom that is equal to the pattern. 146 AtomCompile(re, pattern, flags, pattern); 147 } else if (parse_result.tree->IsAtom() && 148 !flags.is_ignore_case() && 149 parse_result.capture_count == 0) { 150 RegExpAtom* atom = parse_result.tree->AsAtom(); 151 Vector<const uc16> atom_pattern = atom->data(); 152 Handle<String> atom_string = 153 isolate->factory()->NewStringFromTwoByte(atom_pattern); 154 AtomCompile(re, pattern, flags, atom_string); 155 } else { 156 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 157 } 158 ASSERT(re->data()->IsFixedArray()); 159 // Compilation succeeded so the data is set on the regexp 160 // and we can store it in the cache. 161 Handle<FixedArray> data(FixedArray::cast(re->data())); 162 compilation_cache->PutRegExp(pattern, flags, data); 163 164 return re; 165 } 166 167 168 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 169 Handle<String> subject, 170 int index, 171 Handle<JSArray> last_match_info) { 172 switch (regexp->TypeTag()) { 173 case JSRegExp::ATOM: 174 return AtomExec(regexp, subject, index, last_match_info); 175 case JSRegExp::IRREGEXP: { 176 Handle<Object> result = 177 IrregexpExec(regexp, subject, index, last_match_info); 178 ASSERT(!result.is_null() || Isolate::Current()->has_pending_exception()); 179 return result; 180 } 181 default: 182 UNREACHABLE(); 183 return Handle<Object>::null(); 184 } 185 } 186 187 188 // RegExp Atom implementation: Simple string search using indexOf. 189 190 191 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 192 Handle<String> pattern, 193 JSRegExp::Flags flags, 194 Handle<String> match_pattern) { 195 re->GetIsolate()->factory()->SetRegExpAtomData(re, 196 JSRegExp::ATOM, 197 pattern, 198 flags, 199 match_pattern); 200 } 201 202 203 static void SetAtomLastCapture(FixedArray* array, 204 String* subject, 205 int from, 206 int to) { 207 NoHandleAllocation no_handles; 208 RegExpImpl::SetLastCaptureCount(array, 2); 209 RegExpImpl::SetLastSubject(array, subject); 210 RegExpImpl::SetLastInput(array, subject); 211 RegExpImpl::SetCapture(array, 0, from); 212 RegExpImpl::SetCapture(array, 1, to); 213 } 214 215 /* template <typename SubjectChar>, typename PatternChar> 216 static int ReStringMatch(Vector<const SubjectChar> sub_vector, 217 Vector<const PatternChar> pat_vector, 218 int start_index) { 219 220 int pattern_length = pat_vector.length(); 221 if (pattern_length == 0) return start_index; 222 223 int subject_length = sub_vector.length(); 224 if (start_index + pattern_length > subject_length) return -1; 225 return SearchString(sub_vector, pat_vector, start_index); 226 } 227 */ 228 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 229 Handle<String> subject, 230 int index, 231 Handle<JSArray> last_match_info) { 232 Isolate* isolate = re->GetIsolate(); 233 234 ASSERT(0 <= index); 235 ASSERT(index <= subject->length()); 236 237 if (!subject->IsFlat()) FlattenString(subject); 238 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid 239 // Extract flattened substrings of cons strings before determining asciiness. 240 String* seq_sub = *subject; 241 if (seq_sub->IsConsString()) seq_sub = ConsString::cast(seq_sub)->first(); 242 243 String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)); 244 int needle_len = needle->length(); 245 246 if (needle_len != 0) { 247 if (index + needle_len > subject->length()) 248 return isolate->factory()->null_value(); 249 250 // dispatch on type of strings 251 index = (needle->IsAsciiRepresentation() 252 ? (seq_sub->IsAsciiRepresentation() 253 ? SearchString(isolate, 254 seq_sub->ToAsciiVector(), 255 needle->ToAsciiVector(), 256 index) 257 : SearchString(isolate, 258 seq_sub->ToUC16Vector(), 259 needle->ToAsciiVector(), 260 index)) 261 : (seq_sub->IsAsciiRepresentation() 262 ? SearchString(isolate, 263 seq_sub->ToAsciiVector(), 264 needle->ToUC16Vector(), 265 index) 266 : SearchString(isolate, 267 seq_sub->ToUC16Vector(), 268 needle->ToUC16Vector(), 269 index))); 270 if (index == -1) return FACTORY->null_value(); 271 } 272 ASSERT(last_match_info->HasFastElements()); 273 274 { 275 NoHandleAllocation no_handles; 276 FixedArray* array = FixedArray::cast(last_match_info->elements()); 277 SetAtomLastCapture(array, *subject, index, index + needle_len); 278 } 279 return last_match_info; 280 } 281 282 283 // Irregexp implementation. 284 285 // Ensures that the regexp object contains a compiled version of the 286 // source for either ASCII or non-ASCII strings. 287 // If the compiled version doesn't already exist, it is compiled 288 // from the source pattern. 289 // If compilation fails, an exception is thrown and this function 290 // returns false. 291 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { 292 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 293 #ifdef V8_INTERPRETED_REGEXP 294 if (compiled_code->IsByteArray()) return true; 295 #else // V8_INTERPRETED_REGEXP (RegExp native code) 296 if (compiled_code->IsCode()) return true; 297 #endif 298 return CompileIrregexp(re, is_ascii); 299 } 300 301 302 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { 303 // Compile the RegExp. 304 Isolate* isolate = re->GetIsolate(); 305 CompilationZoneScope zone_scope(DELETE_ON_EXIT); 306 PostponeInterruptsScope postpone(isolate); 307 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 308 if (entry->IsJSObject()) { 309 // If it's a JSObject, a previous compilation failed and threw this object. 310 // Re-throw the object without trying again. 311 isolate->Throw(entry); 312 return false; 313 } 314 ASSERT(entry->IsTheHole()); 315 316 JSRegExp::Flags flags = re->GetFlags(); 317 318 Handle<String> pattern(re->Pattern()); 319 if (!pattern->IsFlat()) { 320 FlattenString(pattern); 321 } 322 323 RegExpCompileData compile_data; 324 FlatStringReader reader(isolate, pattern); 325 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 326 &compile_data)) { 327 // Throw an exception if we fail to parse the pattern. 328 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 329 ThrowRegExpException(re, 330 pattern, 331 compile_data.error, 332 "malformed_regexp"); 333 return false; 334 } 335 RegExpEngine::CompilationResult result = 336 RegExpEngine::Compile(&compile_data, 337 flags.is_ignore_case(), 338 flags.is_multiline(), 339 pattern, 340 is_ascii); 341 if (result.error_message != NULL) { 342 // Unable to compile regexp. 343 Factory* factory = isolate->factory(); 344 Handle<FixedArray> elements = factory->NewFixedArray(2); 345 elements->set(0, *pattern); 346 Handle<String> error_message = 347 factory->NewStringFromUtf8(CStrVector(result.error_message)); 348 elements->set(1, *error_message); 349 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 350 Handle<Object> regexp_err = 351 factory->NewSyntaxError("malformed_regexp", array); 352 isolate->Throw(*regexp_err); 353 re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err); 354 return false; 355 } 356 357 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 358 data->set(JSRegExp::code_index(is_ascii), result.code); 359 int register_max = IrregexpMaxRegisterCount(*data); 360 if (result.num_registers > register_max) { 361 SetIrregexpMaxRegisterCount(*data, result.num_registers); 362 } 363 364 return true; 365 } 366 367 368 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 369 return Smi::cast( 370 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 371 } 372 373 374 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 375 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 376 } 377 378 379 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 380 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 381 } 382 383 384 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 385 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 386 } 387 388 389 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 390 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 391 } 392 393 394 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 395 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 396 } 397 398 399 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 400 Handle<String> pattern, 401 JSRegExp::Flags flags, 402 int capture_count) { 403 // Initialize compiled code entries to null. 404 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 405 JSRegExp::IRREGEXP, 406 pattern, 407 flags, 408 capture_count); 409 } 410 411 412 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 413 Handle<String> subject) { 414 if (!subject->IsFlat()) { 415 FlattenString(subject); 416 } 417 // Check the asciiness of the underlying storage. 418 bool is_ascii; 419 { 420 AssertNoAllocation no_gc; 421 String* sequential_string = *subject; 422 if (subject->IsConsString()) { 423 sequential_string = ConsString::cast(*subject)->first(); 424 } 425 is_ascii = sequential_string->IsAsciiRepresentation(); 426 } 427 if (!EnsureCompiledIrregexp(regexp, is_ascii)) { 428 return -1; 429 } 430 #ifdef V8_INTERPRETED_REGEXP 431 // Byte-code regexp needs space allocated for all its registers. 432 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); 433 #else // V8_INTERPRETED_REGEXP 434 // Native regexp only needs room to output captures. Registers are handled 435 // internally. 436 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 437 #endif // V8_INTERPRETED_REGEXP 438 } 439 440 441 RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce( 442 Handle<JSRegExp> regexp, 443 Handle<String> subject, 444 int index, 445 Vector<int> output) { 446 Isolate* isolate = regexp->GetIsolate(); 447 448 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 449 450 ASSERT(index >= 0); 451 ASSERT(index <= subject->length()); 452 ASSERT(subject->IsFlat()); 453 454 // A flat ASCII string might have a two-byte first part. 455 if (subject->IsConsString()) { 456 subject = Handle<String>(ConsString::cast(*subject)->first(), isolate); 457 } 458 459 #ifndef V8_INTERPRETED_REGEXP 460 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 461 do { 462 bool is_ascii = subject->IsAsciiRepresentation(); 463 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 464 NativeRegExpMacroAssembler::Result res = 465 NativeRegExpMacroAssembler::Match(code, 466 subject, 467 output.start(), 468 output.length(), 469 index, 470 isolate); 471 if (res != NativeRegExpMacroAssembler::RETRY) { 472 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 473 isolate->has_pending_exception()); 474 STATIC_ASSERT( 475 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 476 STATIC_ASSERT( 477 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 478 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 479 == RE_EXCEPTION); 480 return static_cast<IrregexpResult>(res); 481 } 482 // If result is RETRY, the string has changed representation, and we 483 // must restart from scratch. 484 // In this case, it means we must make sure we are prepared to handle 485 // the, potentially, different subject (the string can switch between 486 // being internal and external, and even between being ASCII and UC16, 487 // but the characters are always the same). 488 IrregexpPrepare(regexp, subject); 489 } while (true); 490 UNREACHABLE(); 491 return RE_EXCEPTION; 492 #else // V8_INTERPRETED_REGEXP 493 494 ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp)); 495 bool is_ascii = subject->IsAsciiRepresentation(); 496 // We must have done EnsureCompiledIrregexp, so we can get the number of 497 // registers. 498 int* register_vector = output.start(); 499 int number_of_capture_registers = 500 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 501 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 502 register_vector[i] = -1; 503 } 504 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); 505 506 if (IrregexpInterpreter::Match(isolate, 507 byte_codes, 508 subject, 509 register_vector, 510 index)) { 511 return RE_SUCCESS; 512 } 513 return RE_FAILURE; 514 #endif // V8_INTERPRETED_REGEXP 515 } 516 517 518 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp, 519 Handle<String> subject, 520 int previous_index, 521 Handle<JSArray> last_match_info) { 522 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP); 523 524 // Prepare space for the return values. 525 #ifdef V8_INTERPRETED_REGEXP 526 #ifdef DEBUG 527 if (FLAG_trace_regexp_bytecodes) { 528 String* pattern = jsregexp->Pattern(); 529 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); 530 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString())); 531 } 532 #endif 533 #endif 534 int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject); 535 if (required_registers < 0) { 536 // Compiling failed with an exception. 537 ASSERT(Isolate::Current()->has_pending_exception()); 538 return Handle<Object>::null(); 539 } 540 541 OffsetsVector registers(required_registers); 542 543 IrregexpResult res = RegExpImpl::IrregexpExecOnce( 544 jsregexp, subject, previous_index, Vector<int>(registers.vector(), 545 registers.length())); 546 if (res == RE_SUCCESS) { 547 int capture_register_count = 548 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2; 549 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead); 550 AssertNoAllocation no_gc; 551 int* register_vector = registers.vector(); 552 FixedArray* array = FixedArray::cast(last_match_info->elements()); 553 for (int i = 0; i < capture_register_count; i += 2) { 554 SetCapture(array, i, register_vector[i]); 555 SetCapture(array, i + 1, register_vector[i + 1]); 556 } 557 SetLastCaptureCount(array, capture_register_count); 558 SetLastSubject(array, *subject); 559 SetLastInput(array, *subject); 560 return last_match_info; 561 } 562 if (res == RE_EXCEPTION) { 563 ASSERT(Isolate::Current()->has_pending_exception()); 564 return Handle<Object>::null(); 565 } 566 ASSERT(res == RE_FAILURE); 567 return Isolate::Current()->factory()->null_value(); 568 } 569 570 571 // ------------------------------------------------------------------- 572 // Implementation of the Irregexp regular expression engine. 573 // 574 // The Irregexp regular expression engine is intended to be a complete 575 // implementation of ECMAScript regular expressions. It generates either 576 // bytecodes or native code. 577 578 // The Irregexp regexp engine is structured in three steps. 579 // 1) The parser generates an abstract syntax tree. See ast.cc. 580 // 2) From the AST a node network is created. The nodes are all 581 // subclasses of RegExpNode. The nodes represent states when 582 // executing a regular expression. Several optimizations are 583 // performed on the node network. 584 // 3) From the nodes we generate either byte codes or native code 585 // that can actually execute the regular expression (perform 586 // the search). The code generation step is described in more 587 // detail below. 588 589 // Code generation. 590 // 591 // The nodes are divided into four main categories. 592 // * Choice nodes 593 // These represent places where the regular expression can 594 // match in more than one way. For example on entry to an 595 // alternation (foo|bar) or a repetition (*, +, ? or {}). 596 // * Action nodes 597 // These represent places where some action should be 598 // performed. Examples include recording the current position 599 // in the input string to a register (in order to implement 600 // captures) or other actions on register for example in order 601 // to implement the counters needed for {} repetitions. 602 // * Matching nodes 603 // These attempt to match some element part of the input string. 604 // Examples of elements include character classes, plain strings 605 // or back references. 606 // * End nodes 607 // These are used to implement the actions required on finding 608 // a successful match or failing to find a match. 609 // 610 // The code generated (whether as byte codes or native code) maintains 611 // some state as it runs. This consists of the following elements: 612 // 613 // * The capture registers. Used for string captures. 614 // * Other registers. Used for counters etc. 615 // * The current position. 616 // * The stack of backtracking information. Used when a matching node 617 // fails to find a match and needs to try an alternative. 618 // 619 // Conceptual regular expression execution model: 620 // 621 // There is a simple conceptual model of regular expression execution 622 // which will be presented first. The actual code generated is a more 623 // efficient simulation of the simple conceptual model: 624 // 625 // * Choice nodes are implemented as follows: 626 // For each choice except the last { 627 // push current position 628 // push backtrack code location 629 // <generate code to test for choice> 630 // backtrack code location: 631 // pop current position 632 // } 633 // <generate code to test for last choice> 634 // 635 // * Actions nodes are generated as follows 636 // <push affected registers on backtrack stack> 637 // <generate code to perform action> 638 // push backtrack code location 639 // <generate code to test for following nodes> 640 // backtrack code location: 641 // <pop affected registers to restore their state> 642 // <pop backtrack location from stack and go to it> 643 // 644 // * Matching nodes are generated as follows: 645 // if input string matches at current position 646 // update current position 647 // <generate code to test for following nodes> 648 // else 649 // <pop backtrack location from stack and go to it> 650 // 651 // Thus it can be seen that the current position is saved and restored 652 // by the choice nodes, whereas the registers are saved and restored by 653 // by the action nodes that manipulate them. 654 // 655 // The other interesting aspect of this model is that nodes are generated 656 // at the point where they are needed by a recursive call to Emit(). If 657 // the node has already been code generated then the Emit() call will 658 // generate a jump to the previously generated code instead. In order to 659 // limit recursion it is possible for the Emit() function to put the node 660 // on a work list for later generation and instead generate a jump. The 661 // destination of the jump is resolved later when the code is generated. 662 // 663 // Actual regular expression code generation. 664 // 665 // Code generation is actually more complicated than the above. In order 666 // to improve the efficiency of the generated code some optimizations are 667 // performed 668 // 669 // * Choice nodes have 1-character lookahead. 670 // A choice node looks at the following character and eliminates some of 671 // the choices immediately based on that character. This is not yet 672 // implemented. 673 // * Simple greedy loops store reduced backtracking information. 674 // A quantifier like /.*foo/m will greedily match the whole input. It will 675 // then need to backtrack to a point where it can match "foo". The naive 676 // implementation of this would push each character position onto the 677 // backtracking stack, then pop them off one by one. This would use space 678 // proportional to the length of the input string. However since the "." 679 // can only match in one way and always has a constant length (in this case 680 // of 1) it suffices to store the current position on the top of the stack 681 // once. Matching now becomes merely incrementing the current position and 682 // backtracking becomes decrementing the current position and checking the 683 // result against the stored current position. This is faster and saves 684 // space. 685 // * The current state is virtualized. 686 // This is used to defer expensive operations until it is clear that they 687 // are needed and to generate code for a node more than once, allowing 688 // specialized an efficient versions of the code to be created. This is 689 // explained in the section below. 690 // 691 // Execution state virtualization. 692 // 693 // Instead of emitting code, nodes that manipulate the state can record their 694 // manipulation in an object called the Trace. The Trace object can record a 695 // current position offset, an optional backtrack code location on the top of 696 // the virtualized backtrack stack and some register changes. When a node is 697 // to be emitted it can flush the Trace or update it. Flushing the Trace 698 // will emit code to bring the actual state into line with the virtual state. 699 // Avoiding flushing the state can postpone some work (eg updates of capture 700 // registers). Postponing work can save time when executing the regular 701 // expression since it may be found that the work never has to be done as a 702 // failure to match can occur. In addition it is much faster to jump to a 703 // known backtrack code location than it is to pop an unknown backtrack 704 // location from the stack and jump there. 705 // 706 // The virtual state found in the Trace affects code generation. For example 707 // the virtual state contains the difference between the actual current 708 // position and the virtual current position, and matching code needs to use 709 // this offset to attempt a match in the correct location of the input 710 // string. Therefore code generated for a non-trivial trace is specialized 711 // to that trace. The code generator therefore has the ability to generate 712 // code for each node several times. In order to limit the size of the 713 // generated code there is an arbitrary limit on how many specialized sets of 714 // code may be generated for a given node. If the limit is reached, the 715 // trace is flushed and a generic version of the code for a node is emitted. 716 // This is subsequently used for that node. The code emitted for non-generic 717 // trace is not recorded in the node and so it cannot currently be reused in 718 // the event that code generation is requested for an identical trace. 719 720 721 void RegExpTree::AppendToText(RegExpText* text) { 722 UNREACHABLE(); 723 } 724 725 726 void RegExpAtom::AppendToText(RegExpText* text) { 727 text->AddElement(TextElement::Atom(this)); 728 } 729 730 731 void RegExpCharacterClass::AppendToText(RegExpText* text) { 732 text->AddElement(TextElement::CharClass(this)); 733 } 734 735 736 void RegExpText::AppendToText(RegExpText* text) { 737 for (int i = 0; i < elements()->length(); i++) 738 text->AddElement(elements()->at(i)); 739 } 740 741 742 TextElement TextElement::Atom(RegExpAtom* atom) { 743 TextElement result = TextElement(ATOM); 744 result.data.u_atom = atom; 745 return result; 746 } 747 748 749 TextElement TextElement::CharClass( 750 RegExpCharacterClass* char_class) { 751 TextElement result = TextElement(CHAR_CLASS); 752 result.data.u_char_class = char_class; 753 return result; 754 } 755 756 757 int TextElement::length() { 758 if (type == ATOM) { 759 return data.u_atom->length(); 760 } else { 761 ASSERT(type == CHAR_CLASS); 762 return 1; 763 } 764 } 765 766 767 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 768 if (table_ == NULL) { 769 table_ = new DispatchTable(); 770 DispatchTableConstructor cons(table_, ignore_case); 771 cons.BuildTable(this); 772 } 773 return table_; 774 } 775 776 777 class RegExpCompiler { 778 public: 779 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); 780 781 int AllocateRegister() { 782 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 783 reg_exp_too_big_ = true; 784 return next_register_; 785 } 786 return next_register_++; 787 } 788 789 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 790 RegExpNode* start, 791 int capture_count, 792 Handle<String> pattern); 793 794 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 795 796 static const int kImplementationOffset = 0; 797 static const int kNumberOfRegistersOffset = 0; 798 static const int kCodeOffset = 1; 799 800 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 801 EndNode* accept() { return accept_; } 802 803 static const int kMaxRecursion = 100; 804 inline int recursion_depth() { return recursion_depth_; } 805 inline void IncrementRecursionDepth() { recursion_depth_++; } 806 inline void DecrementRecursionDepth() { recursion_depth_--; } 807 808 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 809 810 inline bool ignore_case() { return ignore_case_; } 811 inline bool ascii() { return ascii_; } 812 813 static const int kNoRegister = -1; 814 private: 815 EndNode* accept_; 816 int next_register_; 817 List<RegExpNode*>* work_list_; 818 int recursion_depth_; 819 RegExpMacroAssembler* macro_assembler_; 820 bool ignore_case_; 821 bool ascii_; 822 bool reg_exp_too_big_; 823 }; 824 825 826 class RecursionCheck { 827 public: 828 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 829 compiler->IncrementRecursionDepth(); 830 } 831 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 832 private: 833 RegExpCompiler* compiler_; 834 }; 835 836 837 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 838 return RegExpEngine::CompilationResult("RegExp too big"); 839 } 840 841 842 // Attempts to compile the regexp using an Irregexp code generator. Returns 843 // a fixed array or a null handle depending on whether it succeeded. 844 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) 845 : next_register_(2 * (capture_count + 1)), 846 work_list_(NULL), 847 recursion_depth_(0), 848 ignore_case_(ignore_case), 849 ascii_(ascii), 850 reg_exp_too_big_(false) { 851 accept_ = new EndNode(EndNode::ACCEPT); 852 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 853 } 854 855 856 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 857 RegExpMacroAssembler* macro_assembler, 858 RegExpNode* start, 859 int capture_count, 860 Handle<String> pattern) { 861 Heap* heap = pattern->GetHeap(); 862 863 bool use_slow_safe_regexp_compiler = false; 864 if (heap->total_regexp_code_generated() > 865 RegExpImpl::kRegWxpCompiledLimit && 866 heap->isolate()->memory_allocator()->SizeExecutable() > 867 RegExpImpl::kRegExpExecutableMemoryLimit) { 868 use_slow_safe_regexp_compiler = true; 869 } 870 871 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 872 873 #ifdef DEBUG 874 if (FLAG_trace_regexp_assembler) 875 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 876 else 877 #endif 878 macro_assembler_ = macro_assembler; 879 880 List <RegExpNode*> work_list(0); 881 work_list_ = &work_list; 882 Label fail; 883 macro_assembler_->PushBacktrack(&fail); 884 Trace new_trace; 885 start->Emit(this, &new_trace); 886 macro_assembler_->Bind(&fail); 887 macro_assembler_->Fail(); 888 while (!work_list.is_empty()) { 889 work_list.RemoveLast()->Emit(this, &new_trace); 890 } 891 if (reg_exp_too_big_) return IrregexpRegExpTooBig(); 892 893 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 894 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 895 work_list_ = NULL; 896 #ifdef DEBUG 897 if (FLAG_print_code) { 898 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString()); 899 } 900 if (FLAG_trace_regexp_assembler) { 901 delete macro_assembler_; 902 } 903 #endif 904 return RegExpEngine::CompilationResult(*code, next_register_); 905 } 906 907 908 bool Trace::DeferredAction::Mentions(int that) { 909 if (type() == ActionNode::CLEAR_CAPTURES) { 910 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 911 return range.Contains(that); 912 } else { 913 return reg() == that; 914 } 915 } 916 917 918 bool Trace::mentions_reg(int reg) { 919 for (DeferredAction* action = actions_; 920 action != NULL; 921 action = action->next()) { 922 if (action->Mentions(reg)) 923 return true; 924 } 925 return false; 926 } 927 928 929 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 930 ASSERT_EQ(0, *cp_offset); 931 for (DeferredAction* action = actions_; 932 action != NULL; 933 action = action->next()) { 934 if (action->Mentions(reg)) { 935 if (action->type() == ActionNode::STORE_POSITION) { 936 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 937 return true; 938 } else { 939 return false; 940 } 941 } 942 } 943 return false; 944 } 945 946 947 int Trace::FindAffectedRegisters(OutSet* affected_registers) { 948 int max_register = RegExpCompiler::kNoRegister; 949 for (DeferredAction* action = actions_; 950 action != NULL; 951 action = action->next()) { 952 if (action->type() == ActionNode::CLEAR_CAPTURES) { 953 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 954 for (int i = range.from(); i <= range.to(); i++) 955 affected_registers->Set(i); 956 if (range.to() > max_register) max_register = range.to(); 957 } else { 958 affected_registers->Set(action->reg()); 959 if (action->reg() > max_register) max_register = action->reg(); 960 } 961 } 962 return max_register; 963 } 964 965 966 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 967 int max_register, 968 OutSet& registers_to_pop, 969 OutSet& registers_to_clear) { 970 for (int reg = max_register; reg >= 0; reg--) { 971 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg); 972 else if (registers_to_clear.Get(reg)) { 973 int clear_to = reg; 974 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 975 reg--; 976 } 977 assembler->ClearRegisters(reg, clear_to); 978 } 979 } 980 } 981 982 983 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 984 int max_register, 985 OutSet& affected_registers, 986 OutSet* registers_to_pop, 987 OutSet* registers_to_clear) { 988 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 989 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 990 991 // Count pushes performed to force a stack limit check occasionally. 992 int pushes = 0; 993 994 for (int reg = 0; reg <= max_register; reg++) { 995 if (!affected_registers.Get(reg)) { 996 continue; 997 } 998 999 // The chronologically first deferred action in the trace 1000 // is used to infer the action needed to restore a register 1001 // to its previous state (or not, if it's safe to ignore it). 1002 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1003 DeferredActionUndoType undo_action = IGNORE; 1004 1005 int value = 0; 1006 bool absolute = false; 1007 bool clear = false; 1008 int store_position = -1; 1009 // This is a little tricky because we are scanning the actions in reverse 1010 // historical order (newest first). 1011 for (DeferredAction* action = actions_; 1012 action != NULL; 1013 action = action->next()) { 1014 if (action->Mentions(reg)) { 1015 switch (action->type()) { 1016 case ActionNode::SET_REGISTER: { 1017 Trace::DeferredSetRegister* psr = 1018 static_cast<Trace::DeferredSetRegister*>(action); 1019 if (!absolute) { 1020 value += psr->value(); 1021 absolute = true; 1022 } 1023 // SET_REGISTER is currently only used for newly introduced loop 1024 // counters. They can have a significant previous value if they 1025 // occour in a loop. TODO(lrn): Propagate this information, so 1026 // we can set undo_action to IGNORE if we know there is no value to 1027 // restore. 1028 undo_action = RESTORE; 1029 ASSERT_EQ(store_position, -1); 1030 ASSERT(!clear); 1031 break; 1032 } 1033 case ActionNode::INCREMENT_REGISTER: 1034 if (!absolute) { 1035 value++; 1036 } 1037 ASSERT_EQ(store_position, -1); 1038 ASSERT(!clear); 1039 undo_action = RESTORE; 1040 break; 1041 case ActionNode::STORE_POSITION: { 1042 Trace::DeferredCapture* pc = 1043 static_cast<Trace::DeferredCapture*>(action); 1044 if (!clear && store_position == -1) { 1045 store_position = pc->cp_offset(); 1046 } 1047 1048 // For captures we know that stores and clears alternate. 1049 // Other register, are never cleared, and if the occur 1050 // inside a loop, they might be assigned more than once. 1051 if (reg <= 1) { 1052 // Registers zero and one, aka "capture zero", is 1053 // always set correctly if we succeed. There is no 1054 // need to undo a setting on backtrack, because we 1055 // will set it again or fail. 1056 undo_action = IGNORE; 1057 } else { 1058 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1059 } 1060 ASSERT(!absolute); 1061 ASSERT_EQ(value, 0); 1062 break; 1063 } 1064 case ActionNode::CLEAR_CAPTURES: { 1065 // Since we're scanning in reverse order, if we've already 1066 // set the position we have to ignore historically earlier 1067 // clearing operations. 1068 if (store_position == -1) { 1069 clear = true; 1070 } 1071 undo_action = RESTORE; 1072 ASSERT(!absolute); 1073 ASSERT_EQ(value, 0); 1074 break; 1075 } 1076 default: 1077 UNREACHABLE(); 1078 break; 1079 } 1080 } 1081 } 1082 // Prepare for the undo-action (e.g., push if it's going to be popped). 1083 if (undo_action == RESTORE) { 1084 pushes++; 1085 RegExpMacroAssembler::StackCheckFlag stack_check = 1086 RegExpMacroAssembler::kNoStackLimitCheck; 1087 if (pushes == push_limit) { 1088 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1089 pushes = 0; 1090 } 1091 1092 assembler->PushRegister(reg, stack_check); 1093 registers_to_pop->Set(reg); 1094 } else if (undo_action == CLEAR) { 1095 registers_to_clear->Set(reg); 1096 } 1097 // Perform the chronologically last action (or accumulated increment) 1098 // for the register. 1099 if (store_position != -1) { 1100 assembler->WriteCurrentPositionToRegister(reg, store_position); 1101 } else if (clear) { 1102 assembler->ClearRegisters(reg, reg); 1103 } else if (absolute) { 1104 assembler->SetRegister(reg, value); 1105 } else if (value != 0) { 1106 assembler->AdvanceRegister(reg, value); 1107 } 1108 } 1109 } 1110 1111 1112 // This is called as we come into a loop choice node and some other tricky 1113 // nodes. It normalizes the state of the code generator to ensure we can 1114 // generate generic code. 1115 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1116 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1117 1118 ASSERT(!is_trivial()); 1119 1120 if (actions_ == NULL && backtrack() == NULL) { 1121 // Here we just have some deferred cp advances to fix and we are back to 1122 // a normal situation. We may also have to forget some information gained 1123 // through a quick check that was already performed. 1124 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1125 // Create a new trivial state and generate the node with that. 1126 Trace new_state; 1127 successor->Emit(compiler, &new_state); 1128 return; 1129 } 1130 1131 // Generate deferred actions here along with code to undo them again. 1132 OutSet affected_registers; 1133 1134 if (backtrack() != NULL) { 1135 // Here we have a concrete backtrack location. These are set up by choice 1136 // nodes and so they indicate that we have a deferred save of the current 1137 // position which we may need to emit here. 1138 assembler->PushCurrentPosition(); 1139 } 1140 1141 int max_register = FindAffectedRegisters(&affected_registers); 1142 OutSet registers_to_pop; 1143 OutSet registers_to_clear; 1144 PerformDeferredActions(assembler, 1145 max_register, 1146 affected_registers, 1147 ®isters_to_pop, 1148 ®isters_to_clear); 1149 if (cp_offset_ != 0) { 1150 assembler->AdvanceCurrentPosition(cp_offset_); 1151 } 1152 1153 // Create a new trivial state and generate the node with that. 1154 Label undo; 1155 assembler->PushBacktrack(&undo); 1156 Trace new_state; 1157 successor->Emit(compiler, &new_state); 1158 1159 // On backtrack we need to restore state. 1160 assembler->Bind(&undo); 1161 RestoreAffectedRegisters(assembler, 1162 max_register, 1163 registers_to_pop, 1164 registers_to_clear); 1165 if (backtrack() == NULL) { 1166 assembler->Backtrack(); 1167 } else { 1168 assembler->PopCurrentPosition(); 1169 assembler->GoTo(backtrack()); 1170 } 1171 } 1172 1173 1174 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1175 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1176 1177 // Omit flushing the trace. We discard the entire stack frame anyway. 1178 1179 if (!label()->is_bound()) { 1180 // We are completely independent of the trace, since we ignore it, 1181 // so this code can be used as the generic version. 1182 assembler->Bind(label()); 1183 } 1184 1185 // Throw away everything on the backtrack stack since the start 1186 // of the negative submatch and restore the character position. 1187 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1188 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1189 if (clear_capture_count_ > 0) { 1190 // Clear any captures that might have been performed during the success 1191 // of the body of the negative look-ahead. 1192 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1193 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1194 } 1195 // Now that we have unwound the stack we find at the top of the stack the 1196 // backtrack that the BeginSubmatch node got. 1197 assembler->Backtrack(); 1198 } 1199 1200 1201 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1202 if (!trace->is_trivial()) { 1203 trace->Flush(compiler, this); 1204 return; 1205 } 1206 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1207 if (!label()->is_bound()) { 1208 assembler->Bind(label()); 1209 } 1210 switch (action_) { 1211 case ACCEPT: 1212 assembler->Succeed(); 1213 return; 1214 case BACKTRACK: 1215 assembler->GoTo(trace->backtrack()); 1216 return; 1217 case NEGATIVE_SUBMATCH_SUCCESS: 1218 // This case is handled in a different virtual method. 1219 UNREACHABLE(); 1220 } 1221 UNIMPLEMENTED(); 1222 } 1223 1224 1225 void GuardedAlternative::AddGuard(Guard* guard) { 1226 if (guards_ == NULL) 1227 guards_ = new ZoneList<Guard*>(1); 1228 guards_->Add(guard); 1229 } 1230 1231 1232 ActionNode* ActionNode::SetRegister(int reg, 1233 int val, 1234 RegExpNode* on_success) { 1235 ActionNode* result = new ActionNode(SET_REGISTER, on_success); 1236 result->data_.u_store_register.reg = reg; 1237 result->data_.u_store_register.value = val; 1238 return result; 1239 } 1240 1241 1242 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1243 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); 1244 result->data_.u_increment_register.reg = reg; 1245 return result; 1246 } 1247 1248 1249 ActionNode* ActionNode::StorePosition(int reg, 1250 bool is_capture, 1251 RegExpNode* on_success) { 1252 ActionNode* result = new ActionNode(STORE_POSITION, on_success); 1253 result->data_.u_position_register.reg = reg; 1254 result->data_.u_position_register.is_capture = is_capture; 1255 return result; 1256 } 1257 1258 1259 ActionNode* ActionNode::ClearCaptures(Interval range, 1260 RegExpNode* on_success) { 1261 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success); 1262 result->data_.u_clear_captures.range_from = range.from(); 1263 result->data_.u_clear_captures.range_to = range.to(); 1264 return result; 1265 } 1266 1267 1268 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1269 int position_reg, 1270 RegExpNode* on_success) { 1271 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); 1272 result->data_.u_submatch.stack_pointer_register = stack_reg; 1273 result->data_.u_submatch.current_position_register = position_reg; 1274 return result; 1275 } 1276 1277 1278 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1279 int position_reg, 1280 int clear_register_count, 1281 int clear_register_from, 1282 RegExpNode* on_success) { 1283 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1284 result->data_.u_submatch.stack_pointer_register = stack_reg; 1285 result->data_.u_submatch.current_position_register = position_reg; 1286 result->data_.u_submatch.clear_register_count = clear_register_count; 1287 result->data_.u_submatch.clear_register_from = clear_register_from; 1288 return result; 1289 } 1290 1291 1292 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1293 int repetition_register, 1294 int repetition_limit, 1295 RegExpNode* on_success) { 1296 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success); 1297 result->data_.u_empty_match_check.start_register = start_register; 1298 result->data_.u_empty_match_check.repetition_register = repetition_register; 1299 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1300 return result; 1301 } 1302 1303 1304 #define DEFINE_ACCEPT(Type) \ 1305 void Type##Node::Accept(NodeVisitor* visitor) { \ 1306 visitor->Visit##Type(this); \ 1307 } 1308 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1309 #undef DEFINE_ACCEPT 1310 1311 1312 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1313 visitor->VisitLoopChoice(this); 1314 } 1315 1316 1317 // ------------------------------------------------------------------- 1318 // Emit code. 1319 1320 1321 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1322 Guard* guard, 1323 Trace* trace) { 1324 switch (guard->op()) { 1325 case Guard::LT: 1326 ASSERT(!trace->mentions_reg(guard->reg())); 1327 macro_assembler->IfRegisterGE(guard->reg(), 1328 guard->value(), 1329 trace->backtrack()); 1330 break; 1331 case Guard::GEQ: 1332 ASSERT(!trace->mentions_reg(guard->reg())); 1333 macro_assembler->IfRegisterLT(guard->reg(), 1334 guard->value(), 1335 trace->backtrack()); 1336 break; 1337 } 1338 } 1339 1340 1341 // Returns the number of characters in the equivalence class, omitting those 1342 // that cannot occur in the source string because it is ASCII. 1343 static int GetCaseIndependentLetters(Isolate* isolate, 1344 uc16 character, 1345 bool ascii_subject, 1346 unibrow::uchar* letters) { 1347 int length = 1348 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1349 // Unibrow returns 0 or 1 for characters where case independence is 1350 // trivial. 1351 if (length == 0) { 1352 letters[0] = character; 1353 length = 1; 1354 } 1355 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { 1356 return length; 1357 } 1358 // The standard requires that non-ASCII characters cannot have ASCII 1359 // character codes in their equivalence class. 1360 return 0; 1361 } 1362 1363 1364 static inline bool EmitSimpleCharacter(Isolate* isolate, 1365 RegExpCompiler* compiler, 1366 uc16 c, 1367 Label* on_failure, 1368 int cp_offset, 1369 bool check, 1370 bool preloaded) { 1371 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1372 bool bound_checked = false; 1373 if (!preloaded) { 1374 assembler->LoadCurrentCharacter( 1375 cp_offset, 1376 on_failure, 1377 check); 1378 bound_checked = true; 1379 } 1380 assembler->CheckNotCharacter(c, on_failure); 1381 return bound_checked; 1382 } 1383 1384 1385 // Only emits non-letters (things that don't have case). Only used for case 1386 // independent matches. 1387 static inline bool EmitAtomNonLetter(Isolate* isolate, 1388 RegExpCompiler* compiler, 1389 uc16 c, 1390 Label* on_failure, 1391 int cp_offset, 1392 bool check, 1393 bool preloaded) { 1394 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1395 bool ascii = compiler->ascii(); 1396 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1397 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1398 if (length < 1) { 1399 // This can't match. Must be an ASCII subject and a non-ASCII character. 1400 // We do not need to do anything since the ASCII pass already handled this. 1401 return false; // Bounds not checked. 1402 } 1403 bool checked = false; 1404 // We handle the length > 1 case in a later pass. 1405 if (length == 1) { 1406 if (ascii && c > String::kMaxAsciiCharCodeU) { 1407 // Can't match - see above. 1408 return false; // Bounds not checked. 1409 } 1410 if (!preloaded) { 1411 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1412 checked = check; 1413 } 1414 macro_assembler->CheckNotCharacter(c, on_failure); 1415 } 1416 return checked; 1417 } 1418 1419 1420 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1421 bool ascii, 1422 uc16 c1, 1423 uc16 c2, 1424 Label* on_failure) { 1425 uc16 char_mask; 1426 if (ascii) { 1427 char_mask = String::kMaxAsciiCharCode; 1428 } else { 1429 char_mask = String::kMaxUC16CharCode; 1430 } 1431 uc16 exor = c1 ^ c2; 1432 // Check whether exor has only one bit set. 1433 if (((exor - 1) & exor) == 0) { 1434 // If c1 and c2 differ only by one bit. 1435 // Ecma262UnCanonicalize always gives the highest number last. 1436 ASSERT(c2 > c1); 1437 uc16 mask = char_mask ^ exor; 1438 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1439 return true; 1440 } 1441 ASSERT(c2 > c1); 1442 uc16 diff = c2 - c1; 1443 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1444 // If the characters differ by 2^n but don't differ by one bit then 1445 // subtract the difference from the found character, then do the or 1446 // trick. We avoid the theoretical case where negative numbers are 1447 // involved in order to simplify code generation. 1448 uc16 mask = char_mask ^ diff; 1449 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1450 diff, 1451 mask, 1452 on_failure); 1453 return true; 1454 } 1455 return false; 1456 } 1457 1458 1459 typedef bool EmitCharacterFunction(Isolate* isolate, 1460 RegExpCompiler* compiler, 1461 uc16 c, 1462 Label* on_failure, 1463 int cp_offset, 1464 bool check, 1465 bool preloaded); 1466 1467 // Only emits letters (things that have case). Only used for case independent 1468 // matches. 1469 static inline bool EmitAtomLetter(Isolate* isolate, 1470 RegExpCompiler* compiler, 1471 uc16 c, 1472 Label* on_failure, 1473 int cp_offset, 1474 bool check, 1475 bool preloaded) { 1476 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1477 bool ascii = compiler->ascii(); 1478 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1479 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1480 if (length <= 1) return false; 1481 // We may not need to check against the end of the input string 1482 // if this character lies before a character that matched. 1483 if (!preloaded) { 1484 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1485 } 1486 Label ok; 1487 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1488 switch (length) { 1489 case 2: { 1490 if (ShortCutEmitCharacterPair(macro_assembler, 1491 ascii, 1492 chars[0], 1493 chars[1], 1494 on_failure)) { 1495 } else { 1496 macro_assembler->CheckCharacter(chars[0], &ok); 1497 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1498 macro_assembler->Bind(&ok); 1499 } 1500 break; 1501 } 1502 case 4: 1503 macro_assembler->CheckCharacter(chars[3], &ok); 1504 // Fall through! 1505 case 3: 1506 macro_assembler->CheckCharacter(chars[0], &ok); 1507 macro_assembler->CheckCharacter(chars[1], &ok); 1508 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1509 macro_assembler->Bind(&ok); 1510 break; 1511 default: 1512 UNREACHABLE(); 1513 break; 1514 } 1515 return true; 1516 } 1517 1518 1519 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1520 RegExpCharacterClass* cc, 1521 bool ascii, 1522 Label* on_failure, 1523 int cp_offset, 1524 bool check_offset, 1525 bool preloaded) { 1526 ZoneList<CharacterRange>* ranges = cc->ranges(); 1527 int max_char; 1528 if (ascii) { 1529 max_char = String::kMaxAsciiCharCode; 1530 } else { 1531 max_char = String::kMaxUC16CharCode; 1532 } 1533 1534 Label success; 1535 1536 Label* char_is_in_class = 1537 cc->is_negated() ? on_failure : &success; 1538 1539 int range_count = ranges->length(); 1540 1541 int last_valid_range = range_count - 1; 1542 while (last_valid_range >= 0) { 1543 CharacterRange& range = ranges->at(last_valid_range); 1544 if (range.from() <= max_char) { 1545 break; 1546 } 1547 last_valid_range--; 1548 } 1549 1550 if (last_valid_range < 0) { 1551 if (!cc->is_negated()) { 1552 // TODO(plesner): We can remove this when the node level does our 1553 // ASCII optimizations for us. 1554 macro_assembler->GoTo(on_failure); 1555 } 1556 if (check_offset) { 1557 macro_assembler->CheckPosition(cp_offset, on_failure); 1558 } 1559 return; 1560 } 1561 1562 if (last_valid_range == 0 && 1563 !cc->is_negated() && 1564 ranges->at(0).IsEverything(max_char)) { 1565 // This is a common case hit by non-anchored expressions. 1566 if (check_offset) { 1567 macro_assembler->CheckPosition(cp_offset, on_failure); 1568 } 1569 return; 1570 } 1571 1572 if (!preloaded) { 1573 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1574 } 1575 1576 if (cc->is_standard() && 1577 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1578 on_failure)) { 1579 return; 1580 } 1581 1582 for (int i = 0; i < last_valid_range; i++) { 1583 CharacterRange& range = ranges->at(i); 1584 Label next_range; 1585 uc16 from = range.from(); 1586 uc16 to = range.to(); 1587 if (from > max_char) { 1588 continue; 1589 } 1590 if (to > max_char) to = max_char; 1591 if (to == from) { 1592 macro_assembler->CheckCharacter(to, char_is_in_class); 1593 } else { 1594 if (from != 0) { 1595 macro_assembler->CheckCharacterLT(from, &next_range); 1596 } 1597 if (to != max_char) { 1598 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); 1599 } else { 1600 macro_assembler->GoTo(char_is_in_class); 1601 } 1602 } 1603 macro_assembler->Bind(&next_range); 1604 } 1605 1606 CharacterRange& range = ranges->at(last_valid_range); 1607 uc16 from = range.from(); 1608 uc16 to = range.to(); 1609 1610 if (to > max_char) to = max_char; 1611 ASSERT(to >= from); 1612 1613 if (to == from) { 1614 if (cc->is_negated()) { 1615 macro_assembler->CheckCharacter(to, on_failure); 1616 } else { 1617 macro_assembler->CheckNotCharacter(to, on_failure); 1618 } 1619 } else { 1620 if (from != 0) { 1621 if (cc->is_negated()) { 1622 macro_assembler->CheckCharacterLT(from, &success); 1623 } else { 1624 macro_assembler->CheckCharacterLT(from, on_failure); 1625 } 1626 } 1627 if (to != String::kMaxUC16CharCode) { 1628 if (cc->is_negated()) { 1629 macro_assembler->CheckCharacterLT(to + 1, on_failure); 1630 } else { 1631 macro_assembler->CheckCharacterGT(to, on_failure); 1632 } 1633 } else { 1634 if (cc->is_negated()) { 1635 macro_assembler->GoTo(on_failure); 1636 } 1637 } 1638 } 1639 macro_assembler->Bind(&success); 1640 } 1641 1642 1643 RegExpNode::~RegExpNode() { 1644 } 1645 1646 1647 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1648 Trace* trace) { 1649 // If we are generating a greedy loop then don't stop and don't reuse code. 1650 if (trace->stop_node() != NULL) { 1651 return CONTINUE; 1652 } 1653 1654 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1655 if (trace->is_trivial()) { 1656 if (label_.is_bound()) { 1657 // We are being asked to generate a generic version, but that's already 1658 // been done so just go to it. 1659 macro_assembler->GoTo(&label_); 1660 return DONE; 1661 } 1662 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 1663 // To avoid too deep recursion we push the node to the work queue and just 1664 // generate a goto here. 1665 compiler->AddWork(this); 1666 macro_assembler->GoTo(&label_); 1667 return DONE; 1668 } 1669 // Generate generic version of the node and bind the label for later use. 1670 macro_assembler->Bind(&label_); 1671 return CONTINUE; 1672 } 1673 1674 // We are being asked to make a non-generic version. Keep track of how many 1675 // non-generic versions we generate so as not to overdo it. 1676 trace_count_++; 1677 if (FLAG_regexp_optimization && 1678 trace_count_ < kMaxCopiesCodeGenerated && 1679 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 1680 return CONTINUE; 1681 } 1682 1683 // If we get here code has been generated for this node too many times or 1684 // recursion is too deep. Time to switch to a generic version. The code for 1685 // generic versions above can handle deep recursion properly. 1686 trace->Flush(compiler, this); 1687 return DONE; 1688 } 1689 1690 1691 int ActionNode::EatsAtLeast(int still_to_find, 1692 int recursion_depth, 1693 bool not_at_start) { 1694 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1695 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 1696 return on_success()->EatsAtLeast(still_to_find, 1697 recursion_depth + 1, 1698 not_at_start); 1699 } 1700 1701 1702 int AssertionNode::EatsAtLeast(int still_to_find, 1703 int recursion_depth, 1704 bool not_at_start) { 1705 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1706 // If we know we are not at the start and we are asked "how many characters 1707 // will you match if you succeed?" then we can answer anything since false 1708 // implies false. So lets just return the max answer (still_to_find) since 1709 // that won't prevent us from preloading a lot of characters for the other 1710 // branches in the node graph. 1711 if (type() == AT_START && not_at_start) return still_to_find; 1712 return on_success()->EatsAtLeast(still_to_find, 1713 recursion_depth + 1, 1714 not_at_start); 1715 } 1716 1717 1718 int BackReferenceNode::EatsAtLeast(int still_to_find, 1719 int recursion_depth, 1720 bool not_at_start) { 1721 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1722 return on_success()->EatsAtLeast(still_to_find, 1723 recursion_depth + 1, 1724 not_at_start); 1725 } 1726 1727 1728 int TextNode::EatsAtLeast(int still_to_find, 1729 int recursion_depth, 1730 bool not_at_start) { 1731 int answer = Length(); 1732 if (answer >= still_to_find) return answer; 1733 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer; 1734 // We are not at start after this node so we set the last argument to 'true'. 1735 return answer + on_success()->EatsAtLeast(still_to_find - answer, 1736 recursion_depth + 1, 1737 true); 1738 } 1739 1740 1741 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 1742 int recursion_depth, 1743 bool not_at_start) { 1744 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1745 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1746 // afterwards. 1747 RegExpNode* node = alternatives_->at(1).node(); 1748 return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start); 1749 } 1750 1751 1752 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 1753 QuickCheckDetails* details, 1754 RegExpCompiler* compiler, 1755 int filled_in, 1756 bool not_at_start) { 1757 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1758 // afterwards. 1759 RegExpNode* node = alternatives_->at(1).node(); 1760 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 1761 } 1762 1763 1764 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 1765 int recursion_depth, 1766 RegExpNode* ignore_this_node, 1767 bool not_at_start) { 1768 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1769 int min = 100; 1770 int choice_count = alternatives_->length(); 1771 for (int i = 0; i < choice_count; i++) { 1772 RegExpNode* node = alternatives_->at(i).node(); 1773 if (node == ignore_this_node) continue; 1774 int node_eats_at_least = node->EatsAtLeast(still_to_find, 1775 recursion_depth + 1, 1776 not_at_start); 1777 if (node_eats_at_least < min) min = node_eats_at_least; 1778 } 1779 return min; 1780 } 1781 1782 1783 int LoopChoiceNode::EatsAtLeast(int still_to_find, 1784 int recursion_depth, 1785 bool not_at_start) { 1786 return EatsAtLeastHelper(still_to_find, 1787 recursion_depth, 1788 loop_node_, 1789 not_at_start); 1790 } 1791 1792 1793 int ChoiceNode::EatsAtLeast(int still_to_find, 1794 int recursion_depth, 1795 bool not_at_start) { 1796 return EatsAtLeastHelper(still_to_find, 1797 recursion_depth, 1798 NULL, 1799 not_at_start); 1800 } 1801 1802 1803 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 1804 static inline uint32_t SmearBitsRight(uint32_t v) { 1805 v |= v >> 1; 1806 v |= v >> 2; 1807 v |= v >> 4; 1808 v |= v >> 8; 1809 v |= v >> 16; 1810 return v; 1811 } 1812 1813 1814 bool QuickCheckDetails::Rationalize(bool asc) { 1815 bool found_useful_op = false; 1816 uint32_t char_mask; 1817 if (asc) { 1818 char_mask = String::kMaxAsciiCharCode; 1819 } else { 1820 char_mask = String::kMaxUC16CharCode; 1821 } 1822 mask_ = 0; 1823 value_ = 0; 1824 int char_shift = 0; 1825 for (int i = 0; i < characters_; i++) { 1826 Position* pos = &positions_[i]; 1827 if ((pos->mask & String::kMaxAsciiCharCode) != 0) { 1828 found_useful_op = true; 1829 } 1830 mask_ |= (pos->mask & char_mask) << char_shift; 1831 value_ |= (pos->value & char_mask) << char_shift; 1832 char_shift += asc ? 8 : 16; 1833 } 1834 return found_useful_op; 1835 } 1836 1837 1838 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 1839 Trace* trace, 1840 bool preload_has_checked_bounds, 1841 Label* on_possible_success, 1842 QuickCheckDetails* details, 1843 bool fall_through_on_failure) { 1844 if (details->characters() == 0) return false; 1845 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE); 1846 if (details->cannot_match()) return false; 1847 if (!details->Rationalize(compiler->ascii())) return false; 1848 ASSERT(details->characters() == 1 || 1849 compiler->macro_assembler()->CanReadUnaligned()); 1850 uint32_t mask = details->mask(); 1851 uint32_t value = details->value(); 1852 1853 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1854 1855 if (trace->characters_preloaded() != details->characters()) { 1856 assembler->LoadCurrentCharacter(trace->cp_offset(), 1857 trace->backtrack(), 1858 !preload_has_checked_bounds, 1859 details->characters()); 1860 } 1861 1862 1863 bool need_mask = true; 1864 1865 if (details->characters() == 1) { 1866 // If number of characters preloaded is 1 then we used a byte or 16 bit 1867 // load so the value is already masked down. 1868 uint32_t char_mask; 1869 if (compiler->ascii()) { 1870 char_mask = String::kMaxAsciiCharCode; 1871 } else { 1872 char_mask = String::kMaxUC16CharCode; 1873 } 1874 if ((mask & char_mask) == char_mask) need_mask = false; 1875 mask &= char_mask; 1876 } else { 1877 // For 2-character preloads in ASCII mode or 1-character preloads in 1878 // TWO_BYTE mode we also use a 16 bit load with zero extend. 1879 if (details->characters() == 2 && compiler->ascii()) { 1880 if ((mask & 0x7f7f) == 0x7f7f) need_mask = false; 1881 } else if (details->characters() == 1 && !compiler->ascii()) { 1882 if ((mask & 0xffff) == 0xffff) need_mask = false; 1883 } else { 1884 if (mask == 0xffffffff) need_mask = false; 1885 } 1886 } 1887 1888 if (fall_through_on_failure) { 1889 if (need_mask) { 1890 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 1891 } else { 1892 assembler->CheckCharacter(value, on_possible_success); 1893 } 1894 } else { 1895 if (need_mask) { 1896 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 1897 } else { 1898 assembler->CheckNotCharacter(value, trace->backtrack()); 1899 } 1900 } 1901 return true; 1902 } 1903 1904 1905 // Here is the meat of GetQuickCheckDetails (see also the comment on the 1906 // super-class in the .h file). 1907 // 1908 // We iterate along the text object, building up for each character a 1909 // mask and value that can be used to test for a quick failure to match. 1910 // The masks and values for the positions will be combined into a single 1911 // machine word for the current character width in order to be used in 1912 // generating a quick check. 1913 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 1914 RegExpCompiler* compiler, 1915 int characters_filled_in, 1916 bool not_at_start) { 1917 Isolate* isolate = Isolate::Current(); 1918 ASSERT(characters_filled_in < details->characters()); 1919 int characters = details->characters(); 1920 int char_mask; 1921 int char_shift; 1922 if (compiler->ascii()) { 1923 char_mask = String::kMaxAsciiCharCode; 1924 char_shift = 8; 1925 } else { 1926 char_mask = String::kMaxUC16CharCode; 1927 char_shift = 16; 1928 } 1929 for (int k = 0; k < elms_->length(); k++) { 1930 TextElement elm = elms_->at(k); 1931 if (elm.type == TextElement::ATOM) { 1932 Vector<const uc16> quarks = elm.data.u_atom->data(); 1933 for (int i = 0; i < characters && i < quarks.length(); i++) { 1934 QuickCheckDetails::Position* pos = 1935 details->positions(characters_filled_in); 1936 uc16 c = quarks[i]; 1937 if (c > char_mask) { 1938 // If we expect a non-ASCII character from an ASCII string, 1939 // there is no way we can match. Not even case independent 1940 // matching can turn an ASCII character into non-ASCII or 1941 // vice versa. 1942 details->set_cannot_match(); 1943 pos->determines_perfectly = false; 1944 return; 1945 } 1946 if (compiler->ignore_case()) { 1947 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1948 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 1949 chars); 1950 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 1951 if (length == 1) { 1952 // This letter has no case equivalents, so it's nice and simple 1953 // and the mask-compare will determine definitely whether we have 1954 // a match at this character position. 1955 pos->mask = char_mask; 1956 pos->value = c; 1957 pos->determines_perfectly = true; 1958 } else { 1959 uint32_t common_bits = char_mask; 1960 uint32_t bits = chars[0]; 1961 for (int j = 1; j < length; j++) { 1962 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 1963 common_bits ^= differing_bits; 1964 bits &= common_bits; 1965 } 1966 // If length is 2 and common bits has only one zero in it then 1967 // our mask and compare instruction will determine definitely 1968 // whether we have a match at this character position. Otherwise 1969 // it can only be an approximate check. 1970 uint32_t one_zero = (common_bits | ~char_mask); 1971 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 1972 pos->determines_perfectly = true; 1973 } 1974 pos->mask = common_bits; 1975 pos->value = bits; 1976 } 1977 } else { 1978 // Don't ignore case. Nice simple case where the mask-compare will 1979 // determine definitely whether we have a match at this character 1980 // position. 1981 pos->mask = char_mask; 1982 pos->value = c; 1983 pos->determines_perfectly = true; 1984 } 1985 characters_filled_in++; 1986 ASSERT(characters_filled_in <= details->characters()); 1987 if (characters_filled_in == details->characters()) { 1988 return; 1989 } 1990 } 1991 } else { 1992 QuickCheckDetails::Position* pos = 1993 details->positions(characters_filled_in); 1994 RegExpCharacterClass* tree = elm.data.u_char_class; 1995 ZoneList<CharacterRange>* ranges = tree->ranges(); 1996 if (tree->is_negated()) { 1997 // A quick check uses multi-character mask and compare. There is no 1998 // useful way to incorporate a negative char class into this scheme 1999 // so we just conservatively create a mask and value that will always 2000 // succeed. 2001 pos->mask = 0; 2002 pos->value = 0; 2003 } else { 2004 int first_range = 0; 2005 while (ranges->at(first_range).from() > char_mask) { 2006 first_range++; 2007 if (first_range == ranges->length()) { 2008 details->set_cannot_match(); 2009 pos->determines_perfectly = false; 2010 return; 2011 } 2012 } 2013 CharacterRange range = ranges->at(first_range); 2014 uc16 from = range.from(); 2015 uc16 to = range.to(); 2016 if (to > char_mask) { 2017 to = char_mask; 2018 } 2019 uint32_t differing_bits = (from ^ to); 2020 // A mask and compare is only perfect if the differing bits form a 2021 // number like 00011111 with one single block of trailing 1s. 2022 if ((differing_bits & (differing_bits + 1)) == 0 && 2023 from + differing_bits == to) { 2024 pos->determines_perfectly = true; 2025 } 2026 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2027 uint32_t bits = (from & common_bits); 2028 for (int i = first_range + 1; i < ranges->length(); i++) { 2029 CharacterRange range = ranges->at(i); 2030 uc16 from = range.from(); 2031 uc16 to = range.to(); 2032 if (from > char_mask) continue; 2033 if (to > char_mask) to = char_mask; 2034 // Here we are combining more ranges into the mask and compare 2035 // value. With each new range the mask becomes more sparse and 2036 // so the chances of a false positive rise. A character class 2037 // with multiple ranges is assumed never to be equivalent to a 2038 // mask and compare operation. 2039 pos->determines_perfectly = false; 2040 uint32_t new_common_bits = (from ^ to); 2041 new_common_bits = ~SmearBitsRight(new_common_bits); 2042 common_bits &= new_common_bits; 2043 bits &= new_common_bits; 2044 uint32_t differing_bits = (from & common_bits) ^ bits; 2045 common_bits ^= differing_bits; 2046 bits &= common_bits; 2047 } 2048 pos->mask = common_bits; 2049 pos->value = bits; 2050 } 2051 characters_filled_in++; 2052 ASSERT(characters_filled_in <= details->characters()); 2053 if (characters_filled_in == details->characters()) { 2054 return; 2055 } 2056 } 2057 } 2058 ASSERT(characters_filled_in != details->characters()); 2059 on_success()-> GetQuickCheckDetails(details, 2060 compiler, 2061 characters_filled_in, 2062 true); 2063 } 2064 2065 2066 void QuickCheckDetails::Clear() { 2067 for (int i = 0; i < characters_; i++) { 2068 positions_[i].mask = 0; 2069 positions_[i].value = 0; 2070 positions_[i].determines_perfectly = false; 2071 } 2072 characters_ = 0; 2073 } 2074 2075 2076 void QuickCheckDetails::Advance(int by, bool ascii) { 2077 ASSERT(by >= 0); 2078 if (by >= characters_) { 2079 Clear(); 2080 return; 2081 } 2082 for (int i = 0; i < characters_ - by; i++) { 2083 positions_[i] = positions_[by + i]; 2084 } 2085 for (int i = characters_ - by; i < characters_; i++) { 2086 positions_[i].mask = 0; 2087 positions_[i].value = 0; 2088 positions_[i].determines_perfectly = false; 2089 } 2090 characters_ -= by; 2091 // We could change mask_ and value_ here but we would never advance unless 2092 // they had already been used in a check and they won't be used again because 2093 // it would gain us nothing. So there's no point. 2094 } 2095 2096 2097 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2098 ASSERT(characters_ == other->characters_); 2099 if (other->cannot_match_) { 2100 return; 2101 } 2102 if (cannot_match_) { 2103 *this = *other; 2104 return; 2105 } 2106 for (int i = from_index; i < characters_; i++) { 2107 QuickCheckDetails::Position* pos = positions(i); 2108 QuickCheckDetails::Position* other_pos = other->positions(i); 2109 if (pos->mask != other_pos->mask || 2110 pos->value != other_pos->value || 2111 !other_pos->determines_perfectly) { 2112 // Our mask-compare operation will be approximate unless we have the 2113 // exact same operation on both sides of the alternation. 2114 pos->determines_perfectly = false; 2115 } 2116 pos->mask &= other_pos->mask; 2117 pos->value &= pos->mask; 2118 other_pos->value &= pos->mask; 2119 uc16 differing_bits = (pos->value ^ other_pos->value); 2120 pos->mask &= ~differing_bits; 2121 pos->value &= pos->mask; 2122 } 2123 } 2124 2125 2126 class VisitMarker { 2127 public: 2128 explicit VisitMarker(NodeInfo* info) : info_(info) { 2129 ASSERT(!info->visited); 2130 info->visited = true; 2131 } 2132 ~VisitMarker() { 2133 info_->visited = false; 2134 } 2135 private: 2136 NodeInfo* info_; 2137 }; 2138 2139 2140 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2141 RegExpCompiler* compiler, 2142 int characters_filled_in, 2143 bool not_at_start) { 2144 if (body_can_be_zero_length_ || info()->visited) return; 2145 VisitMarker marker(info()); 2146 return ChoiceNode::GetQuickCheckDetails(details, 2147 compiler, 2148 characters_filled_in, 2149 not_at_start); 2150 } 2151 2152 2153 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2154 RegExpCompiler* compiler, 2155 int characters_filled_in, 2156 bool not_at_start) { 2157 not_at_start = (not_at_start || not_at_start_); 2158 int choice_count = alternatives_->length(); 2159 ASSERT(choice_count > 0); 2160 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2161 compiler, 2162 characters_filled_in, 2163 not_at_start); 2164 for (int i = 1; i < choice_count; i++) { 2165 QuickCheckDetails new_details(details->characters()); 2166 RegExpNode* node = alternatives_->at(i).node(); 2167 node->GetQuickCheckDetails(&new_details, compiler, 2168 characters_filled_in, 2169 not_at_start); 2170 // Here we merge the quick match details of the two branches. 2171 details->Merge(&new_details, characters_filled_in); 2172 } 2173 } 2174 2175 2176 // Check for [0-9A-Z_a-z]. 2177 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2178 Label* word, 2179 Label* non_word, 2180 bool fall_through_on_word) { 2181 if (assembler->CheckSpecialCharacterClass( 2182 fall_through_on_word ? 'w' : 'W', 2183 fall_through_on_word ? non_word : word)) { 2184 // Optimized implementation available. 2185 return; 2186 } 2187 assembler->CheckCharacterGT('z', non_word); 2188 assembler->CheckCharacterLT('0', non_word); 2189 assembler->CheckCharacterGT('a' - 1, word); 2190 assembler->CheckCharacterLT('9' + 1, word); 2191 assembler->CheckCharacterLT('A', non_word); 2192 assembler->CheckCharacterLT('Z' + 1, word); 2193 if (fall_through_on_word) { 2194 assembler->CheckNotCharacter('_', non_word); 2195 } else { 2196 assembler->CheckCharacter('_', word); 2197 } 2198 } 2199 2200 2201 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2202 // that matches newline or the start of input). 2203 static void EmitHat(RegExpCompiler* compiler, 2204 RegExpNode* on_success, 2205 Trace* trace) { 2206 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2207 // We will be loading the previous character into the current character 2208 // register. 2209 Trace new_trace(*trace); 2210 new_trace.InvalidateCurrentCharacter(); 2211 2212 Label ok; 2213 if (new_trace.cp_offset() == 0) { 2214 // The start of input counts as a newline in this context, so skip to 2215 // ok if we are at the start. 2216 assembler->CheckAtStart(&ok); 2217 } 2218 // We already checked that we are not at the start of input so it must be 2219 // OK to load the previous character. 2220 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 2221 new_trace.backtrack(), 2222 false); 2223 if (!assembler->CheckSpecialCharacterClass('n', 2224 new_trace.backtrack())) { 2225 // Newline means \n, \r, 0x2028 or 0x2029. 2226 if (!compiler->ascii()) { 2227 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2228 } 2229 assembler->CheckCharacter('\n', &ok); 2230 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2231 } 2232 assembler->Bind(&ok); 2233 on_success->Emit(compiler, &new_trace); 2234 } 2235 2236 2237 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) 2238 // when we know whether the next character must be a word character or not. 2239 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, 2240 RegExpCompiler* compiler, 2241 RegExpNode* on_success, 2242 Trace* trace) { 2243 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2244 Label done; 2245 2246 Trace new_trace(*trace); 2247 2248 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); 2249 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); 2250 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; 2251 2252 // Check whether previous character was a word character. 2253 switch (trace->at_start()) { 2254 case Trace::TRUE: 2255 if (expect_word_character) { 2256 assembler->GoTo(on_non_word); 2257 } 2258 break; 2259 case Trace::UNKNOWN: 2260 ASSERT_EQ(0, trace->cp_offset()); 2261 assembler->CheckAtStart(on_non_word); 2262 // Fall through. 2263 case Trace::FALSE: 2264 int prev_char_offset = trace->cp_offset() - 1; 2265 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); 2266 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); 2267 // We may or may not have loaded the previous character. 2268 new_trace.InvalidateCurrentCharacter(); 2269 } 2270 2271 assembler->Bind(&done); 2272 2273 on_success->Emit(compiler, &new_trace); 2274 } 2275 2276 2277 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2278 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2279 RegExpCompiler* compiler, 2280 RegExpNode* on_success, 2281 Trace* trace) { 2282 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2283 Label before_non_word; 2284 Label before_word; 2285 if (trace->characters_preloaded() != 1) { 2286 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2287 } 2288 // Fall through on non-word. 2289 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2290 2291 // We will be loading the previous character into the current character 2292 // register. 2293 Trace new_trace(*trace); 2294 new_trace.InvalidateCurrentCharacter(); 2295 2296 Label ok; 2297 Label* boundary; 2298 Label* not_boundary; 2299 if (type == AssertionNode::AT_BOUNDARY) { 2300 boundary = &ok; 2301 not_boundary = new_trace.backtrack(); 2302 } else { 2303 not_boundary = &ok; 2304 boundary = new_trace.backtrack(); 2305 } 2306 2307 // Next character is not a word character. 2308 assembler->Bind(&before_non_word); 2309 if (new_trace.cp_offset() == 0) { 2310 // The start of input counts as a non-word character, so the question is 2311 // decided if we are at the start. 2312 assembler->CheckAtStart(not_boundary); 2313 } 2314 // We already checked that we are not at the start of input so it must be 2315 // OK to load the previous character. 2316 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2317 &ok, // Unused dummy label in this call. 2318 false); 2319 // Fall through on non-word. 2320 EmitWordCheck(assembler, boundary, not_boundary, false); 2321 assembler->GoTo(not_boundary); 2322 2323 // Next character is a word character. 2324 assembler->Bind(&before_word); 2325 if (new_trace.cp_offset() == 0) { 2326 // The start of input counts as a non-word character, so the question is 2327 // decided if we are at the start. 2328 assembler->CheckAtStart(boundary); 2329 } 2330 // We already checked that we are not at the start of input so it must be 2331 // OK to load the previous character. 2332 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2333 &ok, // Unused dummy label in this call. 2334 false); 2335 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); 2336 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); 2337 2338 assembler->Bind(&ok); 2339 2340 on_success->Emit(compiler, &new_trace); 2341 } 2342 2343 2344 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2345 RegExpCompiler* compiler, 2346 int filled_in, 2347 bool not_at_start) { 2348 if (type_ == AT_START && not_at_start) { 2349 details->set_cannot_match(); 2350 return; 2351 } 2352 return on_success()->GetQuickCheckDetails(details, 2353 compiler, 2354 filled_in, 2355 not_at_start); 2356 } 2357 2358 2359 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2360 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2361 switch (type_) { 2362 case AT_END: { 2363 Label ok; 2364 assembler->CheckPosition(trace->cp_offset(), &ok); 2365 assembler->GoTo(trace->backtrack()); 2366 assembler->Bind(&ok); 2367 break; 2368 } 2369 case AT_START: { 2370 if (trace->at_start() == Trace::FALSE) { 2371 assembler->GoTo(trace->backtrack()); 2372 return; 2373 } 2374 if (trace->at_start() == Trace::UNKNOWN) { 2375 assembler->CheckNotAtStart(trace->backtrack()); 2376 Trace at_start_trace = *trace; 2377 at_start_trace.set_at_start(true); 2378 on_success()->Emit(compiler, &at_start_trace); 2379 return; 2380 } 2381 } 2382 break; 2383 case AFTER_NEWLINE: 2384 EmitHat(compiler, on_success(), trace); 2385 return; 2386 case AT_BOUNDARY: 2387 case AT_NON_BOUNDARY: { 2388 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2389 return; 2390 } 2391 case AFTER_WORD_CHARACTER: 2392 case AFTER_NONWORD_CHARACTER: { 2393 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); 2394 } 2395 } 2396 on_success()->Emit(compiler, trace); 2397 } 2398 2399 2400 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2401 if (quick_check == NULL) return false; 2402 if (offset >= quick_check->characters()) return false; 2403 return quick_check->positions(offset)->determines_perfectly; 2404 } 2405 2406 2407 static void UpdateBoundsCheck(int index, int* checked_up_to) { 2408 if (index > *checked_up_to) { 2409 *checked_up_to = index; 2410 } 2411 } 2412 2413 2414 // We call this repeatedly to generate code for each pass over the text node. 2415 // The passes are in increasing order of difficulty because we hope one 2416 // of the first passes will fail in which case we are saved the work of the 2417 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2418 // we will check the '%' in the first pass, the case independent 'a' in the 2419 // second pass and the character class in the last pass. 2420 // 2421 // The passes are done from right to left, so for example to test for /bar/ 2422 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2423 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2424 // bounds check most of the time. In the example we only need to check for 2425 // end-of-input when loading the putative 'r'. 2426 // 2427 // A slight complication involves the fact that the first character may already 2428 // be fetched into a register by the previous node. In this case we want to 2429 // do the test for that character first. We do this in separate passes. The 2430 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 2431 // pass has been performed then subsequent passes will have true in 2432 // first_element_checked to indicate that that character does not need to be 2433 // checked again. 2434 // 2435 // In addition to all this we are passed a Trace, which can 2436 // contain an AlternativeGeneration object. In this AlternativeGeneration 2437 // object we can see details of any quick check that was already passed in 2438 // order to get to the code we are now generating. The quick check can involve 2439 // loading characters, which means we do not need to recheck the bounds 2440 // up to the limit the quick check already checked. In addition the quick 2441 // check can have involved a mask and compare operation which may simplify 2442 // or obviate the need for further checks at some character positions. 2443 void TextNode::TextEmitPass(RegExpCompiler* compiler, 2444 TextEmitPassType pass, 2445 bool preloaded, 2446 Trace* trace, 2447 bool first_element_checked, 2448 int* checked_up_to) { 2449 Isolate* isolate = Isolate::Current(); 2450 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2451 bool ascii = compiler->ascii(); 2452 Label* backtrack = trace->backtrack(); 2453 QuickCheckDetails* quick_check = trace->quick_check_performed(); 2454 int element_count = elms_->length(); 2455 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2456 TextElement elm = elms_->at(i); 2457 int cp_offset = trace->cp_offset() + elm.cp_offset; 2458 if (elm.type == TextElement::ATOM) { 2459 Vector<const uc16> quarks = elm.data.u_atom->data(); 2460 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2461 if (first_element_checked && i == 0 && j == 0) continue; 2462 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; 2463 EmitCharacterFunction* emit_function = NULL; 2464 switch (pass) { 2465 case NON_ASCII_MATCH: 2466 ASSERT(ascii); 2467 if (quarks[j] > String::kMaxAsciiCharCode) { 2468 assembler->GoTo(backtrack); 2469 return; 2470 } 2471 break; 2472 case NON_LETTER_CHARACTER_MATCH: 2473 emit_function = &EmitAtomNonLetter; 2474 break; 2475 case SIMPLE_CHARACTER_MATCH: 2476 emit_function = &EmitSimpleCharacter; 2477 break; 2478 case CASE_CHARACTER_MATCH: 2479 emit_function = &EmitAtomLetter; 2480 break; 2481 default: 2482 break; 2483 } 2484 if (emit_function != NULL) { 2485 bool bound_checked = emit_function(isolate, 2486 compiler, 2487 quarks[j], 2488 backtrack, 2489 cp_offset + j, 2490 *checked_up_to < cp_offset + j, 2491 preloaded); 2492 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 2493 } 2494 } 2495 } else { 2496 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); 2497 if (pass == CHARACTER_CLASS_MATCH) { 2498 if (first_element_checked && i == 0) continue; 2499 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; 2500 RegExpCharacterClass* cc = elm.data.u_char_class; 2501 EmitCharClass(assembler, 2502 cc, 2503 ascii, 2504 backtrack, 2505 cp_offset, 2506 *checked_up_to < cp_offset, 2507 preloaded); 2508 UpdateBoundsCheck(cp_offset, checked_up_to); 2509 } 2510 } 2511 } 2512 } 2513 2514 2515 int TextNode::Length() { 2516 TextElement elm = elms_->last(); 2517 ASSERT(elm.cp_offset >= 0); 2518 if (elm.type == TextElement::ATOM) { 2519 return elm.cp_offset + elm.data.u_atom->data().length(); 2520 } else { 2521 return elm.cp_offset + 1; 2522 } 2523 } 2524 2525 2526 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 2527 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 2528 if (ignore_case) { 2529 return pass == SIMPLE_CHARACTER_MATCH; 2530 } else { 2531 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 2532 } 2533 } 2534 2535 2536 // This generates the code to match a text node. A text node can contain 2537 // straight character sequences (possibly to be matched in a case-independent 2538 // way) and character classes. For efficiency we do not do this in a single 2539 // pass from left to right. Instead we pass over the text node several times, 2540 // emitting code for some character positions every time. See the comment on 2541 // TextEmitPass for details. 2542 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2543 LimitResult limit_result = LimitVersions(compiler, trace); 2544 if (limit_result == DONE) return; 2545 ASSERT(limit_result == CONTINUE); 2546 2547 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2548 compiler->SetRegExpTooBig(); 2549 return; 2550 } 2551 2552 if (compiler->ascii()) { 2553 int dummy = 0; 2554 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 2555 } 2556 2557 bool first_elt_done = false; 2558 int bound_checked_to = trace->cp_offset() - 1; 2559 bound_checked_to += trace->bound_checked_up_to(); 2560 2561 // If a character is preloaded into the current character register then 2562 // check that now. 2563 if (trace->characters_preloaded() == 1) { 2564 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2565 if (!SkipPass(pass, compiler->ignore_case())) { 2566 TextEmitPass(compiler, 2567 static_cast<TextEmitPassType>(pass), 2568 true, 2569 trace, 2570 false, 2571 &bound_checked_to); 2572 } 2573 } 2574 first_elt_done = true; 2575 } 2576 2577 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2578 if (!SkipPass(pass, compiler->ignore_case())) { 2579 TextEmitPass(compiler, 2580 static_cast<TextEmitPassType>(pass), 2581 false, 2582 trace, 2583 first_elt_done, 2584 &bound_checked_to); 2585 } 2586 } 2587 2588 Trace successor_trace(*trace); 2589 successor_trace.set_at_start(false); 2590 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 2591 RecursionCheck rc(compiler); 2592 on_success()->Emit(compiler, &successor_trace); 2593 } 2594 2595 2596 void Trace::InvalidateCurrentCharacter() { 2597 characters_preloaded_ = 0; 2598 } 2599 2600 2601 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 2602 ASSERT(by > 0); 2603 // We don't have an instruction for shifting the current character register 2604 // down or for using a shifted value for anything so lets just forget that 2605 // we preloaded any characters into it. 2606 characters_preloaded_ = 0; 2607 // Adjust the offsets of the quick check performed information. This 2608 // information is used to find out what we already determined about the 2609 // characters by means of mask and compare. 2610 quick_check_performed_.Advance(by, compiler->ascii()); 2611 cp_offset_ += by; 2612 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2613 compiler->SetRegExpTooBig(); 2614 cp_offset_ = 0; 2615 } 2616 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 2617 } 2618 2619 2620 void TextNode::MakeCaseIndependent(bool is_ascii) { 2621 int element_count = elms_->length(); 2622 for (int i = 0; i < element_count; i++) { 2623 TextElement elm = elms_->at(i); 2624 if (elm.type == TextElement::CHAR_CLASS) { 2625 RegExpCharacterClass* cc = elm.data.u_char_class; 2626 // None of the standard character classses is different in the case 2627 // independent case and it slows us down if we don't know that. 2628 if (cc->is_standard()) continue; 2629 ZoneList<CharacterRange>* ranges = cc->ranges(); 2630 int range_count = ranges->length(); 2631 for (int j = 0; j < range_count; j++) { 2632 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); 2633 } 2634 } 2635 } 2636 } 2637 2638 2639 int TextNode::GreedyLoopTextLength() { 2640 TextElement elm = elms_->at(elms_->length() - 1); 2641 if (elm.type == TextElement::CHAR_CLASS) { 2642 return elm.cp_offset + 1; 2643 } else { 2644 return elm.cp_offset + elm.data.u_atom->data().length(); 2645 } 2646 } 2647 2648 2649 // Finds the fixed match length of a sequence of nodes that goes from 2650 // this alternative and back to this choice node. If there are variable 2651 // length nodes or other complications in the way then return a sentinel 2652 // value indicating that a greedy loop cannot be constructed. 2653 int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) { 2654 int length = 0; 2655 RegExpNode* node = alternative->node(); 2656 // Later we will generate code for all these text nodes using recursion 2657 // so we have to limit the max number. 2658 int recursion_depth = 0; 2659 while (node != this) { 2660 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 2661 return kNodeIsTooComplexForGreedyLoops; 2662 } 2663 int node_length = node->GreedyLoopTextLength(); 2664 if (node_length == kNodeIsTooComplexForGreedyLoops) { 2665 return kNodeIsTooComplexForGreedyLoops; 2666 } 2667 length += node_length; 2668 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 2669 node = seq_node->on_success(); 2670 } 2671 return length; 2672 } 2673 2674 2675 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 2676 ASSERT_EQ(loop_node_, NULL); 2677 AddAlternative(alt); 2678 loop_node_ = alt.node(); 2679 } 2680 2681 2682 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 2683 ASSERT_EQ(continue_node_, NULL); 2684 AddAlternative(alt); 2685 continue_node_ = alt.node(); 2686 } 2687 2688 2689 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2690 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2691 if (trace->stop_node() == this) { 2692 int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); 2693 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 2694 // Update the counter-based backtracking info on the stack. This is an 2695 // optimization for greedy loops (see below). 2696 ASSERT(trace->cp_offset() == text_length); 2697 macro_assembler->AdvanceCurrentPosition(text_length); 2698 macro_assembler->GoTo(trace->loop_label()); 2699 return; 2700 } 2701 ASSERT(trace->stop_node() == NULL); 2702 if (!trace->is_trivial()) { 2703 trace->Flush(compiler, this); 2704 return; 2705 } 2706 ChoiceNode::Emit(compiler, trace); 2707 } 2708 2709 2710 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 2711 bool not_at_start) { 2712 int preload_characters = EatsAtLeast(4, 0, not_at_start); 2713 if (compiler->macro_assembler()->CanReadUnaligned()) { 2714 bool ascii = compiler->ascii(); 2715 if (ascii) { 2716 if (preload_characters > 4) preload_characters = 4; 2717 // We can't preload 3 characters because there is no machine instruction 2718 // to do that. We can't just load 4 because we could be reading 2719 // beyond the end of the string, which could cause a memory fault. 2720 if (preload_characters == 3) preload_characters = 2; 2721 } else { 2722 if (preload_characters > 2) preload_characters = 2; 2723 } 2724 } else { 2725 if (preload_characters > 1) preload_characters = 1; 2726 } 2727 return preload_characters; 2728 } 2729 2730 2731 // This class is used when generating the alternatives in a choice node. It 2732 // records the way the alternative is being code generated. 2733 class AlternativeGeneration: public Malloced { 2734 public: 2735 AlternativeGeneration() 2736 : possible_success(), 2737 expects_preload(false), 2738 after(), 2739 quick_check_details() { } 2740 Label possible_success; 2741 bool expects_preload; 2742 Label after; 2743 QuickCheckDetails quick_check_details; 2744 }; 2745 2746 2747 // Creates a list of AlternativeGenerations. If the list has a reasonable 2748 // size then it is on the stack, otherwise the excess is on the heap. 2749 class AlternativeGenerationList { 2750 public: 2751 explicit AlternativeGenerationList(int count) 2752 : alt_gens_(count) { 2753 for (int i = 0; i < count && i < kAFew; i++) { 2754 alt_gens_.Add(a_few_alt_gens_ + i); 2755 } 2756 for (int i = kAFew; i < count; i++) { 2757 alt_gens_.Add(new AlternativeGeneration()); 2758 } 2759 } 2760 ~AlternativeGenerationList() { 2761 for (int i = kAFew; i < alt_gens_.length(); i++) { 2762 delete alt_gens_[i]; 2763 alt_gens_[i] = NULL; 2764 } 2765 } 2766 2767 AlternativeGeneration* at(int i) { 2768 return alt_gens_[i]; 2769 } 2770 private: 2771 static const int kAFew = 10; 2772 ZoneList<AlternativeGeneration*> alt_gens_; 2773 AlternativeGeneration a_few_alt_gens_[kAFew]; 2774 }; 2775 2776 2777 /* Code generation for choice nodes. 2778 * 2779 * We generate quick checks that do a mask and compare to eliminate a 2780 * choice. If the quick check succeeds then it jumps to the continuation to 2781 * do slow checks and check subsequent nodes. If it fails (the common case) 2782 * it falls through to the next choice. 2783 * 2784 * Here is the desired flow graph. Nodes directly below each other imply 2785 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 2786 * 3 doesn't have a quick check so we have to call the slow check. 2787 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 2788 * regexp continuation is generated directly after the Sn node, up to the 2789 * next GoTo if we decide to reuse some already generated code. Some 2790 * nodes expect preload_characters to be preloaded into the current 2791 * character register. R nodes do this preloading. Vertices are marked 2792 * F for failures and S for success (possible success in the case of quick 2793 * nodes). L, V, < and > are used as arrow heads. 2794 * 2795 * ----------> R 2796 * | 2797 * V 2798 * Q1 -----> S1 2799 * | S / 2800 * F| / 2801 * | F/ 2802 * | / 2803 * | R 2804 * | / 2805 * V L 2806 * Q2 -----> S2 2807 * | S / 2808 * F| / 2809 * | F/ 2810 * | / 2811 * | R 2812 * | / 2813 * V L 2814 * S3 2815 * | 2816 * F| 2817 * | 2818 * R 2819 * | 2820 * backtrack V 2821 * <----------Q4 2822 * \ F | 2823 * \ |S 2824 * \ F V 2825 * \-----S4 2826 * 2827 * For greedy loops we reverse our expectation and expect to match rather 2828 * than fail. Therefore we want the loop code to look like this (U is the 2829 * unwind code that steps back in the greedy loop). The following alternatives 2830 * look the same as above. 2831 * _____ 2832 * / \ 2833 * V | 2834 * ----------> S1 | 2835 * /| | 2836 * / |S | 2837 * F/ \_____/ 2838 * / 2839 * |<----------- 2840 * | \ 2841 * V \ 2842 * Q2 ---> S2 \ 2843 * | S / | 2844 * F| / | 2845 * | F/ | 2846 * | / | 2847 * | R | 2848 * | / | 2849 * F VL | 2850 * <------U | 2851 * back |S | 2852 * \______________/ 2853 */ 2854 2855 2856 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2857 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2858 int choice_count = alternatives_->length(); 2859 #ifdef DEBUG 2860 for (int i = 0; i < choice_count - 1; i++) { 2861 GuardedAlternative alternative = alternatives_->at(i); 2862 ZoneList<Guard*>* guards = alternative.guards(); 2863 int guard_count = (guards == NULL) ? 0 : guards->length(); 2864 for (int j = 0; j < guard_count; j++) { 2865 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 2866 } 2867 } 2868 #endif 2869 2870 LimitResult limit_result = LimitVersions(compiler, trace); 2871 if (limit_result == DONE) return; 2872 ASSERT(limit_result == CONTINUE); 2873 2874 int new_flush_budget = trace->flush_budget() / choice_count; 2875 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 2876 trace->Flush(compiler, this); 2877 return; 2878 } 2879 2880 RecursionCheck rc(compiler); 2881 2882 Trace* current_trace = trace; 2883 2884 int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); 2885 bool greedy_loop = false; 2886 Label greedy_loop_label; 2887 Trace counter_backtrack_trace; 2888 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 2889 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 2890 2891 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 2892 // Here we have special handling for greedy loops containing only text nodes 2893 // and other simple nodes. These are handled by pushing the current 2894 // position on the stack and then incrementing the current position each 2895 // time around the switch. On backtrack we decrement the current position 2896 // and check it against the pushed value. This avoids pushing backtrack 2897 // information for each iteration of the loop, which could take up a lot of 2898 // space. 2899 greedy_loop = true; 2900 ASSERT(trace->stop_node() == NULL); 2901 macro_assembler->PushCurrentPosition(); 2902 current_trace = &counter_backtrack_trace; 2903 Label greedy_match_failed; 2904 Trace greedy_match_trace; 2905 if (not_at_start()) greedy_match_trace.set_at_start(false); 2906 greedy_match_trace.set_backtrack(&greedy_match_failed); 2907 Label loop_label; 2908 macro_assembler->Bind(&loop_label); 2909 greedy_match_trace.set_stop_node(this); 2910 greedy_match_trace.set_loop_label(&loop_label); 2911 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 2912 macro_assembler->Bind(&greedy_match_failed); 2913 } 2914 2915 Label second_choice; // For use in greedy matches. 2916 macro_assembler->Bind(&second_choice); 2917 2918 int first_normal_choice = greedy_loop ? 1 : 0; 2919 2920 int preload_characters = 2921 CalculatePreloadCharacters(compiler, 2922 current_trace->at_start() == Trace::FALSE); 2923 bool preload_is_current = 2924 (current_trace->characters_preloaded() == preload_characters); 2925 bool preload_has_checked_bounds = preload_is_current; 2926 2927 AlternativeGenerationList alt_gens(choice_count); 2928 2929 // For now we just call all choices one after the other. The idea ultimately 2930 // is to use the Dispatch table to try only the relevant ones. 2931 for (int i = first_normal_choice; i < choice_count; i++) { 2932 GuardedAlternative alternative = alternatives_->at(i); 2933 AlternativeGeneration* alt_gen = alt_gens.at(i); 2934 alt_gen->quick_check_details.set_characters(preload_characters); 2935 ZoneList<Guard*>* guards = alternative.guards(); 2936 int guard_count = (guards == NULL) ? 0 : guards->length(); 2937 Trace new_trace(*current_trace); 2938 new_trace.set_characters_preloaded(preload_is_current ? 2939 preload_characters : 2940 0); 2941 if (preload_has_checked_bounds) { 2942 new_trace.set_bound_checked_up_to(preload_characters); 2943 } 2944 new_trace.quick_check_performed()->Clear(); 2945 if (not_at_start_) new_trace.set_at_start(Trace::FALSE); 2946 alt_gen->expects_preload = preload_is_current; 2947 bool generate_full_check_inline = false; 2948 if (FLAG_regexp_optimization && 2949 try_to_emit_quick_check_for_alternative(i) && 2950 alternative.node()->EmitQuickCheck(compiler, 2951 &new_trace, 2952 preload_has_checked_bounds, 2953 &alt_gen->possible_success, 2954 &alt_gen->quick_check_details, 2955 i < choice_count - 1)) { 2956 // Quick check was generated for this choice. 2957 preload_is_current = true; 2958 preload_has_checked_bounds = true; 2959 // On the last choice in the ChoiceNode we generated the quick 2960 // check to fall through on possible success. So now we need to 2961 // generate the full check inline. 2962 if (i == choice_count - 1) { 2963 macro_assembler->Bind(&alt_gen->possible_success); 2964 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 2965 new_trace.set_characters_preloaded(preload_characters); 2966 new_trace.set_bound_checked_up_to(preload_characters); 2967 generate_full_check_inline = true; 2968 } 2969 } else if (alt_gen->quick_check_details.cannot_match()) { 2970 if (i == choice_count - 1 && !greedy_loop) { 2971 macro_assembler->GoTo(trace->backtrack()); 2972 } 2973 continue; 2974 } else { 2975 // No quick check was generated. Put the full code here. 2976 // If this is not the first choice then there could be slow checks from 2977 // previous cases that go here when they fail. There's no reason to 2978 // insist that they preload characters since the slow check we are about 2979 // to generate probably can't use it. 2980 if (i != first_normal_choice) { 2981 alt_gen->expects_preload = false; 2982 new_trace.InvalidateCurrentCharacter(); 2983 } 2984 if (i < choice_count - 1) { 2985 new_trace.set_backtrack(&alt_gen->after); 2986 } 2987 generate_full_check_inline = true; 2988 } 2989 if (generate_full_check_inline) { 2990 if (new_trace.actions() != NULL) { 2991 new_trace.set_flush_budget(new_flush_budget); 2992 } 2993 for (int j = 0; j < guard_count; j++) { 2994 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 2995 } 2996 alternative.node()->Emit(compiler, &new_trace); 2997 preload_is_current = false; 2998 } 2999 macro_assembler->Bind(&alt_gen->after); 3000 } 3001 if (greedy_loop) { 3002 macro_assembler->Bind(&greedy_loop_label); 3003 // If we have unwound to the bottom then backtrack. 3004 macro_assembler->CheckGreedyLoop(trace->backtrack()); 3005 // Otherwise try the second priority at an earlier position. 3006 macro_assembler->AdvanceCurrentPosition(-text_length); 3007 macro_assembler->GoTo(&second_choice); 3008 } 3009 3010 // At this point we need to generate slow checks for the alternatives where 3011 // the quick check was inlined. We can recognize these because the associated 3012 // label was bound. 3013 for (int i = first_normal_choice; i < choice_count - 1; i++) { 3014 AlternativeGeneration* alt_gen = alt_gens.at(i); 3015 Trace new_trace(*current_trace); 3016 // If there are actions to be flushed we have to limit how many times 3017 // they are flushed. Take the budget of the parent trace and distribute 3018 // it fairly amongst the children. 3019 if (new_trace.actions() != NULL) { 3020 new_trace.set_flush_budget(new_flush_budget); 3021 } 3022 EmitOutOfLineContinuation(compiler, 3023 &new_trace, 3024 alternatives_->at(i), 3025 alt_gen, 3026 preload_characters, 3027 alt_gens.at(i + 1)->expects_preload); 3028 } 3029 } 3030 3031 3032 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 3033 Trace* trace, 3034 GuardedAlternative alternative, 3035 AlternativeGeneration* alt_gen, 3036 int preload_characters, 3037 bool next_expects_preload) { 3038 if (!alt_gen->possible_success.is_linked()) return; 3039 3040 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3041 macro_assembler->Bind(&alt_gen->possible_success); 3042 Trace out_of_line_trace(*trace); 3043 out_of_line_trace.set_characters_preloaded(preload_characters); 3044 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 3045 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE); 3046 ZoneList<Guard*>* guards = alternative.guards(); 3047 int guard_count = (guards == NULL) ? 0 : guards->length(); 3048 if (next_expects_preload) { 3049 Label reload_current_char; 3050 out_of_line_trace.set_backtrack(&reload_current_char); 3051 for (int j = 0; j < guard_count; j++) { 3052 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3053 } 3054 alternative.node()->Emit(compiler, &out_of_line_trace); 3055 macro_assembler->Bind(&reload_current_char); 3056 // Reload the current character, since the next quick check expects that. 3057 // We don't need to check bounds here because we only get into this 3058 // code through a quick check which already did the checked load. 3059 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 3060 NULL, 3061 false, 3062 preload_characters); 3063 macro_assembler->GoTo(&(alt_gen->after)); 3064 } else { 3065 out_of_line_trace.set_backtrack(&(alt_gen->after)); 3066 for (int j = 0; j < guard_count; j++) { 3067 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 3068 } 3069 alternative.node()->Emit(compiler, &out_of_line_trace); 3070 } 3071 } 3072 3073 3074 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3075 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3076 LimitResult limit_result = LimitVersions(compiler, trace); 3077 if (limit_result == DONE) return; 3078 ASSERT(limit_result == CONTINUE); 3079 3080 RecursionCheck rc(compiler); 3081 3082 switch (type_) { 3083 case STORE_POSITION: { 3084 Trace::DeferredCapture 3085 new_capture(data_.u_position_register.reg, 3086 data_.u_position_register.is_capture, 3087 trace); 3088 Trace new_trace = *trace; 3089 new_trace.add_action(&new_capture); 3090 on_success()->Emit(compiler, &new_trace); 3091 break; 3092 } 3093 case INCREMENT_REGISTER: { 3094 Trace::DeferredIncrementRegister 3095 new_increment(data_.u_increment_register.reg); 3096 Trace new_trace = *trace; 3097 new_trace.add_action(&new_increment); 3098 on_success()->Emit(compiler, &new_trace); 3099 break; 3100 } 3101 case SET_REGISTER: { 3102 Trace::DeferredSetRegister 3103 new_set(data_.u_store_register.reg, data_.u_store_register.value); 3104 Trace new_trace = *trace; 3105 new_trace.add_action(&new_set); 3106 on_success()->Emit(compiler, &new_trace); 3107 break; 3108 } 3109 case CLEAR_CAPTURES: { 3110 Trace::DeferredClearCaptures 3111 new_capture(Interval(data_.u_clear_captures.range_from, 3112 data_.u_clear_captures.range_to)); 3113 Trace new_trace = *trace; 3114 new_trace.add_action(&new_capture); 3115 on_success()->Emit(compiler, &new_trace); 3116 break; 3117 } 3118 case BEGIN_SUBMATCH: 3119 if (!trace->is_trivial()) { 3120 trace->Flush(compiler, this); 3121 } else { 3122 assembler->WriteCurrentPositionToRegister( 3123 data_.u_submatch.current_position_register, 0); 3124 assembler->WriteStackPointerToRegister( 3125 data_.u_submatch.stack_pointer_register); 3126 on_success()->Emit(compiler, trace); 3127 } 3128 break; 3129 case EMPTY_MATCH_CHECK: { 3130 int start_pos_reg = data_.u_empty_match_check.start_register; 3131 int stored_pos = 0; 3132 int rep_reg = data_.u_empty_match_check.repetition_register; 3133 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 3134 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 3135 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 3136 // If we know we haven't advanced and there is no minimum we 3137 // can just backtrack immediately. 3138 assembler->GoTo(trace->backtrack()); 3139 } else if (know_dist && stored_pos < trace->cp_offset()) { 3140 // If we know we've advanced we can generate the continuation 3141 // immediately. 3142 on_success()->Emit(compiler, trace); 3143 } else if (!trace->is_trivial()) { 3144 trace->Flush(compiler, this); 3145 } else { 3146 Label skip_empty_check; 3147 // If we have a minimum number of repetitions we check the current 3148 // number first and skip the empty check if it's not enough. 3149 if (has_minimum) { 3150 int limit = data_.u_empty_match_check.repetition_limit; 3151 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 3152 } 3153 // If the match is empty we bail out, otherwise we fall through 3154 // to the on-success continuation. 3155 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 3156 trace->backtrack()); 3157 assembler->Bind(&skip_empty_check); 3158 on_success()->Emit(compiler, trace); 3159 } 3160 break; 3161 } 3162 case POSITIVE_SUBMATCH_SUCCESS: { 3163 if (!trace->is_trivial()) { 3164 trace->Flush(compiler, this); 3165 return; 3166 } 3167 assembler->ReadCurrentPositionFromRegister( 3168 data_.u_submatch.current_position_register); 3169 assembler->ReadStackPointerFromRegister( 3170 data_.u_submatch.stack_pointer_register); 3171 int clear_register_count = data_.u_submatch.clear_register_count; 3172 if (clear_register_count == 0) { 3173 on_success()->Emit(compiler, trace); 3174 return; 3175 } 3176 int clear_registers_from = data_.u_submatch.clear_register_from; 3177 Label clear_registers_backtrack; 3178 Trace new_trace = *trace; 3179 new_trace.set_backtrack(&clear_registers_backtrack); 3180 on_success()->Emit(compiler, &new_trace); 3181 3182 assembler->Bind(&clear_registers_backtrack); 3183 int clear_registers_to = clear_registers_from + clear_register_count - 1; 3184 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 3185 3186 ASSERT(trace->backtrack() == NULL); 3187 assembler->Backtrack(); 3188 return; 3189 } 3190 default: 3191 UNREACHABLE(); 3192 } 3193 } 3194 3195 3196 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3197 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3198 if (!trace->is_trivial()) { 3199 trace->Flush(compiler, this); 3200 return; 3201 } 3202 3203 LimitResult limit_result = LimitVersions(compiler, trace); 3204 if (limit_result == DONE) return; 3205 ASSERT(limit_result == CONTINUE); 3206 3207 RecursionCheck rc(compiler); 3208 3209 ASSERT_EQ(start_reg_ + 1, end_reg_); 3210 if (compiler->ignore_case()) { 3211 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 3212 trace->backtrack()); 3213 } else { 3214 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 3215 } 3216 on_success()->Emit(compiler, trace); 3217 } 3218 3219 3220 // ------------------------------------------------------------------- 3221 // Dot/dotty output 3222 3223 3224 #ifdef DEBUG 3225 3226 3227 class DotPrinter: public NodeVisitor { 3228 public: 3229 explicit DotPrinter(bool ignore_case) 3230 : ignore_case_(ignore_case), 3231 stream_(&alloc_) { } 3232 void PrintNode(const char* label, RegExpNode* node); 3233 void Visit(RegExpNode* node); 3234 void PrintAttributes(RegExpNode* from); 3235 StringStream* stream() { return &stream_; } 3236 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 3237 #define DECLARE_VISIT(Type) \ 3238 virtual void Visit##Type(Type##Node* that); 3239 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 3240 #undef DECLARE_VISIT 3241 private: 3242 bool ignore_case_; 3243 HeapStringAllocator alloc_; 3244 StringStream stream_; 3245 }; 3246 3247 3248 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 3249 stream()->Add("digraph G {\n graph [label=\""); 3250 for (int i = 0; label[i]; i++) { 3251 switch (label[i]) { 3252 case '\\': 3253 stream()->Add("\\\\"); 3254 break; 3255 case '"': 3256 stream()->Add("\""); 3257 break; 3258 default: 3259 stream()->Put(label[i]); 3260 break; 3261 } 3262 } 3263 stream()->Add("\"];\n"); 3264 Visit(node); 3265 stream()->Add("}\n"); 3266 printf("%s", *(stream()->ToCString())); 3267 } 3268 3269 3270 void DotPrinter::Visit(RegExpNode* node) { 3271 if (node->info()->visited) return; 3272 node->info()->visited = true; 3273 node->Accept(this); 3274 } 3275 3276 3277 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 3278 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 3279 Visit(on_failure); 3280 } 3281 3282 3283 class TableEntryBodyPrinter { 3284 public: 3285 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 3286 : stream_(stream), choice_(choice) { } 3287 void Call(uc16 from, DispatchTable::Entry entry) { 3288 OutSet* out_set = entry.out_set(); 3289 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3290 if (out_set->Get(i)) { 3291 stream()->Add(" n%p:s%io%i -> n%p;\n", 3292 choice(), 3293 from, 3294 i, 3295 choice()->alternatives()->at(i).node()); 3296 } 3297 } 3298 } 3299 private: 3300 StringStream* stream() { return stream_; } 3301 ChoiceNode* choice() { return choice_; } 3302 StringStream* stream_; 3303 ChoiceNode* choice_; 3304 }; 3305 3306 3307 class TableEntryHeaderPrinter { 3308 public: 3309 explicit TableEntryHeaderPrinter(StringStream* stream) 3310 : first_(true), stream_(stream) { } 3311 void Call(uc16 from, DispatchTable::Entry entry) { 3312 if (first_) { 3313 first_ = false; 3314 } else { 3315 stream()->Add("|"); 3316 } 3317 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 3318 OutSet* out_set = entry.out_set(); 3319 int priority = 0; 3320 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3321 if (out_set->Get(i)) { 3322 if (priority > 0) stream()->Add("|"); 3323 stream()->Add("<s%io%i> %i", from, i, priority); 3324 priority++; 3325 } 3326 } 3327 stream()->Add("}}"); 3328 } 3329 private: 3330 bool first_; 3331 StringStream* stream() { return stream_; } 3332 StringStream* stream_; 3333 }; 3334 3335 3336 class AttributePrinter { 3337 public: 3338 explicit AttributePrinter(DotPrinter* out) 3339 : out_(out), first_(true) { } 3340 void PrintSeparator() { 3341 if (first_) { 3342 first_ = false; 3343 } else { 3344 out_->stream()->Add("|"); 3345 } 3346 } 3347 void PrintBit(const char* name, bool value) { 3348 if (!value) return; 3349 PrintSeparator(); 3350 out_->stream()->Add("{%s}", name); 3351 } 3352 void PrintPositive(const char* name, int value) { 3353 if (value < 0) return; 3354 PrintSeparator(); 3355 out_->stream()->Add("{%s|%x}", name, value); 3356 } 3357 private: 3358 DotPrinter* out_; 3359 bool first_; 3360 }; 3361 3362 3363 void DotPrinter::PrintAttributes(RegExpNode* that) { 3364 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 3365 "margin=0.1, fontsize=10, label=\"{", 3366 that); 3367 AttributePrinter printer(this); 3368 NodeInfo* info = that->info(); 3369 printer.PrintBit("NI", info->follows_newline_interest); 3370 printer.PrintBit("WI", info->follows_word_interest); 3371 printer.PrintBit("SI", info->follows_start_interest); 3372 Label* label = that->label(); 3373 if (label->is_bound()) 3374 printer.PrintPositive("@", label->pos()); 3375 stream()->Add("}\"];\n"); 3376 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 3377 "arrowhead=none];\n", that, that); 3378 } 3379 3380 3381 static const bool kPrintDispatchTable = false; 3382 void DotPrinter::VisitChoice(ChoiceNode* that) { 3383 if (kPrintDispatchTable) { 3384 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 3385 TableEntryHeaderPrinter header_printer(stream()); 3386 that->GetTable(ignore_case_)->ForEach(&header_printer); 3387 stream()->Add("\"]\n", that); 3388 PrintAttributes(that); 3389 TableEntryBodyPrinter body_printer(stream(), that); 3390 that->GetTable(ignore_case_)->ForEach(&body_printer); 3391 } else { 3392 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 3393 for (int i = 0; i < that->alternatives()->length(); i++) { 3394 GuardedAlternative alt = that->alternatives()->at(i); 3395 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 3396 } 3397 } 3398 for (int i = 0; i < that->alternatives()->length(); i++) { 3399 GuardedAlternative alt = that->alternatives()->at(i); 3400 alt.node()->Accept(this); 3401 } 3402 } 3403 3404 3405 void DotPrinter::VisitText(TextNode* that) { 3406 stream()->Add(" n%p [label=\"", that); 3407 for (int i = 0; i < that->elements()->length(); i++) { 3408 if (i > 0) stream()->Add(" "); 3409 TextElement elm = that->elements()->at(i); 3410 switch (elm.type) { 3411 case TextElement::ATOM: { 3412 stream()->Add("'%w'", elm.data.u_atom->data()); 3413 break; 3414 } 3415 case TextElement::CHAR_CLASS: { 3416 RegExpCharacterClass* node = elm.data.u_char_class; 3417 stream()->Add("["); 3418 if (node->is_negated()) 3419 stream()->Add("^"); 3420 for (int j = 0; j < node->ranges()->length(); j++) { 3421 CharacterRange range = node->ranges()->at(j); 3422 stream()->Add("%k-%k", range.from(), range.to()); 3423 } 3424 stream()->Add("]"); 3425 break; 3426 } 3427 default: 3428 UNREACHABLE(); 3429 } 3430 } 3431 stream()->Add("\", shape=box, peripheries=2];\n"); 3432 PrintAttributes(that); 3433 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3434 Visit(that->on_success()); 3435 } 3436 3437 3438 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 3439 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 3440 that, 3441 that->start_register(), 3442 that->end_register()); 3443 PrintAttributes(that); 3444 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3445 Visit(that->on_success()); 3446 } 3447 3448 3449 void DotPrinter::VisitEnd(EndNode* that) { 3450 stream()->Add(" n%p [style=bold, shape=point];\n", that); 3451 PrintAttributes(that); 3452 } 3453 3454 3455 void DotPrinter::VisitAssertion(AssertionNode* that) { 3456 stream()->Add(" n%p [", that); 3457 switch (that->type()) { 3458 case AssertionNode::AT_END: 3459 stream()->Add("label=\"$\", shape=septagon"); 3460 break; 3461 case AssertionNode::AT_START: 3462 stream()->Add("label=\"^\", shape=septagon"); 3463 break; 3464 case AssertionNode::AT_BOUNDARY: 3465 stream()->Add("label=\"\\b\", shape=septagon"); 3466 break; 3467 case AssertionNode::AT_NON_BOUNDARY: 3468 stream()->Add("label=\"\\B\", shape=septagon"); 3469 break; 3470 case AssertionNode::AFTER_NEWLINE: 3471 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 3472 break; 3473 case AssertionNode::AFTER_WORD_CHARACTER: 3474 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); 3475 break; 3476 case AssertionNode::AFTER_NONWORD_CHARACTER: 3477 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); 3478 break; 3479 } 3480 stream()->Add("];\n"); 3481 PrintAttributes(that); 3482 RegExpNode* successor = that->on_success(); 3483 stream()->Add(" n%p -> n%p;\n", that, successor); 3484 Visit(successor); 3485 } 3486 3487 3488 void DotPrinter::VisitAction(ActionNode* that) { 3489 stream()->Add(" n%p [", that); 3490 switch (that->type_) { 3491 case ActionNode::SET_REGISTER: 3492 stream()->Add("label=\"$%i:=%i\", shape=octagon", 3493 that->data_.u_store_register.reg, 3494 that->data_.u_store_register.value); 3495 break; 3496 case ActionNode::INCREMENT_REGISTER: 3497 stream()->Add("label=\"$%i++\", shape=octagon", 3498 that->data_.u_increment_register.reg); 3499 break; 3500 case ActionNode::STORE_POSITION: 3501 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 3502 that->data_.u_position_register.reg); 3503 break; 3504 case ActionNode::BEGIN_SUBMATCH: 3505 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 3506 that->data_.u_submatch.current_position_register); 3507 break; 3508 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 3509 stream()->Add("label=\"escape\", shape=septagon"); 3510 break; 3511 case ActionNode::EMPTY_MATCH_CHECK: 3512 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 3513 that->data_.u_empty_match_check.start_register, 3514 that->data_.u_empty_match_check.repetition_register, 3515 that->data_.u_empty_match_check.repetition_limit); 3516 break; 3517 case ActionNode::CLEAR_CAPTURES: { 3518 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 3519 that->data_.u_clear_captures.range_from, 3520 that->data_.u_clear_captures.range_to); 3521 break; 3522 } 3523 } 3524 stream()->Add("];\n"); 3525 PrintAttributes(that); 3526 RegExpNode* successor = that->on_success(); 3527 stream()->Add(" n%p -> n%p;\n", that, successor); 3528 Visit(successor); 3529 } 3530 3531 3532 class DispatchTableDumper { 3533 public: 3534 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 3535 void Call(uc16 key, DispatchTable::Entry entry); 3536 StringStream* stream() { return stream_; } 3537 private: 3538 StringStream* stream_; 3539 }; 3540 3541 3542 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 3543 stream()->Add("[%k-%k]: {", key, entry.to()); 3544 OutSet* set = entry.out_set(); 3545 bool first = true; 3546 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3547 if (set->Get(i)) { 3548 if (first) { 3549 first = false; 3550 } else { 3551 stream()->Add(", "); 3552 } 3553 stream()->Add("%i", i); 3554 } 3555 } 3556 stream()->Add("}\n"); 3557 } 3558 3559 3560 void DispatchTable::Dump() { 3561 HeapStringAllocator alloc; 3562 StringStream stream(&alloc); 3563 DispatchTableDumper dumper(&stream); 3564 tree()->ForEach(&dumper); 3565 OS::PrintError("%s", *stream.ToCString()); 3566 } 3567 3568 3569 void RegExpEngine::DotPrint(const char* label, 3570 RegExpNode* node, 3571 bool ignore_case) { 3572 DotPrinter printer(ignore_case); 3573 printer.PrintNode(label, node); 3574 } 3575 3576 3577 #endif // DEBUG 3578 3579 3580 // ------------------------------------------------------------------- 3581 // Tree to graph conversion 3582 3583 static const int kSpaceRangeCount = 20; 3584 static const int kSpaceRangeAsciiCount = 4; 3585 static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020, 3586 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 3587 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 }; 3588 3589 static const int kWordRangeCount = 8; 3590 static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_', 3591 '_', 'a', 'z' }; 3592 3593 static const int kDigitRangeCount = 2; 3594 static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' }; 3595 3596 static const int kLineTerminatorRangeCount = 6; 3597 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A, 3598 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 }; 3599 3600 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 3601 RegExpNode* on_success) { 3602 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 3603 elms->Add(TextElement::Atom(this)); 3604 return new TextNode(elms, on_success); 3605 } 3606 3607 3608 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 3609 RegExpNode* on_success) { 3610 return new TextNode(elements(), on_success); 3611 } 3612 3613 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 3614 const uc16* special_class, 3615 int length) { 3616 ASSERT(ranges->length() != 0); 3617 ASSERT(length != 0); 3618 ASSERT(special_class[0] != 0); 3619 if (ranges->length() != (length >> 1) + 1) { 3620 return false; 3621 } 3622 CharacterRange range = ranges->at(0); 3623 if (range.from() != 0) { 3624 return false; 3625 } 3626 for (int i = 0; i < length; i += 2) { 3627 if (special_class[i] != (range.to() + 1)) { 3628 return false; 3629 } 3630 range = ranges->at((i >> 1) + 1); 3631 if (special_class[i+1] != range.from() - 1) { 3632 return false; 3633 } 3634 } 3635 if (range.to() != 0xffff) { 3636 return false; 3637 } 3638 return true; 3639 } 3640 3641 3642 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 3643 const uc16* special_class, 3644 int length) { 3645 if (ranges->length() * 2 != length) { 3646 return false; 3647 } 3648 for (int i = 0; i < length; i += 2) { 3649 CharacterRange range = ranges->at(i >> 1); 3650 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { 3651 return false; 3652 } 3653 } 3654 return true; 3655 } 3656 3657 3658 bool RegExpCharacterClass::is_standard() { 3659 // TODO(lrn): Remove need for this function, by not throwing away information 3660 // along the way. 3661 if (is_negated_) { 3662 return false; 3663 } 3664 if (set_.is_standard()) { 3665 return true; 3666 } 3667 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3668 set_.set_standard_set_type('s'); 3669 return true; 3670 } 3671 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3672 set_.set_standard_set_type('S'); 3673 return true; 3674 } 3675 if (CompareInverseRanges(set_.ranges(), 3676 kLineTerminatorRanges, 3677 kLineTerminatorRangeCount)) { 3678 set_.set_standard_set_type('.'); 3679 return true; 3680 } 3681 if (CompareRanges(set_.ranges(), 3682 kLineTerminatorRanges, 3683 kLineTerminatorRangeCount)) { 3684 set_.set_standard_set_type('n'); 3685 return true; 3686 } 3687 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 3688 set_.set_standard_set_type('w'); 3689 return true; 3690 } 3691 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 3692 set_.set_standard_set_type('W'); 3693 return true; 3694 } 3695 return false; 3696 } 3697 3698 3699 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 3700 RegExpNode* on_success) { 3701 return new TextNode(this, on_success); 3702 } 3703 3704 3705 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 3706 RegExpNode* on_success) { 3707 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 3708 int length = alternatives->length(); 3709 ChoiceNode* result = new ChoiceNode(length); 3710 for (int i = 0; i < length; i++) { 3711 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 3712 on_success)); 3713 result->AddAlternative(alternative); 3714 } 3715 return result; 3716 } 3717 3718 3719 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 3720 RegExpNode* on_success) { 3721 return ToNode(min(), 3722 max(), 3723 is_greedy(), 3724 body(), 3725 compiler, 3726 on_success); 3727 } 3728 3729 3730 RegExpNode* RegExpQuantifier::ToNode(int min, 3731 int max, 3732 bool is_greedy, 3733 RegExpTree* body, 3734 RegExpCompiler* compiler, 3735 RegExpNode* on_success, 3736 bool not_at_start) { 3737 // x{f, t} becomes this: 3738 // 3739 // (r++)<-. 3740 // | ` 3741 // | (x) 3742 // v ^ 3743 // (r=0)-->(?)---/ [if r < t] 3744 // | 3745 // [if r >= f] \----> ... 3746 // 3747 3748 // 15.10.2.5 RepeatMatcher algorithm. 3749 // The parser has already eliminated the case where max is 0. In the case 3750 // where max_match is zero the parser has removed the quantifier if min was 3751 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 3752 3753 // If we know that we cannot match zero length then things are a little 3754 // simpler since we don't need to make the special zero length match check 3755 // from step 2.1. If the min and max are small we can unroll a little in 3756 // this case. 3757 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 3758 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 3759 if (max == 0) return on_success; // This can happen due to recursion. 3760 bool body_can_be_empty = (body->min_match() == 0); 3761 int body_start_reg = RegExpCompiler::kNoRegister; 3762 Interval capture_registers = body->CaptureRegisters(); 3763 bool needs_capture_clearing = !capture_registers.is_empty(); 3764 if (body_can_be_empty) { 3765 body_start_reg = compiler->AllocateRegister(); 3766 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 3767 // Only unroll if there are no captures and the body can't be 3768 // empty. 3769 if (min > 0 && min <= kMaxUnrolledMinMatches) { 3770 int new_max = (max == kInfinity) ? max : max - min; 3771 // Recurse once to get the loop or optional matches after the fixed ones. 3772 RegExpNode* answer = ToNode( 3773 0, new_max, is_greedy, body, compiler, on_success, true); 3774 // Unroll the forced matches from 0 to min. This can cause chains of 3775 // TextNodes (which the parser does not generate). These should be 3776 // combined if it turns out they hinder good code generation. 3777 for (int i = 0; i < min; i++) { 3778 answer = body->ToNode(compiler, answer); 3779 } 3780 return answer; 3781 } 3782 if (max <= kMaxUnrolledMaxMatches) { 3783 ASSERT(min == 0); 3784 // Unroll the optional matches up to max. 3785 RegExpNode* answer = on_success; 3786 for (int i = 0; i < max; i++) { 3787 ChoiceNode* alternation = new ChoiceNode(2); 3788 if (is_greedy) { 3789 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, 3790 answer))); 3791 alternation->AddAlternative(GuardedAlternative(on_success)); 3792 } else { 3793 alternation->AddAlternative(GuardedAlternative(on_success)); 3794 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, 3795 answer))); 3796 } 3797 answer = alternation; 3798 if (not_at_start) alternation->set_not_at_start(); 3799 } 3800 return answer; 3801 } 3802 } 3803 bool has_min = min > 0; 3804 bool has_max = max < RegExpTree::kInfinity; 3805 bool needs_counter = has_min || has_max; 3806 int reg_ctr = needs_counter 3807 ? compiler->AllocateRegister() 3808 : RegExpCompiler::kNoRegister; 3809 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0); 3810 if (not_at_start) center->set_not_at_start(); 3811 RegExpNode* loop_return = needs_counter 3812 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 3813 : static_cast<RegExpNode*>(center); 3814 if (body_can_be_empty) { 3815 // If the body can be empty we need to check if it was and then 3816 // backtrack. 3817 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 3818 reg_ctr, 3819 min, 3820 loop_return); 3821 } 3822 RegExpNode* body_node = body->ToNode(compiler, loop_return); 3823 if (body_can_be_empty) { 3824 // If the body can be empty we need to store the start position 3825 // so we can bail out if it was empty. 3826 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 3827 } 3828 if (needs_capture_clearing) { 3829 // Before entering the body of this loop we need to clear captures. 3830 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 3831 } 3832 GuardedAlternative body_alt(body_node); 3833 if (has_max) { 3834 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); 3835 body_alt.AddGuard(body_guard); 3836 } 3837 GuardedAlternative rest_alt(on_success); 3838 if (has_min) { 3839 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min); 3840 rest_alt.AddGuard(rest_guard); 3841 } 3842 if (is_greedy) { 3843 center->AddLoopAlternative(body_alt); 3844 center->AddContinueAlternative(rest_alt); 3845 } else { 3846 center->AddContinueAlternative(rest_alt); 3847 center->AddLoopAlternative(body_alt); 3848 } 3849 if (needs_counter) { 3850 return ActionNode::SetRegister(reg_ctr, 0, center); 3851 } else { 3852 return center; 3853 } 3854 } 3855 3856 3857 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 3858 RegExpNode* on_success) { 3859 NodeInfo info; 3860 switch (type()) { 3861 case START_OF_LINE: 3862 return AssertionNode::AfterNewline(on_success); 3863 case START_OF_INPUT: 3864 return AssertionNode::AtStart(on_success); 3865 case BOUNDARY: 3866 return AssertionNode::AtBoundary(on_success); 3867 case NON_BOUNDARY: 3868 return AssertionNode::AtNonBoundary(on_success); 3869 case END_OF_INPUT: 3870 return AssertionNode::AtEnd(on_success); 3871 case END_OF_LINE: { 3872 // Compile $ in multiline regexps as an alternation with a positive 3873 // lookahead in one side and an end-of-input on the other side. 3874 // We need two registers for the lookahead. 3875 int stack_pointer_register = compiler->AllocateRegister(); 3876 int position_register = compiler->AllocateRegister(); 3877 // The ChoiceNode to distinguish between a newline and end-of-input. 3878 ChoiceNode* result = new ChoiceNode(2); 3879 // Create a newline atom. 3880 ZoneList<CharacterRange>* newline_ranges = 3881 new ZoneList<CharacterRange>(3); 3882 CharacterRange::AddClassEscape('n', newline_ranges); 3883 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n'); 3884 TextNode* newline_matcher = new TextNode( 3885 newline_atom, 3886 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 3887 position_register, 3888 0, // No captures inside. 3889 -1, // Ignored if no captures. 3890 on_success)); 3891 // Create an end-of-input matcher. 3892 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 3893 stack_pointer_register, 3894 position_register, 3895 newline_matcher); 3896 // Add the two alternatives to the ChoiceNode. 3897 GuardedAlternative eol_alternative(end_of_line); 3898 result->AddAlternative(eol_alternative); 3899 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 3900 result->AddAlternative(end_alternative); 3901 return result; 3902 } 3903 default: 3904 UNREACHABLE(); 3905 } 3906 return on_success; 3907 } 3908 3909 3910 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 3911 RegExpNode* on_success) { 3912 return new BackReferenceNode(RegExpCapture::StartRegister(index()), 3913 RegExpCapture::EndRegister(index()), 3914 on_success); 3915 } 3916 3917 3918 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 3919 RegExpNode* on_success) { 3920 return on_success; 3921 } 3922 3923 3924 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 3925 RegExpNode* on_success) { 3926 int stack_pointer_register = compiler->AllocateRegister(); 3927 int position_register = compiler->AllocateRegister(); 3928 3929 const int registers_per_capture = 2; 3930 const int register_of_first_capture = 2; 3931 int register_count = capture_count_ * registers_per_capture; 3932 int register_start = 3933 register_of_first_capture + capture_from_ * registers_per_capture; 3934 3935 RegExpNode* success; 3936 if (is_positive()) { 3937 RegExpNode* node = ActionNode::BeginSubmatch( 3938 stack_pointer_register, 3939 position_register, 3940 body()->ToNode( 3941 compiler, 3942 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 3943 position_register, 3944 register_count, 3945 register_start, 3946 on_success))); 3947 return node; 3948 } else { 3949 // We use a ChoiceNode for a negative lookahead because it has most of 3950 // the characteristics we need. It has the body of the lookahead as its 3951 // first alternative and the expression after the lookahead of the second 3952 // alternative. If the first alternative succeeds then the 3953 // NegativeSubmatchSuccess will unwind the stack including everything the 3954 // choice node set up and backtrack. If the first alternative fails then 3955 // the second alternative is tried, which is exactly the desired result 3956 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 3957 // ChoiceNode that knows to ignore the first exit when calculating quick 3958 // checks. 3959 GuardedAlternative body_alt( 3960 body()->ToNode( 3961 compiler, 3962 success = new NegativeSubmatchSuccess(stack_pointer_register, 3963 position_register, 3964 register_count, 3965 register_start))); 3966 ChoiceNode* choice_node = 3967 new NegativeLookaheadChoiceNode(body_alt, 3968 GuardedAlternative(on_success)); 3969 return ActionNode::BeginSubmatch(stack_pointer_register, 3970 position_register, 3971 choice_node); 3972 } 3973 } 3974 3975 3976 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 3977 RegExpNode* on_success) { 3978 return ToNode(body(), index(), compiler, on_success); 3979 } 3980 3981 3982 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 3983 int index, 3984 RegExpCompiler* compiler, 3985 RegExpNode* on_success) { 3986 int start_reg = RegExpCapture::StartRegister(index); 3987 int end_reg = RegExpCapture::EndRegister(index); 3988 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 3989 RegExpNode* body_node = body->ToNode(compiler, store_end); 3990 return ActionNode::StorePosition(start_reg, true, body_node); 3991 } 3992 3993 3994 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 3995 RegExpNode* on_success) { 3996 ZoneList<RegExpTree*>* children = nodes(); 3997 RegExpNode* current = on_success; 3998 for (int i = children->length() - 1; i >= 0; i--) { 3999 current = children->at(i)->ToNode(compiler, current); 4000 } 4001 return current; 4002 } 4003 4004 4005 static void AddClass(const uc16* elmv, 4006 int elmc, 4007 ZoneList<CharacterRange>* ranges) { 4008 for (int i = 0; i < elmc; i += 2) { 4009 ASSERT(elmv[i] <= elmv[i + 1]); 4010 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 4011 } 4012 } 4013 4014 4015 static void AddClassNegated(const uc16 *elmv, 4016 int elmc, 4017 ZoneList<CharacterRange>* ranges) { 4018 ASSERT(elmv[0] != 0x0000); 4019 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode); 4020 uc16 last = 0x0000; 4021 for (int i = 0; i < elmc; i += 2) { 4022 ASSERT(last <= elmv[i] - 1); 4023 ASSERT(elmv[i] <= elmv[i + 1]); 4024 ranges->Add(CharacterRange(last, elmv[i] - 1)); 4025 last = elmv[i + 1] + 1; 4026 } 4027 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode)); 4028 } 4029 4030 4031 void CharacterRange::AddClassEscape(uc16 type, 4032 ZoneList<CharacterRange>* ranges) { 4033 switch (type) { 4034 case 's': 4035 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); 4036 break; 4037 case 'S': 4038 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); 4039 break; 4040 case 'w': 4041 AddClass(kWordRanges, kWordRangeCount, ranges); 4042 break; 4043 case 'W': 4044 AddClassNegated(kWordRanges, kWordRangeCount, ranges); 4045 break; 4046 case 'd': 4047 AddClass(kDigitRanges, kDigitRangeCount, ranges); 4048 break; 4049 case 'D': 4050 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); 4051 break; 4052 case '.': 4053 AddClassNegated(kLineTerminatorRanges, 4054 kLineTerminatorRangeCount, 4055 ranges); 4056 break; 4057 // This is not a character range as defined by the spec but a 4058 // convenient shorthand for a character class that matches any 4059 // character. 4060 case '*': 4061 ranges->Add(CharacterRange::Everything()); 4062 break; 4063 // This is the set of characters matched by the $ and ^ symbols 4064 // in multiline mode. 4065 case 'n': 4066 AddClass(kLineTerminatorRanges, 4067 kLineTerminatorRangeCount, 4068 ranges); 4069 break; 4070 default: 4071 UNREACHABLE(); 4072 } 4073 } 4074 4075 4076 Vector<const uc16> CharacterRange::GetWordBounds() { 4077 return Vector<const uc16>(kWordRanges, kWordRangeCount); 4078 } 4079 4080 4081 class CharacterRangeSplitter { 4082 public: 4083 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 4084 ZoneList<CharacterRange>** excluded) 4085 : included_(included), 4086 excluded_(excluded) { } 4087 void Call(uc16 from, DispatchTable::Entry entry); 4088 4089 static const int kInBase = 0; 4090 static const int kInOverlay = 1; 4091 4092 private: 4093 ZoneList<CharacterRange>** included_; 4094 ZoneList<CharacterRange>** excluded_; 4095 }; 4096 4097 4098 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 4099 if (!entry.out_set()->Get(kInBase)) return; 4100 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 4101 ? included_ 4102 : excluded_; 4103 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); 4104 (*target)->Add(CharacterRange(entry.from(), entry.to())); 4105 } 4106 4107 4108 void CharacterRange::Split(ZoneList<CharacterRange>* base, 4109 Vector<const uc16> overlay, 4110 ZoneList<CharacterRange>** included, 4111 ZoneList<CharacterRange>** excluded) { 4112 ASSERT_EQ(NULL, *included); 4113 ASSERT_EQ(NULL, *excluded); 4114 DispatchTable table; 4115 for (int i = 0; i < base->length(); i++) 4116 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 4117 for (int i = 0; i < overlay.length(); i += 2) { 4118 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 4119 CharacterRangeSplitter::kInOverlay); 4120 } 4121 CharacterRangeSplitter callback(included, excluded); 4122 table.ForEach(&callback); 4123 } 4124 4125 4126 static void AddUncanonicals(Isolate* isolate, 4127 ZoneList<CharacterRange>* ranges, 4128 int bottom, 4129 int top); 4130 4131 4132 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 4133 bool is_ascii) { 4134 Isolate* isolate = Isolate::Current(); 4135 uc16 bottom = from(); 4136 uc16 top = to(); 4137 if (is_ascii) { 4138 if (bottom > String::kMaxAsciiCharCode) return; 4139 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; 4140 } 4141 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4142 if (top == bottom) { 4143 // If this is a singleton we just expand the one character. 4144 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 4145 for (int i = 0; i < length; i++) { 4146 uc32 chr = chars[i]; 4147 if (chr != bottom) { 4148 ranges->Add(CharacterRange::Singleton(chars[i])); 4149 } 4150 } 4151 } else { 4152 // If this is a range we expand the characters block by block, 4153 // expanding contiguous subranges (blocks) one at a time. 4154 // The approach is as follows. For a given start character we 4155 // look up the remainder of the block that contains it (represented 4156 // by the end point), for instance we find 'z' if the character 4157 // is 'c'. A block is characterized by the property 4158 // that all characters uncanonicalize in the same way, except that 4159 // each entry in the result is incremented by the distance from the first 4160 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 4161 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 4162 // Once we've found the end point we look up its uncanonicalization 4163 // and produce a range for each element. For instance for [c-f] 4164 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 4165 // add a range if it is not already contained in the input, so [c-f] 4166 // will be skipped but [C-F] will be added. If this range is not 4167 // completely contained in a block we do this for all the blocks 4168 // covered by the range (handling characters that is not in a block 4169 // as a "singleton block"). 4170 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4171 int pos = bottom; 4172 while (pos < top) { 4173 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 4174 uc16 block_end; 4175 if (length == 0) { 4176 block_end = pos; 4177 } else { 4178 ASSERT_EQ(1, length); 4179 block_end = range[0]; 4180 } 4181 int end = (block_end > top) ? top : block_end; 4182 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 4183 for (int i = 0; i < length; i++) { 4184 uc32 c = range[i]; 4185 uc16 range_from = c - (block_end - pos); 4186 uc16 range_to = c - (block_end - end); 4187 if (!(bottom <= range_from && range_to <= top)) { 4188 ranges->Add(CharacterRange(range_from, range_to)); 4189 } 4190 } 4191 pos = end + 1; 4192 } 4193 } 4194 } 4195 4196 4197 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 4198 ASSERT_NOT_NULL(ranges); 4199 int n = ranges->length(); 4200 if (n <= 1) return true; 4201 int max = ranges->at(0).to(); 4202 for (int i = 1; i < n; i++) { 4203 CharacterRange next_range = ranges->at(i); 4204 if (next_range.from() <= max + 1) return false; 4205 max = next_range.to(); 4206 } 4207 return true; 4208 } 4209 4210 SetRelation CharacterRange::WordCharacterRelation( 4211 ZoneList<CharacterRange>* range) { 4212 ASSERT(IsCanonical(range)); 4213 int i = 0; // Word character range index. 4214 int j = 0; // Argument range index. 4215 ASSERT_NE(0, kWordRangeCount); 4216 SetRelation result; 4217 if (range->length() == 0) { 4218 result.SetElementsInSecondSet(); 4219 return result; 4220 } 4221 CharacterRange argument_range = range->at(0); 4222 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); 4223 while (i < kWordRangeCount && j < range->length()) { 4224 // Check the two ranges for the five cases: 4225 // - no overlap. 4226 // - partial overlap (there are elements in both ranges that isn't 4227 // in the other, and there are also elements that are in both). 4228 // - argument range entirely inside word range. 4229 // - word range entirely inside argument range. 4230 // - ranges are completely equal. 4231 4232 // First check for no overlap. The earlier range is not in the other set. 4233 if (argument_range.from() > word_range.to()) { 4234 // Ranges are disjoint. The earlier word range contains elements that 4235 // cannot be in the argument set. 4236 result.SetElementsInSecondSet(); 4237 } else if (word_range.from() > argument_range.to()) { 4238 // Ranges are disjoint. The earlier argument range contains elements that 4239 // cannot be in the word set. 4240 result.SetElementsInFirstSet(); 4241 } else if (word_range.from() <= argument_range.from() && 4242 word_range.to() >= argument_range.from()) { 4243 result.SetElementsInBothSets(); 4244 // argument range completely inside word range. 4245 if (word_range.from() < argument_range.from() || 4246 word_range.to() > argument_range.from()) { 4247 result.SetElementsInSecondSet(); 4248 } 4249 } else if (word_range.from() >= argument_range.from() && 4250 word_range.to() <= argument_range.from()) { 4251 result.SetElementsInBothSets(); 4252 result.SetElementsInFirstSet(); 4253 } else { 4254 // There is overlap, and neither is a subrange of the other 4255 result.SetElementsInFirstSet(); 4256 result.SetElementsInSecondSet(); 4257 result.SetElementsInBothSets(); 4258 } 4259 if (result.NonTrivialIntersection()) { 4260 // The result is as (im)precise as we can possibly make it. 4261 return result; 4262 } 4263 // Progress the range(s) with minimal to-character. 4264 uc16 word_to = word_range.to(); 4265 uc16 argument_to = argument_range.to(); 4266 if (argument_to <= word_to) { 4267 j++; 4268 if (j < range->length()) { 4269 argument_range = range->at(j); 4270 } 4271 } 4272 if (word_to <= argument_to) { 4273 i += 2; 4274 if (i < kWordRangeCount) { 4275 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); 4276 } 4277 } 4278 } 4279 // Check if anything wasn't compared in the loop. 4280 if (i < kWordRangeCount) { 4281 // word range contains something not in argument range. 4282 result.SetElementsInSecondSet(); 4283 } else if (j < range->length()) { 4284 // Argument range contains something not in word range. 4285 result.SetElementsInFirstSet(); 4286 } 4287 4288 return result; 4289 } 4290 4291 4292 static void AddUncanonicals(Isolate* isolate, 4293 ZoneList<CharacterRange>* ranges, 4294 int bottom, 4295 int top) { 4296 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4297 // Zones with no case mappings. There is a DEBUG-mode loop to assert that 4298 // this table is correct. 4299 // 0x0600 - 0x0fff 4300 // 0x1100 - 0x1cff 4301 // 0x2000 - 0x20ff 4302 // 0x2200 - 0x23ff 4303 // 0x2500 - 0x2bff 4304 // 0x2e00 - 0xa5ff 4305 // 0xa800 - 0xfaff 4306 // 0xfc00 - 0xfeff 4307 const int boundary_count = 18; 4308 int boundaries[] = { 4309 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, 4310 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; 4311 4312 // Special ASCII rule from spec can save us some work here. 4313 if (bottom == 0x80 && top == 0xffff) return; 4314 4315 if (top <= boundaries[0]) { 4316 CharacterRange range(bottom, top); 4317 range.AddCaseEquivalents(ranges, false); 4318 return; 4319 } 4320 4321 // Split up very large ranges. This helps remove ranges where there are no 4322 // case mappings. 4323 for (int i = 0; i < boundary_count; i++) { 4324 if (bottom < boundaries[i] && top >= boundaries[i]) { 4325 AddUncanonicals(isolate, ranges, bottom, boundaries[i] - 1); 4326 AddUncanonicals(isolate, ranges, boundaries[i], top); 4327 return; 4328 } 4329 } 4330 4331 // If we are completely in a zone with no case mappings then we are done. 4332 for (int i = 0; i < boundary_count; i += 2) { 4333 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { 4334 #ifdef DEBUG 4335 for (int j = bottom; j <= top; j++) { 4336 unsigned current_char = j; 4337 int length = isolate->jsregexp_uncanonicalize()->get(current_char, 4338 '\0', chars); 4339 for (int k = 0; k < length; k++) { 4340 ASSERT(chars[k] == current_char); 4341 } 4342 } 4343 #endif 4344 return; 4345 } 4346 } 4347 4348 // Step through the range finding equivalent characters. 4349 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); 4350 for (int i = bottom; i <= top; i++) { 4351 int length = isolate->jsregexp_uncanonicalize()->get(i, '\0', chars); 4352 for (int j = 0; j < length; j++) { 4353 uc32 chr = chars[j]; 4354 if (chr != i && (chr < bottom || chr > top)) { 4355 characters->Add(chr); 4356 } 4357 } 4358 } 4359 4360 // Step through the equivalent characters finding simple ranges and 4361 // adding ranges to the character class. 4362 if (characters->length() > 0) { 4363 int new_from = characters->at(0); 4364 int new_to = new_from; 4365 for (int i = 1; i < characters->length(); i++) { 4366 int chr = characters->at(i); 4367 if (chr == new_to + 1) { 4368 new_to++; 4369 } else { 4370 if (new_to == new_from) { 4371 ranges->Add(CharacterRange::Singleton(new_from)); 4372 } else { 4373 ranges->Add(CharacterRange(new_from, new_to)); 4374 } 4375 new_from = new_to = chr; 4376 } 4377 } 4378 if (new_to == new_from) { 4379 ranges->Add(CharacterRange::Singleton(new_from)); 4380 } else { 4381 ranges->Add(CharacterRange(new_from, new_to)); 4382 } 4383 } 4384 } 4385 4386 4387 ZoneList<CharacterRange>* CharacterSet::ranges() { 4388 if (ranges_ == NULL) { 4389 ranges_ = new ZoneList<CharacterRange>(2); 4390 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 4391 } 4392 return ranges_; 4393 } 4394 4395 4396 // Move a number of elements in a zonelist to another position 4397 // in the same list. Handles overlapping source and target areas. 4398 static void MoveRanges(ZoneList<CharacterRange>* list, 4399 int from, 4400 int to, 4401 int count) { 4402 // Ranges are potentially overlapping. 4403 if (from < to) { 4404 for (int i = count - 1; i >= 0; i--) { 4405 list->at(to + i) = list->at(from + i); 4406 } 4407 } else { 4408 for (int i = 0; i < count; i++) { 4409 list->at(to + i) = list->at(from + i); 4410 } 4411 } 4412 } 4413 4414 4415 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 4416 int count, 4417 CharacterRange insert) { 4418 // Inserts a range into list[0..count[, which must be sorted 4419 // by from value and non-overlapping and non-adjacent, using at most 4420 // list[0..count] for the result. Returns the number of resulting 4421 // canonicalized ranges. Inserting a range may collapse existing ranges into 4422 // fewer ranges, so the return value can be anything in the range 1..count+1. 4423 uc16 from = insert.from(); 4424 uc16 to = insert.to(); 4425 int start_pos = 0; 4426 int end_pos = count; 4427 for (int i = count - 1; i >= 0; i--) { 4428 CharacterRange current = list->at(i); 4429 if (current.from() > to + 1) { 4430 end_pos = i; 4431 } else if (current.to() + 1 < from) { 4432 start_pos = i + 1; 4433 break; 4434 } 4435 } 4436 4437 // Inserted range overlaps, or is adjacent to, ranges at positions 4438 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 4439 // not affected by the insertion. 4440 // If start_pos == end_pos, the range must be inserted before start_pos. 4441 // if start_pos < end_pos, the entire range from start_pos to end_pos 4442 // must be merged with the insert range. 4443 4444 if (start_pos == end_pos) { 4445 // Insert between existing ranges at position start_pos. 4446 if (start_pos < count) { 4447 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 4448 } 4449 list->at(start_pos) = insert; 4450 return count + 1; 4451 } 4452 if (start_pos + 1 == end_pos) { 4453 // Replace single existing range at position start_pos. 4454 CharacterRange to_replace = list->at(start_pos); 4455 int new_from = Min(to_replace.from(), from); 4456 int new_to = Max(to_replace.to(), to); 4457 list->at(start_pos) = CharacterRange(new_from, new_to); 4458 return count; 4459 } 4460 // Replace a number of existing ranges from start_pos to end_pos - 1. 4461 // Move the remaining ranges down. 4462 4463 int new_from = Min(list->at(start_pos).from(), from); 4464 int new_to = Max(list->at(end_pos - 1).to(), to); 4465 if (end_pos < count) { 4466 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 4467 } 4468 list->at(start_pos) = CharacterRange(new_from, new_to); 4469 return count - (end_pos - start_pos) + 1; 4470 } 4471 4472 4473 void CharacterSet::Canonicalize() { 4474 // Special/default classes are always considered canonical. The result 4475 // of calling ranges() will be sorted. 4476 if (ranges_ == NULL) return; 4477 CharacterRange::Canonicalize(ranges_); 4478 } 4479 4480 4481 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 4482 if (character_ranges->length() <= 1) return; 4483 // Check whether ranges are already canonical (increasing, non-overlapping, 4484 // non-adjacent). 4485 int n = character_ranges->length(); 4486 int max = character_ranges->at(0).to(); 4487 int i = 1; 4488 while (i < n) { 4489 CharacterRange current = character_ranges->at(i); 4490 if (current.from() <= max + 1) { 4491 break; 4492 } 4493 max = current.to(); 4494 i++; 4495 } 4496 // Canonical until the i'th range. If that's all of them, we are done. 4497 if (i == n) return; 4498 4499 // The ranges at index i and forward are not canonicalized. Make them so by 4500 // doing the equivalent of insertion sort (inserting each into the previous 4501 // list, in order). 4502 // Notice that inserting a range can reduce the number of ranges in the 4503 // result due to combining of adjacent and overlapping ranges. 4504 int read = i; // Range to insert. 4505 int num_canonical = i; // Length of canonicalized part of list. 4506 do { 4507 num_canonical = InsertRangeInCanonicalList(character_ranges, 4508 num_canonical, 4509 character_ranges->at(read)); 4510 read++; 4511 } while (read < n); 4512 character_ranges->Rewind(num_canonical); 4513 4514 ASSERT(CharacterRange::IsCanonical(character_ranges)); 4515 } 4516 4517 4518 // Utility function for CharacterRange::Merge. Adds a range at the end of 4519 // a canonicalized range list, if necessary merging the range with the last 4520 // range of the list. 4521 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { 4522 if (set == NULL) return; 4523 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); 4524 int n = set->length(); 4525 if (n > 0) { 4526 CharacterRange lastRange = set->at(n - 1); 4527 if (lastRange.to() == range.from() - 1) { 4528 set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); 4529 return; 4530 } 4531 } 4532 set->Add(range); 4533 } 4534 4535 4536 static void AddRangeToSelectedSet(int selector, 4537 ZoneList<CharacterRange>* first_set, 4538 ZoneList<CharacterRange>* second_set, 4539 ZoneList<CharacterRange>* intersection_set, 4540 CharacterRange range) { 4541 switch (selector) { 4542 case kInsideFirst: 4543 AddRangeToSet(first_set, range); 4544 break; 4545 case kInsideSecond: 4546 AddRangeToSet(second_set, range); 4547 break; 4548 case kInsideBoth: 4549 AddRangeToSet(intersection_set, range); 4550 break; 4551 } 4552 } 4553 4554 4555 4556 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, 4557 ZoneList<CharacterRange>* second_set, 4558 ZoneList<CharacterRange>* first_set_only_out, 4559 ZoneList<CharacterRange>* second_set_only_out, 4560 ZoneList<CharacterRange>* both_sets_out) { 4561 // Inputs are canonicalized. 4562 ASSERT(CharacterRange::IsCanonical(first_set)); 4563 ASSERT(CharacterRange::IsCanonical(second_set)); 4564 // Outputs are empty, if applicable. 4565 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); 4566 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); 4567 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); 4568 4569 // Merge sets by iterating through the lists in order of lowest "from" value, 4570 // and putting intervals into one of three sets. 4571 4572 if (first_set->length() == 0) { 4573 second_set_only_out->AddAll(*second_set); 4574 return; 4575 } 4576 if (second_set->length() == 0) { 4577 first_set_only_out->AddAll(*first_set); 4578 return; 4579 } 4580 // Indices into input lists. 4581 int i1 = 0; 4582 int i2 = 0; 4583 // Cache length of input lists. 4584 int n1 = first_set->length(); 4585 int n2 = second_set->length(); 4586 // Current range. May be invalid if state is kInsideNone. 4587 int from = 0; 4588 int to = -1; 4589 // Where current range comes from. 4590 int state = kInsideNone; 4591 4592 while (i1 < n1 || i2 < n2) { 4593 CharacterRange next_range; 4594 int range_source; 4595 if (i2 == n2 || 4596 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { 4597 // Next smallest element is in first set. 4598 next_range = first_set->at(i1++); 4599 range_source = kInsideFirst; 4600 } else { 4601 // Next smallest element is in second set. 4602 next_range = second_set->at(i2++); 4603 range_source = kInsideSecond; 4604 } 4605 if (to < next_range.from()) { 4606 // Ranges disjoint: |current| |next| 4607 AddRangeToSelectedSet(state, 4608 first_set_only_out, 4609 second_set_only_out, 4610 both_sets_out, 4611 CharacterRange(from, to)); 4612 from = next_range.from(); 4613 to = next_range.to(); 4614 state = range_source; 4615 } else { 4616 if (from < next_range.from()) { 4617 AddRangeToSelectedSet(state, 4618 first_set_only_out, 4619 second_set_only_out, 4620 both_sets_out, 4621 CharacterRange(from, next_range.from()-1)); 4622 } 4623 if (to < next_range.to()) { 4624 // Ranges overlap: |current| 4625 // |next| 4626 AddRangeToSelectedSet(state | range_source, 4627 first_set_only_out, 4628 second_set_only_out, 4629 both_sets_out, 4630 CharacterRange(next_range.from(), to)); 4631 from = to + 1; 4632 to = next_range.to(); 4633 state = range_source; 4634 } else { 4635 // Range included: |current| , possibly ending at same character. 4636 // |next| 4637 AddRangeToSelectedSet( 4638 state | range_source, 4639 first_set_only_out, 4640 second_set_only_out, 4641 both_sets_out, 4642 CharacterRange(next_range.from(), next_range.to())); 4643 from = next_range.to() + 1; 4644 // If ranges end at same character, both ranges are consumed completely. 4645 if (next_range.to() == to) state = kInsideNone; 4646 } 4647 } 4648 } 4649 AddRangeToSelectedSet(state, 4650 first_set_only_out, 4651 second_set_only_out, 4652 both_sets_out, 4653 CharacterRange(from, to)); 4654 } 4655 4656 4657 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 4658 ZoneList<CharacterRange>* negated_ranges) { 4659 ASSERT(CharacterRange::IsCanonical(ranges)); 4660 ASSERT_EQ(0, negated_ranges->length()); 4661 int range_count = ranges->length(); 4662 uc16 from = 0; 4663 int i = 0; 4664 if (range_count > 0 && ranges->at(0).from() == 0) { 4665 from = ranges->at(0).to(); 4666 i = 1; 4667 } 4668 while (i < range_count) { 4669 CharacterRange range = ranges->at(i); 4670 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1)); 4671 from = range.to(); 4672 i++; 4673 } 4674 if (from < String::kMaxUC16CharCode) { 4675 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode)); 4676 } 4677 } 4678 4679 4680 4681 // ------------------------------------------------------------------- 4682 // Interest propagation 4683 4684 4685 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) { 4686 for (int i = 0; i < siblings_.length(); i++) { 4687 RegExpNode* sibling = siblings_.Get(i); 4688 if (sibling->info()->Matches(info)) 4689 return sibling; 4690 } 4691 return NULL; 4692 } 4693 4694 4695 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) { 4696 ASSERT_EQ(false, *cloned); 4697 siblings_.Ensure(this); 4698 RegExpNode* result = TryGetSibling(info); 4699 if (result != NULL) return result; 4700 result = this->Clone(); 4701 NodeInfo* new_info = result->info(); 4702 new_info->ResetCompilationState(); 4703 new_info->AddFromPreceding(info); 4704 AddSibling(result); 4705 *cloned = true; 4706 return result; 4707 } 4708 4709 4710 template <class C> 4711 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) { 4712 NodeInfo full_info(*node->info()); 4713 full_info.AddFromPreceding(info); 4714 bool cloned = false; 4715 return RegExpNode::EnsureSibling(node, &full_info, &cloned); 4716 } 4717 4718 4719 // ------------------------------------------------------------------- 4720 // Splay tree 4721 4722 4723 OutSet* OutSet::Extend(unsigned value) { 4724 if (Get(value)) 4725 return this; 4726 if (successors() != NULL) { 4727 for (int i = 0; i < successors()->length(); i++) { 4728 OutSet* successor = successors()->at(i); 4729 if (successor->Get(value)) 4730 return successor; 4731 } 4732 } else { 4733 successors_ = new ZoneList<OutSet*>(2); 4734 } 4735 OutSet* result = new OutSet(first_, remaining_); 4736 result->Set(value); 4737 successors()->Add(result); 4738 return result; 4739 } 4740 4741 4742 void OutSet::Set(unsigned value) { 4743 if (value < kFirstLimit) { 4744 first_ |= (1 << value); 4745 } else { 4746 if (remaining_ == NULL) 4747 remaining_ = new ZoneList<unsigned>(1); 4748 if (remaining_->is_empty() || !remaining_->Contains(value)) 4749 remaining_->Add(value); 4750 } 4751 } 4752 4753 4754 bool OutSet::Get(unsigned value) { 4755 if (value < kFirstLimit) { 4756 return (first_ & (1 << value)) != 0; 4757 } else if (remaining_ == NULL) { 4758 return false; 4759 } else { 4760 return remaining_->Contains(value); 4761 } 4762 } 4763 4764 4765 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 4766 const DispatchTable::Entry DispatchTable::Config::kNoValue; 4767 4768 4769 void DispatchTable::AddRange(CharacterRange full_range, int value) { 4770 CharacterRange current = full_range; 4771 if (tree()->is_empty()) { 4772 // If this is the first range we just insert into the table. 4773 ZoneSplayTree<Config>::Locator loc; 4774 ASSERT_RESULT(tree()->Insert(current.from(), &loc)); 4775 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); 4776 return; 4777 } 4778 // First see if there is a range to the left of this one that 4779 // overlaps. 4780 ZoneSplayTree<Config>::Locator loc; 4781 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 4782 Entry* entry = &loc.value(); 4783 // If we've found a range that overlaps with this one, and it 4784 // starts strictly to the left of this one, we have to fix it 4785 // because the following code only handles ranges that start on 4786 // or after the start point of the range we're adding. 4787 if (entry->from() < current.from() && entry->to() >= current.from()) { 4788 // Snap the overlapping range in half around the start point of 4789 // the range we're adding. 4790 CharacterRange left(entry->from(), current.from() - 1); 4791 CharacterRange right(current.from(), entry->to()); 4792 // The left part of the overlapping range doesn't overlap. 4793 // Truncate the whole entry to be just the left part. 4794 entry->set_to(left.to()); 4795 // The right part is the one that overlaps. We add this part 4796 // to the map and let the next step deal with merging it with 4797 // the range we're adding. 4798 ZoneSplayTree<Config>::Locator loc; 4799 ASSERT_RESULT(tree()->Insert(right.from(), &loc)); 4800 loc.set_value(Entry(right.from(), 4801 right.to(), 4802 entry->out_set())); 4803 } 4804 } 4805 while (current.is_valid()) { 4806 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 4807 (loc.value().from() <= current.to()) && 4808 (loc.value().to() >= current.from())) { 4809 Entry* entry = &loc.value(); 4810 // We have overlap. If there is space between the start point of 4811 // the range we're adding and where the overlapping range starts 4812 // then we have to add a range covering just that space. 4813 if (current.from() < entry->from()) { 4814 ZoneSplayTree<Config>::Locator ins; 4815 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4816 ins.set_value(Entry(current.from(), 4817 entry->from() - 1, 4818 empty()->Extend(value))); 4819 current.set_from(entry->from()); 4820 } 4821 ASSERT_EQ(current.from(), entry->from()); 4822 // If the overlapping range extends beyond the one we want to add 4823 // we have to snap the right part off and add it separately. 4824 if (entry->to() > current.to()) { 4825 ZoneSplayTree<Config>::Locator ins; 4826 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); 4827 ins.set_value(Entry(current.to() + 1, 4828 entry->to(), 4829 entry->out_set())); 4830 entry->set_to(current.to()); 4831 } 4832 ASSERT(entry->to() <= current.to()); 4833 // The overlapping range is now completely contained by the range 4834 // we're adding so we can just update it and move the start point 4835 // of the range we're adding just past it. 4836 entry->AddValue(value); 4837 // Bail out if the last interval ended at 0xFFFF since otherwise 4838 // adding 1 will wrap around to 0. 4839 if (entry->to() == String::kMaxUC16CharCode) 4840 break; 4841 ASSERT(entry->to() + 1 > current.from()); 4842 current.set_from(entry->to() + 1); 4843 } else { 4844 // There is no overlap so we can just add the range 4845 ZoneSplayTree<Config>::Locator ins; 4846 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4847 ins.set_value(Entry(current.from(), 4848 current.to(), 4849 empty()->Extend(value))); 4850 break; 4851 } 4852 } 4853 } 4854 4855 4856 OutSet* DispatchTable::Get(uc16 value) { 4857 ZoneSplayTree<Config>::Locator loc; 4858 if (!tree()->FindGreatestLessThan(value, &loc)) 4859 return empty(); 4860 Entry* entry = &loc.value(); 4861 if (value <= entry->to()) 4862 return entry->out_set(); 4863 else 4864 return empty(); 4865 } 4866 4867 4868 // ------------------------------------------------------------------- 4869 // Analysis 4870 4871 4872 void Analysis::EnsureAnalyzed(RegExpNode* that) { 4873 StackLimitCheck check(Isolate::Current()); 4874 if (check.HasOverflowed()) { 4875 fail("Stack overflow"); 4876 return; 4877 } 4878 if (that->info()->been_analyzed || that->info()->being_analyzed) 4879 return; 4880 that->info()->being_analyzed = true; 4881 that->Accept(this); 4882 that->info()->being_analyzed = false; 4883 that->info()->been_analyzed = true; 4884 } 4885 4886 4887 void Analysis::VisitEnd(EndNode* that) { 4888 // nothing to do 4889 } 4890 4891 4892 void TextNode::CalculateOffsets() { 4893 int element_count = elements()->length(); 4894 // Set up the offsets of the elements relative to the start. This is a fixed 4895 // quantity since a TextNode can only contain fixed-width things. 4896 int cp_offset = 0; 4897 for (int i = 0; i < element_count; i++) { 4898 TextElement& elm = elements()->at(i); 4899 elm.cp_offset = cp_offset; 4900 if (elm.type == TextElement::ATOM) { 4901 cp_offset += elm.data.u_atom->data().length(); 4902 } else { 4903 cp_offset++; 4904 Vector<const uc16> quarks = elm.data.u_atom->data(); 4905 } 4906 } 4907 } 4908 4909 4910 void Analysis::VisitText(TextNode* that) { 4911 if (ignore_case_) { 4912 that->MakeCaseIndependent(is_ascii_); 4913 } 4914 EnsureAnalyzed(that->on_success()); 4915 if (!has_failed()) { 4916 that->CalculateOffsets(); 4917 } 4918 } 4919 4920 4921 void Analysis::VisitAction(ActionNode* that) { 4922 RegExpNode* target = that->on_success(); 4923 EnsureAnalyzed(target); 4924 if (!has_failed()) { 4925 // If the next node is interested in what it follows then this node 4926 // has to be interested too so it can pass the information on. 4927 that->info()->AddFromFollowing(target->info()); 4928 } 4929 } 4930 4931 4932 void Analysis::VisitChoice(ChoiceNode* that) { 4933 NodeInfo* info = that->info(); 4934 for (int i = 0; i < that->alternatives()->length(); i++) { 4935 RegExpNode* node = that->alternatives()->at(i).node(); 4936 EnsureAnalyzed(node); 4937 if (has_failed()) return; 4938 // Anything the following nodes need to know has to be known by 4939 // this node also, so it can pass it on. 4940 info->AddFromFollowing(node->info()); 4941 } 4942 } 4943 4944 4945 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 4946 NodeInfo* info = that->info(); 4947 for (int i = 0; i < that->alternatives()->length(); i++) { 4948 RegExpNode* node = that->alternatives()->at(i).node(); 4949 if (node != that->loop_node()) { 4950 EnsureAnalyzed(node); 4951 if (has_failed()) return; 4952 info->AddFromFollowing(node->info()); 4953 } 4954 } 4955 // Check the loop last since it may need the value of this node 4956 // to get a correct result. 4957 EnsureAnalyzed(that->loop_node()); 4958 if (!has_failed()) { 4959 info->AddFromFollowing(that->loop_node()->info()); 4960 } 4961 } 4962 4963 4964 void Analysis::VisitBackReference(BackReferenceNode* that) { 4965 EnsureAnalyzed(that->on_success()); 4966 } 4967 4968 4969 void Analysis::VisitAssertion(AssertionNode* that) { 4970 EnsureAnalyzed(that->on_success()); 4971 AssertionNode::AssertionNodeType type = that->type(); 4972 if (type == AssertionNode::AT_BOUNDARY || 4973 type == AssertionNode::AT_NON_BOUNDARY) { 4974 // Check if the following character is known to be a word character 4975 // or known to not be a word character. 4976 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); 4977 4978 CharacterRange::Canonicalize(following_chars); 4979 4980 SetRelation word_relation = 4981 CharacterRange::WordCharacterRelation(following_chars); 4982 if (word_relation.Disjoint()) { 4983 // Includes the case where following_chars is empty (e.g., end-of-input). 4984 // Following character is definitely *not* a word character. 4985 type = (type == AssertionNode::AT_BOUNDARY) ? 4986 AssertionNode::AFTER_WORD_CHARACTER : 4987 AssertionNode::AFTER_NONWORD_CHARACTER; 4988 that->set_type(type); 4989 } else if (word_relation.ContainedIn()) { 4990 // Following character is definitely a word character. 4991 type = (type == AssertionNode::AT_BOUNDARY) ? 4992 AssertionNode::AFTER_NONWORD_CHARACTER : 4993 AssertionNode::AFTER_WORD_CHARACTER; 4994 that->set_type(type); 4995 } 4996 } 4997 } 4998 4999 5000 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { 5001 if (first_character_set_ == NULL) { 5002 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { 5003 // If we can't find an exact solution within the budget, we 5004 // set the value to the set of every character, i.e., all characters 5005 // are possible. 5006 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); 5007 all_set->Add(CharacterRange::Everything()); 5008 first_character_set_ = all_set; 5009 } 5010 } 5011 return first_character_set_; 5012 } 5013 5014 5015 int RegExpNode::ComputeFirstCharacterSet(int budget) { 5016 // Default behavior is to not be able to determine the first character. 5017 return kComputeFirstCharacterSetFail; 5018 } 5019 5020 5021 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { 5022 budget--; 5023 if (budget >= 0) { 5024 // Find loop min-iteration. It's the value of the guarded choice node 5025 // with a GEQ guard, if any. 5026 int min_repetition = 0; 5027 5028 for (int i = 0; i <= 1; i++) { 5029 GuardedAlternative alternative = alternatives()->at(i); 5030 ZoneList<Guard*>* guards = alternative.guards(); 5031 if (guards != NULL && guards->length() > 0) { 5032 Guard* guard = guards->at(0); 5033 if (guard->op() == Guard::GEQ) { 5034 min_repetition = guard->value(); 5035 break; 5036 } 5037 } 5038 } 5039 5040 budget = loop_node()->ComputeFirstCharacterSet(budget); 5041 if (budget >= 0) { 5042 ZoneList<CharacterRange>* character_set = 5043 loop_node()->first_character_set(); 5044 if (body_can_be_zero_length() || min_repetition == 0) { 5045 budget = continue_node()->ComputeFirstCharacterSet(budget); 5046 if (budget < 0) return budget; 5047 ZoneList<CharacterRange>* body_set = 5048 continue_node()->first_character_set(); 5049 ZoneList<CharacterRange>* union_set = 5050 new ZoneList<CharacterRange>(Max(character_set->length(), 5051 body_set->length())); 5052 CharacterRange::Merge(character_set, 5053 body_set, 5054 union_set, 5055 union_set, 5056 union_set); 5057 character_set = union_set; 5058 } 5059 set_first_character_set(character_set); 5060 } 5061 } 5062 return budget; 5063 } 5064 5065 5066 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { 5067 budget--; 5068 if (budget >= 0) { 5069 GuardedAlternative successor = this->alternatives()->at(1); 5070 RegExpNode* successor_node = successor.node(); 5071 budget = successor_node->ComputeFirstCharacterSet(budget); 5072 if (budget >= 0) { 5073 set_first_character_set(successor_node->first_character_set()); 5074 } 5075 } 5076 return budget; 5077 } 5078 5079 5080 // The first character set of an EndNode is unknowable. Just use the 5081 // default implementation that fails and returns all characters as possible. 5082 5083 5084 int AssertionNode::ComputeFirstCharacterSet(int budget) { 5085 budget -= 1; 5086 if (budget >= 0) { 5087 switch (type_) { 5088 case AT_END: { 5089 set_first_character_set(new ZoneList<CharacterRange>(0)); 5090 break; 5091 } 5092 case AT_START: 5093 case AT_BOUNDARY: 5094 case AT_NON_BOUNDARY: 5095 case AFTER_NEWLINE: 5096 case AFTER_NONWORD_CHARACTER: 5097 case AFTER_WORD_CHARACTER: { 5098 ASSERT_NOT_NULL(on_success()); 5099 budget = on_success()->ComputeFirstCharacterSet(budget); 5100 if (budget >= 0) { 5101 set_first_character_set(on_success()->first_character_set()); 5102 } 5103 break; 5104 } 5105 } 5106 } 5107 return budget; 5108 } 5109 5110 5111 int ActionNode::ComputeFirstCharacterSet(int budget) { 5112 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; 5113 budget--; 5114 if (budget >= 0) { 5115 ASSERT_NOT_NULL(on_success()); 5116 budget = on_success()->ComputeFirstCharacterSet(budget); 5117 if (budget >= 0) { 5118 set_first_character_set(on_success()->first_character_set()); 5119 } 5120 } 5121 return budget; 5122 } 5123 5124 5125 int BackReferenceNode::ComputeFirstCharacterSet(int budget) { 5126 // We don't know anything about the first character of a backreference 5127 // at this point. 5128 // The potential first characters are the first characters of the capture, 5129 // and the first characters of the on_success node, depending on whether the 5130 // capture can be empty and whether it is known to be participating or known 5131 // not to be. 5132 return kComputeFirstCharacterSetFail; 5133 } 5134 5135 5136 int TextNode::ComputeFirstCharacterSet(int budget) { 5137 budget--; 5138 if (budget >= 0) { 5139 ASSERT_NE(0, elements()->length()); 5140 TextElement text = elements()->at(0); 5141 if (text.type == TextElement::ATOM) { 5142 RegExpAtom* atom = text.data.u_atom; 5143 ASSERT_NE(0, atom->length()); 5144 uc16 first_char = atom->data()[0]; 5145 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); 5146 range->Add(CharacterRange(first_char, first_char)); 5147 set_first_character_set(range); 5148 } else { 5149 ASSERT(text.type == TextElement::CHAR_CLASS); 5150 RegExpCharacterClass* char_class = text.data.u_char_class; 5151 ZoneList<CharacterRange>* ranges = char_class->ranges(); 5152 // TODO(lrn): Canonicalize ranges when they are created 5153 // instead of waiting until now. 5154 CharacterRange::Canonicalize(ranges); 5155 if (char_class->is_negated()) { 5156 int length = ranges->length(); 5157 int new_length = length + 1; 5158 if (length > 0) { 5159 if (ranges->at(0).from() == 0) new_length--; 5160 if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) { 5161 new_length--; 5162 } 5163 } 5164 ZoneList<CharacterRange>* negated_ranges = 5165 new ZoneList<CharacterRange>(new_length); 5166 CharacterRange::Negate(ranges, negated_ranges); 5167 set_first_character_set(negated_ranges); 5168 } else { 5169 set_first_character_set(ranges); 5170 } 5171 } 5172 } 5173 return budget; 5174 } 5175 5176 5177 5178 // ------------------------------------------------------------------- 5179 // Dispatch table construction 5180 5181 5182 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5183 AddRange(CharacterRange::Everything()); 5184 } 5185 5186 5187 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5188 node->set_being_calculated(true); 5189 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5190 for (int i = 0; i < alternatives->length(); i++) { 5191 set_choice_index(i); 5192 alternatives->at(i).node()->Accept(this); 5193 } 5194 node->set_being_calculated(false); 5195 } 5196 5197 5198 class AddDispatchRange { 5199 public: 5200 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5201 : constructor_(constructor) { } 5202 void Call(uc32 from, DispatchTable::Entry entry); 5203 private: 5204 DispatchTableConstructor* constructor_; 5205 }; 5206 5207 5208 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5209 CharacterRange range(from, entry.to()); 5210 constructor_->AddRange(range); 5211 } 5212 5213 5214 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5215 if (node->being_calculated()) 5216 return; 5217 DispatchTable* table = node->GetTable(ignore_case_); 5218 AddDispatchRange adder(this); 5219 table->ForEach(&adder); 5220 } 5221 5222 5223 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5224 // TODO(160): Find the node that we refer back to and propagate its start 5225 // set back to here. For now we just accept anything. 5226 AddRange(CharacterRange::Everything()); 5227 } 5228 5229 5230 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5231 RegExpNode* target = that->on_success(); 5232 target->Accept(this); 5233 } 5234 5235 5236 static int CompareRangeByFrom(const CharacterRange* a, 5237 const CharacterRange* b) { 5238 return Compare<uc16>(a->from(), b->from()); 5239 } 5240 5241 5242 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5243 ranges->Sort(CompareRangeByFrom); 5244 uc16 last = 0; 5245 for (int i = 0; i < ranges->length(); i++) { 5246 CharacterRange range = ranges->at(i); 5247 if (last < range.from()) 5248 AddRange(CharacterRange(last, range.from() - 1)); 5249 if (range.to() >= last) { 5250 if (range.to() == String::kMaxUC16CharCode) { 5251 return; 5252 } else { 5253 last = range.to() + 1; 5254 } 5255 } 5256 } 5257 AddRange(CharacterRange(last, String::kMaxUC16CharCode)); 5258 } 5259 5260 5261 void DispatchTableConstructor::VisitText(TextNode* that) { 5262 TextElement elm = that->elements()->at(0); 5263 switch (elm.type) { 5264 case TextElement::ATOM: { 5265 uc16 c = elm.data.u_atom->data()[0]; 5266 AddRange(CharacterRange(c, c)); 5267 break; 5268 } 5269 case TextElement::CHAR_CLASS: { 5270 RegExpCharacterClass* tree = elm.data.u_char_class; 5271 ZoneList<CharacterRange>* ranges = tree->ranges(); 5272 if (tree->is_negated()) { 5273 AddInverse(ranges); 5274 } else { 5275 for (int i = 0; i < ranges->length(); i++) 5276 AddRange(ranges->at(i)); 5277 } 5278 break; 5279 } 5280 default: { 5281 UNIMPLEMENTED(); 5282 } 5283 } 5284 } 5285 5286 5287 void DispatchTableConstructor::VisitAction(ActionNode* that) { 5288 RegExpNode* target = that->on_success(); 5289 target->Accept(this); 5290 } 5291 5292 5293 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, 5294 bool ignore_case, 5295 bool is_multiline, 5296 Handle<String> pattern, 5297 bool is_ascii) { 5298 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 5299 return IrregexpRegExpTooBig(); 5300 } 5301 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); 5302 // Wrap the body of the regexp in capture #0. 5303 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 5304 0, 5305 &compiler, 5306 compiler.accept()); 5307 RegExpNode* node = captured_body; 5308 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 5309 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 5310 int max_length = data->tree->max_match(); 5311 if (!is_start_anchored) { 5312 // Add a .*? at the beginning, outside the body capture, unless 5313 // this expression is anchored at the beginning. 5314 RegExpNode* loop_node = 5315 RegExpQuantifier::ToNode(0, 5316 RegExpTree::kInfinity, 5317 false, 5318 new RegExpCharacterClass('*'), 5319 &compiler, 5320 captured_body, 5321 data->contains_anchor); 5322 5323 if (data->contains_anchor) { 5324 // Unroll loop once, to take care of the case that might start 5325 // at the start of input. 5326 ChoiceNode* first_step_node = new ChoiceNode(2); 5327 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 5328 first_step_node->AddAlternative(GuardedAlternative( 5329 new TextNode(new RegExpCharacterClass('*'), loop_node))); 5330 node = first_step_node; 5331 } else { 5332 node = loop_node; 5333 } 5334 } 5335 data->node = node; 5336 Analysis analysis(ignore_case, is_ascii); 5337 analysis.EnsureAnalyzed(node); 5338 if (analysis.has_failed()) { 5339 const char* error_message = analysis.error_message(); 5340 return CompilationResult(error_message); 5341 } 5342 5343 NodeInfo info = *node->info(); 5344 5345 // Create the correct assembler for the architecture. 5346 #ifndef V8_INTERPRETED_REGEXP 5347 // Native regexp implementation. 5348 5349 NativeRegExpMacroAssembler::Mode mode = 5350 is_ascii ? NativeRegExpMacroAssembler::ASCII 5351 : NativeRegExpMacroAssembler::UC16; 5352 5353 #if V8_TARGET_ARCH_IA32 5354 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2); 5355 #elif V8_TARGET_ARCH_X64 5356 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2); 5357 #elif V8_TARGET_ARCH_ARM 5358 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2); 5359 #elif V8_TARGET_ARCH_MIPS 5360 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2); 5361 #endif 5362 5363 #else // V8_INTERPRETED_REGEXP 5364 // Interpreted regexp implementation. 5365 EmbeddedVector<byte, 1024> codes; 5366 RegExpMacroAssemblerIrregexp macro_assembler(codes); 5367 #endif // V8_INTERPRETED_REGEXP 5368 5369 // Inserted here, instead of in Assembler, because it depends on information 5370 // in the AST that isn't replicated in the Node structure. 5371 static const int kMaxBacksearchLimit = 1024; 5372 if (is_end_anchored && 5373 !is_start_anchored && 5374 max_length < kMaxBacksearchLimit) { 5375 macro_assembler.SetCurrentPositionFromEnd(max_length); 5376 } 5377 5378 return compiler.Assemble(¯o_assembler, 5379 node, 5380 data->capture_count, 5381 pattern); 5382 } 5383 5384 5385 }} // namespace v8::internal 5386