1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "compiler.h" 32 #include "execution.h" 33 #include "factory.h" 34 #include "jsregexp.h" 35 #include "platform.h" 36 #include "runtime.h" 37 #include "top.h" 38 #include "compilation-cache.h" 39 #include "string-stream.h" 40 #include "parser.h" 41 #include "regexp-macro-assembler.h" 42 #include "regexp-macro-assembler-tracer.h" 43 #include "regexp-macro-assembler-irregexp.h" 44 #include "regexp-stack.h" 45 46 #ifdef V8_NATIVE_REGEXP 47 #if V8_TARGET_ARCH_IA32 48 #include "ia32/regexp-macro-assembler-ia32.h" 49 #elif V8_TARGET_ARCH_X64 50 #include "x64/regexp-macro-assembler-x64.h" 51 #elif V8_TARGET_ARCH_ARM 52 #include "arm/regexp-macro-assembler-arm.h" 53 #else 54 #error Unsupported target architecture. 55 #endif 56 #endif 57 58 #include "interpreter-irregexp.h" 59 60 61 namespace v8 { 62 namespace internal { 63 64 65 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor, 66 Handle<String> pattern, 67 Handle<String> flags, 68 bool* has_pending_exception) { 69 // Call the construct code with 2 arguments. 70 Object** argv[2] = { Handle<Object>::cast(pattern).location(), 71 Handle<Object>::cast(flags).location() }; 72 return Execution::New(constructor, 2, argv, has_pending_exception); 73 } 74 75 76 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 77 int flags = JSRegExp::NONE; 78 for (int i = 0; i < str->length(); i++) { 79 switch (str->Get(i)) { 80 case 'i': 81 flags |= JSRegExp::IGNORE_CASE; 82 break; 83 case 'g': 84 flags |= JSRegExp::GLOBAL; 85 break; 86 case 'm': 87 flags |= JSRegExp::MULTILINE; 88 break; 89 } 90 } 91 return JSRegExp::Flags(flags); 92 } 93 94 95 static inline void ThrowRegExpException(Handle<JSRegExp> re, 96 Handle<String> pattern, 97 Handle<String> error_text, 98 const char* message) { 99 Handle<JSArray> array = Factory::NewJSArray(2); 100 SetElement(array, 0, pattern); 101 SetElement(array, 1, error_text); 102 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array); 103 Top::Throw(*regexp_err); 104 } 105 106 107 // Generic RegExp methods. Dispatches to implementation specific methods. 108 109 110 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 111 Handle<String> pattern, 112 Handle<String> flag_str) { 113 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 114 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags); 115 bool in_cache = !cached.is_null(); 116 LOG(RegExpCompileEvent(re, in_cache)); 117 118 Handle<Object> result; 119 if (in_cache) { 120 re->set_data(*cached); 121 return re; 122 } 123 FlattenString(pattern); 124 CompilationZoneScope zone_scope(DELETE_ON_EXIT); 125 RegExpCompileData parse_result; 126 FlatStringReader reader(pattern); 127 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) { 128 // Throw an exception if we fail to parse the pattern. 129 ThrowRegExpException(re, 130 pattern, 131 parse_result.error, 132 "malformed_regexp"); 133 return Handle<Object>::null(); 134 } 135 136 if (parse_result.simple && !flags.is_ignore_case()) { 137 // Parse-tree is a single atom that is equal to the pattern. 138 AtomCompile(re, pattern, flags, pattern); 139 } else if (parse_result.tree->IsAtom() && 140 !flags.is_ignore_case() && 141 parse_result.capture_count == 0) { 142 RegExpAtom* atom = parse_result.tree->AsAtom(); 143 Vector<const uc16> atom_pattern = atom->data(); 144 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern); 145 AtomCompile(re, pattern, flags, atom_string); 146 } else { 147 IrregexpPrepare(re, pattern, flags, parse_result.capture_count); 148 } 149 ASSERT(re->data()->IsFixedArray()); 150 // Compilation succeeded so the data is set on the regexp 151 // and we can store it in the cache. 152 Handle<FixedArray> data(FixedArray::cast(re->data())); 153 CompilationCache::PutRegExp(pattern, flags, data); 154 155 return re; 156 } 157 158 159 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 160 Handle<String> subject, 161 int index, 162 Handle<JSArray> last_match_info) { 163 switch (regexp->TypeTag()) { 164 case JSRegExp::ATOM: 165 return AtomExec(regexp, subject, index, last_match_info); 166 case JSRegExp::IRREGEXP: { 167 Handle<Object> result = 168 IrregexpExec(regexp, subject, index, last_match_info); 169 ASSERT(!result.is_null() || Top::has_pending_exception()); 170 return result; 171 } 172 default: 173 UNREACHABLE(); 174 return Handle<Object>::null(); 175 } 176 } 177 178 179 // RegExp Atom implementation: Simple string search using indexOf. 180 181 182 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 183 Handle<String> pattern, 184 JSRegExp::Flags flags, 185 Handle<String> match_pattern) { 186 Factory::SetRegExpAtomData(re, 187 JSRegExp::ATOM, 188 pattern, 189 flags, 190 match_pattern); 191 } 192 193 194 static void SetAtomLastCapture(FixedArray* array, 195 String* subject, 196 int from, 197 int to) { 198 NoHandleAllocation no_handles; 199 RegExpImpl::SetLastCaptureCount(array, 2); 200 RegExpImpl::SetLastSubject(array, subject); 201 RegExpImpl::SetLastInput(array, subject); 202 RegExpImpl::SetCapture(array, 0, from); 203 RegExpImpl::SetCapture(array, 1, to); 204 } 205 206 207 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 208 Handle<String> subject, 209 int index, 210 Handle<JSArray> last_match_info) { 211 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex))); 212 213 uint32_t start_index = index; 214 215 int value = Runtime::StringMatch(subject, needle, start_index); 216 if (value == -1) return Factory::null_value(); 217 ASSERT(last_match_info->HasFastElements()); 218 219 { 220 NoHandleAllocation no_handles; 221 FixedArray* array = FixedArray::cast(last_match_info->elements()); 222 SetAtomLastCapture(array, *subject, value, value + needle->length()); 223 } 224 return last_match_info; 225 } 226 227 228 // Irregexp implementation. 229 230 // Ensures that the regexp object contains a compiled version of the 231 // source for either ASCII or non-ASCII strings. 232 // If the compiled version doesn't already exist, it is compiled 233 // from the source pattern. 234 // If compilation fails, an exception is thrown and this function 235 // returns false. 236 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { 237 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 238 #ifdef V8_NATIVE_REGEXP 239 if (compiled_code->IsCode()) return true; 240 #else // ! V8_NATIVE_REGEXP (RegExp interpreter code) 241 if (compiled_code->IsByteArray()) return true; 242 #endif 243 return CompileIrregexp(re, is_ascii); 244 } 245 246 247 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { 248 // Compile the RegExp. 249 CompilationZoneScope zone_scope(DELETE_ON_EXIT); 250 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 251 if (entry->IsJSObject()) { 252 // If it's a JSObject, a previous compilation failed and threw this object. 253 // Re-throw the object without trying again. 254 Top::Throw(entry); 255 return false; 256 } 257 ASSERT(entry->IsTheHole()); 258 259 JSRegExp::Flags flags = re->GetFlags(); 260 261 Handle<String> pattern(re->Pattern()); 262 if (!pattern->IsFlat()) { 263 FlattenString(pattern); 264 } 265 266 RegExpCompileData compile_data; 267 FlatStringReader reader(pattern); 268 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) { 269 // Throw an exception if we fail to parse the pattern. 270 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 271 ThrowRegExpException(re, 272 pattern, 273 compile_data.error, 274 "malformed_regexp"); 275 return false; 276 } 277 RegExpEngine::CompilationResult result = 278 RegExpEngine::Compile(&compile_data, 279 flags.is_ignore_case(), 280 flags.is_multiline(), 281 pattern, 282 is_ascii); 283 if (result.error_message != NULL) { 284 // Unable to compile regexp. 285 Handle<JSArray> array = Factory::NewJSArray(2); 286 SetElement(array, 0, pattern); 287 SetElement(array, 288 1, 289 Factory::NewStringFromUtf8(CStrVector(result.error_message))); 290 Handle<Object> regexp_err = 291 Factory::NewSyntaxError("malformed_regexp", array); 292 Top::Throw(*regexp_err); 293 re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err); 294 return false; 295 } 296 297 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 298 data->set(JSRegExp::code_index(is_ascii), result.code); 299 int register_max = IrregexpMaxRegisterCount(*data); 300 if (result.num_registers > register_max) { 301 SetIrregexpMaxRegisterCount(*data, result.num_registers); 302 } 303 304 return true; 305 } 306 307 308 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 309 return Smi::cast( 310 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 311 } 312 313 314 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 315 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 316 } 317 318 319 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 320 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 321 } 322 323 324 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 325 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 326 } 327 328 329 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 330 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 331 } 332 333 334 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 335 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 336 } 337 338 339 void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re, 340 Handle<String> pattern, 341 JSRegExp::Flags flags, 342 int capture_count) { 343 // Initialize compiled code entries to null. 344 Factory::SetRegExpIrregexpData(re, 345 JSRegExp::IRREGEXP, 346 pattern, 347 flags, 348 capture_count); 349 } 350 351 352 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp, 353 Handle<String> subject, 354 int previous_index, 355 Handle<JSArray> last_match_info) { 356 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP); 357 358 // Prepare space for the return values. 359 int number_of_capture_registers = 360 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2; 361 362 #ifndef V8_NATIVE_REGEXP 363 #ifdef DEBUG 364 if (FLAG_trace_regexp_bytecodes) { 365 String* pattern = jsregexp->Pattern(); 366 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); 367 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString())); 368 } 369 #endif 370 #endif 371 372 if (!subject->IsFlat()) { 373 FlattenString(subject); 374 } 375 376 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead); 377 378 Handle<FixedArray> array; 379 380 // Dispatch to the correct RegExp implementation. 381 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data())); 382 383 #ifdef V8_NATIVE_REGEXP 384 385 OffsetsVector captures(number_of_capture_registers); 386 int* captures_vector = captures.vector(); 387 NativeRegExpMacroAssembler::Result res; 388 do { 389 bool is_ascii = subject->IsAsciiRepresentation(); 390 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) { 391 return Handle<Object>::null(); 392 } 393 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii)); 394 res = NativeRegExpMacroAssembler::Match(code, 395 subject, 396 captures_vector, 397 captures.length(), 398 previous_index); 399 // If result is RETRY, the string have changed representation, and we 400 // must restart from scratch. 401 } while (res == NativeRegExpMacroAssembler::RETRY); 402 if (res == NativeRegExpMacroAssembler::EXCEPTION) { 403 ASSERT(Top::has_pending_exception()); 404 return Handle<Object>::null(); 405 } 406 ASSERT(res == NativeRegExpMacroAssembler::SUCCESS 407 || res == NativeRegExpMacroAssembler::FAILURE); 408 409 if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value(); 410 411 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements())); 412 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead); 413 // The captures come in (start, end+1) pairs. 414 for (int i = 0; i < number_of_capture_registers; i += 2) { 415 // Capture values are relative to start_offset only. 416 // Convert them to be relative to start of string. 417 if (captures_vector[i] >= 0) { 418 captures_vector[i] += previous_index; 419 } 420 if (captures_vector[i + 1] >= 0) { 421 captures_vector[i + 1] += previous_index; 422 } 423 SetCapture(*array, i, captures_vector[i]); 424 SetCapture(*array, i + 1, captures_vector[i + 1]); 425 } 426 427 #else // ! V8_NATIVE_REGEXP 428 429 bool is_ascii = subject->IsAsciiRepresentation(); 430 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) { 431 return Handle<Object>::null(); 432 } 433 // Now that we have done EnsureCompiledIrregexp we can get the number of 434 // registers. 435 int number_of_registers = 436 IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data())); 437 OffsetsVector registers(number_of_registers); 438 int* register_vector = registers.vector(); 439 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 440 register_vector[i] = -1; 441 } 442 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii)); 443 444 if (!IrregexpInterpreter::Match(byte_codes, 445 subject, 446 register_vector, 447 previous_index)) { 448 return Factory::null_value(); 449 } 450 451 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements())); 452 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead); 453 // The captures come in (start, end+1) pairs. 454 for (int i = 0; i < number_of_capture_registers; i += 2) { 455 SetCapture(*array, i, register_vector[i]); 456 SetCapture(*array, i + 1, register_vector[i + 1]); 457 } 458 459 #endif // V8_NATIVE_REGEXP 460 461 SetLastCaptureCount(*array, number_of_capture_registers); 462 SetLastSubject(*array, *subject); 463 SetLastInput(*array, *subject); 464 465 return last_match_info; 466 } 467 468 469 // ------------------------------------------------------------------- 470 // Implementation of the Irregexp regular expression engine. 471 // 472 // The Irregexp regular expression engine is intended to be a complete 473 // implementation of ECMAScript regular expressions. It generates either 474 // bytecodes or native code. 475 476 // The Irregexp regexp engine is structured in three steps. 477 // 1) The parser generates an abstract syntax tree. See ast.cc. 478 // 2) From the AST a node network is created. The nodes are all 479 // subclasses of RegExpNode. The nodes represent states when 480 // executing a regular expression. Several optimizations are 481 // performed on the node network. 482 // 3) From the nodes we generate either byte codes or native code 483 // that can actually execute the regular expression (perform 484 // the search). The code generation step is described in more 485 // detail below. 486 487 // Code generation. 488 // 489 // The nodes are divided into four main categories. 490 // * Choice nodes 491 // These represent places where the regular expression can 492 // match in more than one way. For example on entry to an 493 // alternation (foo|bar) or a repetition (*, +, ? or {}). 494 // * Action nodes 495 // These represent places where some action should be 496 // performed. Examples include recording the current position 497 // in the input string to a register (in order to implement 498 // captures) or other actions on register for example in order 499 // to implement the counters needed for {} repetitions. 500 // * Matching nodes 501 // These attempt to match some element part of the input string. 502 // Examples of elements include character classes, plain strings 503 // or back references. 504 // * End nodes 505 // These are used to implement the actions required on finding 506 // a successful match or failing to find a match. 507 // 508 // The code generated (whether as byte codes or native code) maintains 509 // some state as it runs. This consists of the following elements: 510 // 511 // * The capture registers. Used for string captures. 512 // * Other registers. Used for counters etc. 513 // * The current position. 514 // * The stack of backtracking information. Used when a matching node 515 // fails to find a match and needs to try an alternative. 516 // 517 // Conceptual regular expression execution model: 518 // 519 // There is a simple conceptual model of regular expression execution 520 // which will be presented first. The actual code generated is a more 521 // efficient simulation of the simple conceptual model: 522 // 523 // * Choice nodes are implemented as follows: 524 // For each choice except the last { 525 // push current position 526 // push backtrack code location 527 // <generate code to test for choice> 528 // backtrack code location: 529 // pop current position 530 // } 531 // <generate code to test for last choice> 532 // 533 // * Actions nodes are generated as follows 534 // <push affected registers on backtrack stack> 535 // <generate code to perform action> 536 // push backtrack code location 537 // <generate code to test for following nodes> 538 // backtrack code location: 539 // <pop affected registers to restore their state> 540 // <pop backtrack location from stack and go to it> 541 // 542 // * Matching nodes are generated as follows: 543 // if input string matches at current position 544 // update current position 545 // <generate code to test for following nodes> 546 // else 547 // <pop backtrack location from stack and go to it> 548 // 549 // Thus it can be seen that the current position is saved and restored 550 // by the choice nodes, whereas the registers are saved and restored by 551 // by the action nodes that manipulate them. 552 // 553 // The other interesting aspect of this model is that nodes are generated 554 // at the point where they are needed by a recursive call to Emit(). If 555 // the node has already been code generated then the Emit() call will 556 // generate a jump to the previously generated code instead. In order to 557 // limit recursion it is possible for the Emit() function to put the node 558 // on a work list for later generation and instead generate a jump. The 559 // destination of the jump is resolved later when the code is generated. 560 // 561 // Actual regular expression code generation. 562 // 563 // Code generation is actually more complicated than the above. In order 564 // to improve the efficiency of the generated code some optimizations are 565 // performed 566 // 567 // * Choice nodes have 1-character lookahead. 568 // A choice node looks at the following character and eliminates some of 569 // the choices immediately based on that character. This is not yet 570 // implemented. 571 // * Simple greedy loops store reduced backtracking information. 572 // A quantifier like /.*foo/m will greedily match the whole input. It will 573 // then need to backtrack to a point where it can match "foo". The naive 574 // implementation of this would push each character position onto the 575 // backtracking stack, then pop them off one by one. This would use space 576 // proportional to the length of the input string. However since the "." 577 // can only match in one way and always has a constant length (in this case 578 // of 1) it suffices to store the current position on the top of the stack 579 // once. Matching now becomes merely incrementing the current position and 580 // backtracking becomes decrementing the current position and checking the 581 // result against the stored current position. This is faster and saves 582 // space. 583 // * The current state is virtualized. 584 // This is used to defer expensive operations until it is clear that they 585 // are needed and to generate code for a node more than once, allowing 586 // specialized an efficient versions of the code to be created. This is 587 // explained in the section below. 588 // 589 // Execution state virtualization. 590 // 591 // Instead of emitting code, nodes that manipulate the state can record their 592 // manipulation in an object called the Trace. The Trace object can record a 593 // current position offset, an optional backtrack code location on the top of 594 // the virtualized backtrack stack and some register changes. When a node is 595 // to be emitted it can flush the Trace or update it. Flushing the Trace 596 // will emit code to bring the actual state into line with the virtual state. 597 // Avoiding flushing the state can postpone some work (eg updates of capture 598 // registers). Postponing work can save time when executing the regular 599 // expression since it may be found that the work never has to be done as a 600 // failure to match can occur. In addition it is much faster to jump to a 601 // known backtrack code location than it is to pop an unknown backtrack 602 // location from the stack and jump there. 603 // 604 // The virtual state found in the Trace affects code generation. For example 605 // the virtual state contains the difference between the actual current 606 // position and the virtual current position, and matching code needs to use 607 // this offset to attempt a match in the correct location of the input 608 // string. Therefore code generated for a non-trivial trace is specialized 609 // to that trace. The code generator therefore has the ability to generate 610 // code for each node several times. In order to limit the size of the 611 // generated code there is an arbitrary limit on how many specialized sets of 612 // code may be generated for a given node. If the limit is reached, the 613 // trace is flushed and a generic version of the code for a node is emitted. 614 // This is subsequently used for that node. The code emitted for non-generic 615 // trace is not recorded in the node and so it cannot currently be reused in 616 // the event that code generation is requested for an identical trace. 617 618 619 void RegExpTree::AppendToText(RegExpText* text) { 620 UNREACHABLE(); 621 } 622 623 624 void RegExpAtom::AppendToText(RegExpText* text) { 625 text->AddElement(TextElement::Atom(this)); 626 } 627 628 629 void RegExpCharacterClass::AppendToText(RegExpText* text) { 630 text->AddElement(TextElement::CharClass(this)); 631 } 632 633 634 void RegExpText::AppendToText(RegExpText* text) { 635 for (int i = 0; i < elements()->length(); i++) 636 text->AddElement(elements()->at(i)); 637 } 638 639 640 TextElement TextElement::Atom(RegExpAtom* atom) { 641 TextElement result = TextElement(ATOM); 642 result.data.u_atom = atom; 643 return result; 644 } 645 646 647 TextElement TextElement::CharClass( 648 RegExpCharacterClass* char_class) { 649 TextElement result = TextElement(CHAR_CLASS); 650 result.data.u_char_class = char_class; 651 return result; 652 } 653 654 655 int TextElement::length() { 656 if (type == ATOM) { 657 return data.u_atom->length(); 658 } else { 659 ASSERT(type == CHAR_CLASS); 660 return 1; 661 } 662 } 663 664 665 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 666 if (table_ == NULL) { 667 table_ = new DispatchTable(); 668 DispatchTableConstructor cons(table_, ignore_case); 669 cons.BuildTable(this); 670 } 671 return table_; 672 } 673 674 675 class RegExpCompiler { 676 public: 677 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); 678 679 int AllocateRegister() { 680 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 681 reg_exp_too_big_ = true; 682 return next_register_; 683 } 684 return next_register_++; 685 } 686 687 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 688 RegExpNode* start, 689 int capture_count, 690 Handle<String> pattern); 691 692 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 693 694 static const int kImplementationOffset = 0; 695 static const int kNumberOfRegistersOffset = 0; 696 static const int kCodeOffset = 1; 697 698 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 699 EndNode* accept() { return accept_; } 700 701 static const int kMaxRecursion = 100; 702 inline int recursion_depth() { return recursion_depth_; } 703 inline void IncrementRecursionDepth() { recursion_depth_++; } 704 inline void DecrementRecursionDepth() { recursion_depth_--; } 705 706 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 707 708 inline bool ignore_case() { return ignore_case_; } 709 inline bool ascii() { return ascii_; } 710 711 static const int kNoRegister = -1; 712 private: 713 EndNode* accept_; 714 int next_register_; 715 List<RegExpNode*>* work_list_; 716 int recursion_depth_; 717 RegExpMacroAssembler* macro_assembler_; 718 bool ignore_case_; 719 bool ascii_; 720 bool reg_exp_too_big_; 721 }; 722 723 724 class RecursionCheck { 725 public: 726 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 727 compiler->IncrementRecursionDepth(); 728 } 729 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 730 private: 731 RegExpCompiler* compiler_; 732 }; 733 734 735 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 736 return RegExpEngine::CompilationResult("RegExp too big"); 737 } 738 739 740 // Attempts to compile the regexp using an Irregexp code generator. Returns 741 // a fixed array or a null handle depending on whether it succeeded. 742 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) 743 : next_register_(2 * (capture_count + 1)), 744 work_list_(NULL), 745 recursion_depth_(0), 746 ignore_case_(ignore_case), 747 ascii_(ascii), 748 reg_exp_too_big_(false) { 749 accept_ = new EndNode(EndNode::ACCEPT); 750 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 751 } 752 753 754 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 755 RegExpMacroAssembler* macro_assembler, 756 RegExpNode* start, 757 int capture_count, 758 Handle<String> pattern) { 759 #ifdef DEBUG 760 if (FLAG_trace_regexp_assembler) 761 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 762 else 763 #endif 764 macro_assembler_ = macro_assembler; 765 List <RegExpNode*> work_list(0); 766 work_list_ = &work_list; 767 Label fail; 768 macro_assembler_->PushBacktrack(&fail); 769 Trace new_trace; 770 start->Emit(this, &new_trace); 771 macro_assembler_->Bind(&fail); 772 macro_assembler_->Fail(); 773 while (!work_list.is_empty()) { 774 work_list.RemoveLast()->Emit(this, &new_trace); 775 } 776 if (reg_exp_too_big_) return IrregexpRegExpTooBig(); 777 778 Handle<Object> code = macro_assembler_->GetCode(pattern); 779 780 work_list_ = NULL; 781 #ifdef DEBUG 782 if (FLAG_trace_regexp_assembler) { 783 delete macro_assembler_; 784 } 785 #endif 786 return RegExpEngine::CompilationResult(*code, next_register_); 787 } 788 789 790 bool Trace::DeferredAction::Mentions(int that) { 791 if (type() == ActionNode::CLEAR_CAPTURES) { 792 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 793 return range.Contains(that); 794 } else { 795 return reg() == that; 796 } 797 } 798 799 800 bool Trace::mentions_reg(int reg) { 801 for (DeferredAction* action = actions_; 802 action != NULL; 803 action = action->next()) { 804 if (action->Mentions(reg)) 805 return true; 806 } 807 return false; 808 } 809 810 811 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 812 ASSERT_EQ(0, *cp_offset); 813 for (DeferredAction* action = actions_; 814 action != NULL; 815 action = action->next()) { 816 if (action->Mentions(reg)) { 817 if (action->type() == ActionNode::STORE_POSITION) { 818 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 819 return true; 820 } else { 821 return false; 822 } 823 } 824 } 825 return false; 826 } 827 828 829 int Trace::FindAffectedRegisters(OutSet* affected_registers) { 830 int max_register = RegExpCompiler::kNoRegister; 831 for (DeferredAction* action = actions_; 832 action != NULL; 833 action = action->next()) { 834 if (action->type() == ActionNode::CLEAR_CAPTURES) { 835 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 836 for (int i = range.from(); i <= range.to(); i++) 837 affected_registers->Set(i); 838 if (range.to() > max_register) max_register = range.to(); 839 } else { 840 affected_registers->Set(action->reg()); 841 if (action->reg() > max_register) max_register = action->reg(); 842 } 843 } 844 return max_register; 845 } 846 847 848 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 849 int max_register, 850 OutSet& registers_to_pop, 851 OutSet& registers_to_clear) { 852 for (int reg = max_register; reg >= 0; reg--) { 853 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg); 854 else if (registers_to_clear.Get(reg)) { 855 int clear_to = reg; 856 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 857 reg--; 858 } 859 assembler->ClearRegisters(reg, clear_to); 860 } 861 } 862 } 863 864 865 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 866 int max_register, 867 OutSet& affected_registers, 868 OutSet* registers_to_pop, 869 OutSet* registers_to_clear) { 870 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 871 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 872 873 // Count pushes performed to force a stack limit check occasionally. 874 int pushes = 0; 875 876 for (int reg = 0; reg <= max_register; reg++) { 877 if (!affected_registers.Get(reg)) { 878 continue; 879 } 880 881 // The chronologically first deferred action in the trace 882 // is used to infer the action needed to restore a register 883 // to its previous state (or not, if it's safe to ignore it). 884 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 885 DeferredActionUndoType undo_action = IGNORE; 886 887 int value = 0; 888 bool absolute = false; 889 bool clear = false; 890 int store_position = -1; 891 // This is a little tricky because we are scanning the actions in reverse 892 // historical order (newest first). 893 for (DeferredAction* action = actions_; 894 action != NULL; 895 action = action->next()) { 896 if (action->Mentions(reg)) { 897 switch (action->type()) { 898 case ActionNode::SET_REGISTER: { 899 Trace::DeferredSetRegister* psr = 900 static_cast<Trace::DeferredSetRegister*>(action); 901 if (!absolute) { 902 value += psr->value(); 903 absolute = true; 904 } 905 // SET_REGISTER is currently only used for newly introduced loop 906 // counters. They can have a significant previous value if they 907 // occour in a loop. TODO(lrn): Propagate this information, so 908 // we can set undo_action to IGNORE if we know there is no value to 909 // restore. 910 undo_action = RESTORE; 911 ASSERT_EQ(store_position, -1); 912 ASSERT(!clear); 913 break; 914 } 915 case ActionNode::INCREMENT_REGISTER: 916 if (!absolute) { 917 value++; 918 } 919 ASSERT_EQ(store_position, -1); 920 ASSERT(!clear); 921 undo_action = RESTORE; 922 break; 923 case ActionNode::STORE_POSITION: { 924 Trace::DeferredCapture* pc = 925 static_cast<Trace::DeferredCapture*>(action); 926 if (!clear && store_position == -1) { 927 store_position = pc->cp_offset(); 928 } 929 930 // For captures we know that stores and clears alternate. 931 // Other register, are never cleared, and if the occur 932 // inside a loop, they might be assigned more than once. 933 if (reg <= 1) { 934 // Registers zero and one, aka "capture zero", is 935 // always set correctly if we succeed. There is no 936 // need to undo a setting on backtrack, because we 937 // will set it again or fail. 938 undo_action = IGNORE; 939 } else { 940 undo_action = pc->is_capture() ? CLEAR : RESTORE; 941 } 942 ASSERT(!absolute); 943 ASSERT_EQ(value, 0); 944 break; 945 } 946 case ActionNode::CLEAR_CAPTURES: { 947 // Since we're scanning in reverse order, if we've already 948 // set the position we have to ignore historically earlier 949 // clearing operations. 950 if (store_position == -1) { 951 clear = true; 952 } 953 undo_action = RESTORE; 954 ASSERT(!absolute); 955 ASSERT_EQ(value, 0); 956 break; 957 } 958 default: 959 UNREACHABLE(); 960 break; 961 } 962 } 963 } 964 // Prepare for the undo-action (e.g., push if it's going to be popped). 965 if (undo_action == RESTORE) { 966 pushes++; 967 RegExpMacroAssembler::StackCheckFlag stack_check = 968 RegExpMacroAssembler::kNoStackLimitCheck; 969 if (pushes == push_limit) { 970 stack_check = RegExpMacroAssembler::kCheckStackLimit; 971 pushes = 0; 972 } 973 974 assembler->PushRegister(reg, stack_check); 975 registers_to_pop->Set(reg); 976 } else if (undo_action == CLEAR) { 977 registers_to_clear->Set(reg); 978 } 979 // Perform the chronologically last action (or accumulated increment) 980 // for the register. 981 if (store_position != -1) { 982 assembler->WriteCurrentPositionToRegister(reg, store_position); 983 } else if (clear) { 984 assembler->ClearRegisters(reg, reg); 985 } else if (absolute) { 986 assembler->SetRegister(reg, value); 987 } else if (value != 0) { 988 assembler->AdvanceRegister(reg, value); 989 } 990 } 991 } 992 993 994 // This is called as we come into a loop choice node and some other tricky 995 // nodes. It normalizes the state of the code generator to ensure we can 996 // generate generic code. 997 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 998 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 999 1000 ASSERT(!is_trivial()); 1001 1002 if (actions_ == NULL && backtrack() == NULL) { 1003 // Here we just have some deferred cp advances to fix and we are back to 1004 // a normal situation. We may also have to forget some information gained 1005 // through a quick check that was already performed. 1006 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1007 // Create a new trivial state and generate the node with that. 1008 Trace new_state; 1009 successor->Emit(compiler, &new_state); 1010 return; 1011 } 1012 1013 // Generate deferred actions here along with code to undo them again. 1014 OutSet affected_registers; 1015 1016 if (backtrack() != NULL) { 1017 // Here we have a concrete backtrack location. These are set up by choice 1018 // nodes and so they indicate that we have a deferred save of the current 1019 // position which we may need to emit here. 1020 assembler->PushCurrentPosition(); 1021 } 1022 1023 int max_register = FindAffectedRegisters(&affected_registers); 1024 OutSet registers_to_pop; 1025 OutSet registers_to_clear; 1026 PerformDeferredActions(assembler, 1027 max_register, 1028 affected_registers, 1029 ®isters_to_pop, 1030 ®isters_to_clear); 1031 if (cp_offset_ != 0) { 1032 assembler->AdvanceCurrentPosition(cp_offset_); 1033 } 1034 1035 // Create a new trivial state and generate the node with that. 1036 Label undo; 1037 assembler->PushBacktrack(&undo); 1038 Trace new_state; 1039 successor->Emit(compiler, &new_state); 1040 1041 // On backtrack we need to restore state. 1042 assembler->Bind(&undo); 1043 RestoreAffectedRegisters(assembler, 1044 max_register, 1045 registers_to_pop, 1046 registers_to_clear); 1047 if (backtrack() == NULL) { 1048 assembler->Backtrack(); 1049 } else { 1050 assembler->PopCurrentPosition(); 1051 assembler->GoTo(backtrack()); 1052 } 1053 } 1054 1055 1056 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1057 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1058 1059 // Omit flushing the trace. We discard the entire stack frame anyway. 1060 1061 if (!label()->is_bound()) { 1062 // We are completely independent of the trace, since we ignore it, 1063 // so this code can be used as the generic version. 1064 assembler->Bind(label()); 1065 } 1066 1067 // Throw away everything on the backtrack stack since the start 1068 // of the negative submatch and restore the character position. 1069 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1070 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1071 if (clear_capture_count_ > 0) { 1072 // Clear any captures that might have been performed during the success 1073 // of the body of the negative look-ahead. 1074 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1075 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1076 } 1077 // Now that we have unwound the stack we find at the top of the stack the 1078 // backtrack that the BeginSubmatch node got. 1079 assembler->Backtrack(); 1080 } 1081 1082 1083 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1084 if (!trace->is_trivial()) { 1085 trace->Flush(compiler, this); 1086 return; 1087 } 1088 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1089 if (!label()->is_bound()) { 1090 assembler->Bind(label()); 1091 } 1092 switch (action_) { 1093 case ACCEPT: 1094 assembler->Succeed(); 1095 return; 1096 case BACKTRACK: 1097 assembler->GoTo(trace->backtrack()); 1098 return; 1099 case NEGATIVE_SUBMATCH_SUCCESS: 1100 // This case is handled in a different virtual method. 1101 UNREACHABLE(); 1102 } 1103 UNIMPLEMENTED(); 1104 } 1105 1106 1107 void GuardedAlternative::AddGuard(Guard* guard) { 1108 if (guards_ == NULL) 1109 guards_ = new ZoneList<Guard*>(1); 1110 guards_->Add(guard); 1111 } 1112 1113 1114 ActionNode* ActionNode::SetRegister(int reg, 1115 int val, 1116 RegExpNode* on_success) { 1117 ActionNode* result = new ActionNode(SET_REGISTER, on_success); 1118 result->data_.u_store_register.reg = reg; 1119 result->data_.u_store_register.value = val; 1120 return result; 1121 } 1122 1123 1124 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1125 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); 1126 result->data_.u_increment_register.reg = reg; 1127 return result; 1128 } 1129 1130 1131 ActionNode* ActionNode::StorePosition(int reg, 1132 bool is_capture, 1133 RegExpNode* on_success) { 1134 ActionNode* result = new ActionNode(STORE_POSITION, on_success); 1135 result->data_.u_position_register.reg = reg; 1136 result->data_.u_position_register.is_capture = is_capture; 1137 return result; 1138 } 1139 1140 1141 ActionNode* ActionNode::ClearCaptures(Interval range, 1142 RegExpNode* on_success) { 1143 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success); 1144 result->data_.u_clear_captures.range_from = range.from(); 1145 result->data_.u_clear_captures.range_to = range.to(); 1146 return result; 1147 } 1148 1149 1150 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1151 int position_reg, 1152 RegExpNode* on_success) { 1153 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); 1154 result->data_.u_submatch.stack_pointer_register = stack_reg; 1155 result->data_.u_submatch.current_position_register = position_reg; 1156 return result; 1157 } 1158 1159 1160 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1161 int position_reg, 1162 int clear_register_count, 1163 int clear_register_from, 1164 RegExpNode* on_success) { 1165 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1166 result->data_.u_submatch.stack_pointer_register = stack_reg; 1167 result->data_.u_submatch.current_position_register = position_reg; 1168 result->data_.u_submatch.clear_register_count = clear_register_count; 1169 result->data_.u_submatch.clear_register_from = clear_register_from; 1170 return result; 1171 } 1172 1173 1174 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1175 int repetition_register, 1176 int repetition_limit, 1177 RegExpNode* on_success) { 1178 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success); 1179 result->data_.u_empty_match_check.start_register = start_register; 1180 result->data_.u_empty_match_check.repetition_register = repetition_register; 1181 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1182 return result; 1183 } 1184 1185 1186 #define DEFINE_ACCEPT(Type) \ 1187 void Type##Node::Accept(NodeVisitor* visitor) { \ 1188 visitor->Visit##Type(this); \ 1189 } 1190 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1191 #undef DEFINE_ACCEPT 1192 1193 1194 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1195 visitor->VisitLoopChoice(this); 1196 } 1197 1198 1199 // ------------------------------------------------------------------- 1200 // Emit code. 1201 1202 1203 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1204 Guard* guard, 1205 Trace* trace) { 1206 switch (guard->op()) { 1207 case Guard::LT: 1208 ASSERT(!trace->mentions_reg(guard->reg())); 1209 macro_assembler->IfRegisterGE(guard->reg(), 1210 guard->value(), 1211 trace->backtrack()); 1212 break; 1213 case Guard::GEQ: 1214 ASSERT(!trace->mentions_reg(guard->reg())); 1215 macro_assembler->IfRegisterLT(guard->reg(), 1216 guard->value(), 1217 trace->backtrack()); 1218 break; 1219 } 1220 } 1221 1222 1223 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; 1224 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; 1225 1226 1227 // Returns the number of characters in the equivalence class, omitting those 1228 // that cannot occur in the source string because it is ASCII. 1229 static int GetCaseIndependentLetters(uc16 character, 1230 bool ascii_subject, 1231 unibrow::uchar* letters) { 1232 int length = uncanonicalize.get(character, '\0', letters); 1233 // Unibrow returns 0 or 1 for characters where case independependence is 1234 // trivial. 1235 if (length == 0) { 1236 letters[0] = character; 1237 length = 1; 1238 } 1239 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { 1240 return length; 1241 } 1242 // The standard requires that non-ASCII characters cannot have ASCII 1243 // character codes in their equivalence class. 1244 return 0; 1245 } 1246 1247 1248 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, 1249 uc16 c, 1250 Label* on_failure, 1251 int cp_offset, 1252 bool check, 1253 bool preloaded) { 1254 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1255 bool bound_checked = false; 1256 if (!preloaded) { 1257 assembler->LoadCurrentCharacter( 1258 cp_offset, 1259 on_failure, 1260 check); 1261 bound_checked = true; 1262 } 1263 assembler->CheckNotCharacter(c, on_failure); 1264 return bound_checked; 1265 } 1266 1267 1268 // Only emits non-letters (things that don't have case). Only used for case 1269 // independent matches. 1270 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, 1271 uc16 c, 1272 Label* on_failure, 1273 int cp_offset, 1274 bool check, 1275 bool preloaded) { 1276 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1277 bool ascii = compiler->ascii(); 1278 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1279 int length = GetCaseIndependentLetters(c, ascii, chars); 1280 if (length < 1) { 1281 // This can't match. Must be an ASCII subject and a non-ASCII character. 1282 // We do not need to do anything since the ASCII pass already handled this. 1283 return false; // Bounds not checked. 1284 } 1285 bool checked = false; 1286 // We handle the length > 1 case in a later pass. 1287 if (length == 1) { 1288 if (ascii && c > String::kMaxAsciiCharCodeU) { 1289 // Can't match - see above. 1290 return false; // Bounds not checked. 1291 } 1292 if (!preloaded) { 1293 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1294 checked = check; 1295 } 1296 macro_assembler->CheckNotCharacter(c, on_failure); 1297 } 1298 return checked; 1299 } 1300 1301 1302 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1303 bool ascii, 1304 uc16 c1, 1305 uc16 c2, 1306 Label* on_failure) { 1307 uc16 char_mask; 1308 if (ascii) { 1309 char_mask = String::kMaxAsciiCharCode; 1310 } else { 1311 char_mask = String::kMaxUC16CharCode; 1312 } 1313 uc16 exor = c1 ^ c2; 1314 // Check whether exor has only one bit set. 1315 if (((exor - 1) & exor) == 0) { 1316 // If c1 and c2 differ only by one bit. 1317 // Ecma262UnCanonicalize always gives the highest number last. 1318 ASSERT(c2 > c1); 1319 uc16 mask = char_mask ^ exor; 1320 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1321 return true; 1322 } 1323 ASSERT(c2 > c1); 1324 uc16 diff = c2 - c1; 1325 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1326 // If the characters differ by 2^n but don't differ by one bit then 1327 // subtract the difference from the found character, then do the or 1328 // trick. We avoid the theoretical case where negative numbers are 1329 // involved in order to simplify code generation. 1330 uc16 mask = char_mask ^ diff; 1331 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1332 diff, 1333 mask, 1334 on_failure); 1335 return true; 1336 } 1337 return false; 1338 } 1339 1340 1341 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, 1342 uc16 c, 1343 Label* on_failure, 1344 int cp_offset, 1345 bool check, 1346 bool preloaded); 1347 1348 // Only emits letters (things that have case). Only used for case independent 1349 // matches. 1350 static inline bool EmitAtomLetter(RegExpCompiler* compiler, 1351 uc16 c, 1352 Label* on_failure, 1353 int cp_offset, 1354 bool check, 1355 bool preloaded) { 1356 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1357 bool ascii = compiler->ascii(); 1358 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1359 int length = GetCaseIndependentLetters(c, ascii, chars); 1360 if (length <= 1) return false; 1361 // We may not need to check against the end of the input string 1362 // if this character lies before a character that matched. 1363 if (!preloaded) { 1364 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1365 } 1366 Label ok; 1367 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1368 switch (length) { 1369 case 2: { 1370 if (ShortCutEmitCharacterPair(macro_assembler, 1371 ascii, 1372 chars[0], 1373 chars[1], 1374 on_failure)) { 1375 } else { 1376 macro_assembler->CheckCharacter(chars[0], &ok); 1377 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1378 macro_assembler->Bind(&ok); 1379 } 1380 break; 1381 } 1382 case 4: 1383 macro_assembler->CheckCharacter(chars[3], &ok); 1384 // Fall through! 1385 case 3: 1386 macro_assembler->CheckCharacter(chars[0], &ok); 1387 macro_assembler->CheckCharacter(chars[1], &ok); 1388 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1389 macro_assembler->Bind(&ok); 1390 break; 1391 default: 1392 UNREACHABLE(); 1393 break; 1394 } 1395 return true; 1396 } 1397 1398 1399 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1400 RegExpCharacterClass* cc, 1401 bool ascii, 1402 Label* on_failure, 1403 int cp_offset, 1404 bool check_offset, 1405 bool preloaded) { 1406 ZoneList<CharacterRange>* ranges = cc->ranges(); 1407 int max_char; 1408 if (ascii) { 1409 max_char = String::kMaxAsciiCharCode; 1410 } else { 1411 max_char = String::kMaxUC16CharCode; 1412 } 1413 1414 Label success; 1415 1416 Label* char_is_in_class = 1417 cc->is_negated() ? on_failure : &success; 1418 1419 int range_count = ranges->length(); 1420 1421 int last_valid_range = range_count - 1; 1422 while (last_valid_range >= 0) { 1423 CharacterRange& range = ranges->at(last_valid_range); 1424 if (range.from() <= max_char) { 1425 break; 1426 } 1427 last_valid_range--; 1428 } 1429 1430 if (last_valid_range < 0) { 1431 if (!cc->is_negated()) { 1432 // TODO(plesner): We can remove this when the node level does our 1433 // ASCII optimizations for us. 1434 macro_assembler->GoTo(on_failure); 1435 } 1436 if (check_offset) { 1437 macro_assembler->CheckPosition(cp_offset, on_failure); 1438 } 1439 return; 1440 } 1441 1442 if (last_valid_range == 0 && 1443 !cc->is_negated() && 1444 ranges->at(0).IsEverything(max_char)) { 1445 // This is a common case hit by non-anchored expressions. 1446 if (check_offset) { 1447 macro_assembler->CheckPosition(cp_offset, on_failure); 1448 } 1449 return; 1450 } 1451 1452 if (!preloaded) { 1453 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 1454 } 1455 1456 if (cc->is_standard() && 1457 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 1458 on_failure)) { 1459 return; 1460 } 1461 1462 for (int i = 0; i < last_valid_range; i++) { 1463 CharacterRange& range = ranges->at(i); 1464 Label next_range; 1465 uc16 from = range.from(); 1466 uc16 to = range.to(); 1467 if (from > max_char) { 1468 continue; 1469 } 1470 if (to > max_char) to = max_char; 1471 if (to == from) { 1472 macro_assembler->CheckCharacter(to, char_is_in_class); 1473 } else { 1474 if (from != 0) { 1475 macro_assembler->CheckCharacterLT(from, &next_range); 1476 } 1477 if (to != max_char) { 1478 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); 1479 } else { 1480 macro_assembler->GoTo(char_is_in_class); 1481 } 1482 } 1483 macro_assembler->Bind(&next_range); 1484 } 1485 1486 CharacterRange& range = ranges->at(last_valid_range); 1487 uc16 from = range.from(); 1488 uc16 to = range.to(); 1489 1490 if (to > max_char) to = max_char; 1491 ASSERT(to >= from); 1492 1493 if (to == from) { 1494 if (cc->is_negated()) { 1495 macro_assembler->CheckCharacter(to, on_failure); 1496 } else { 1497 macro_assembler->CheckNotCharacter(to, on_failure); 1498 } 1499 } else { 1500 if (from != 0) { 1501 if (cc->is_negated()) { 1502 macro_assembler->CheckCharacterLT(from, &success); 1503 } else { 1504 macro_assembler->CheckCharacterLT(from, on_failure); 1505 } 1506 } 1507 if (to != String::kMaxUC16CharCode) { 1508 if (cc->is_negated()) { 1509 macro_assembler->CheckCharacterLT(to + 1, on_failure); 1510 } else { 1511 macro_assembler->CheckCharacterGT(to, on_failure); 1512 } 1513 } else { 1514 if (cc->is_negated()) { 1515 macro_assembler->GoTo(on_failure); 1516 } 1517 } 1518 } 1519 macro_assembler->Bind(&success); 1520 } 1521 1522 1523 RegExpNode::~RegExpNode() { 1524 } 1525 1526 1527 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 1528 Trace* trace) { 1529 // If we are generating a greedy loop then don't stop and don't reuse code. 1530 if (trace->stop_node() != NULL) { 1531 return CONTINUE; 1532 } 1533 1534 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1535 if (trace->is_trivial()) { 1536 if (label_.is_bound()) { 1537 // We are being asked to generate a generic version, but that's already 1538 // been done so just go to it. 1539 macro_assembler->GoTo(&label_); 1540 return DONE; 1541 } 1542 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 1543 // To avoid too deep recursion we push the node to the work queue and just 1544 // generate a goto here. 1545 compiler->AddWork(this); 1546 macro_assembler->GoTo(&label_); 1547 return DONE; 1548 } 1549 // Generate generic version of the node and bind the label for later use. 1550 macro_assembler->Bind(&label_); 1551 return CONTINUE; 1552 } 1553 1554 // We are being asked to make a non-generic version. Keep track of how many 1555 // non-generic versions we generate so as not to overdo it. 1556 trace_count_++; 1557 if (FLAG_regexp_optimization && 1558 trace_count_ < kMaxCopiesCodeGenerated && 1559 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 1560 return CONTINUE; 1561 } 1562 1563 // If we get here code has been generated for this node too many times or 1564 // recursion is too deep. Time to switch to a generic version. The code for 1565 // generic versions above can handle deep recursion properly. 1566 trace->Flush(compiler, this); 1567 return DONE; 1568 } 1569 1570 1571 int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1572 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1573 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 1574 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1); 1575 } 1576 1577 1578 int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1579 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1580 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1); 1581 } 1582 1583 1584 int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1585 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1586 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1); 1587 } 1588 1589 1590 int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1591 int answer = Length(); 1592 if (answer >= still_to_find) return answer; 1593 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer; 1594 return answer + on_success()->EatsAtLeast(still_to_find - answer, 1595 recursion_depth + 1); 1596 } 1597 1598 1599 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 1600 int recursion_depth) { 1601 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1602 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1603 // afterwards. 1604 RegExpNode* node = alternatives_->at(1).node(); 1605 return node->EatsAtLeast(still_to_find, recursion_depth + 1); 1606 } 1607 1608 1609 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 1610 QuickCheckDetails* details, 1611 RegExpCompiler* compiler, 1612 int filled_in, 1613 bool not_at_start) { 1614 // Alternative 0 is the negative lookahead, alternative 1 is what comes 1615 // afterwards. 1616 RegExpNode* node = alternatives_->at(1).node(); 1617 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 1618 } 1619 1620 1621 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 1622 int recursion_depth, 1623 RegExpNode* ignore_this_node) { 1624 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 1625 int min = 100; 1626 int choice_count = alternatives_->length(); 1627 for (int i = 0; i < choice_count; i++) { 1628 RegExpNode* node = alternatives_->at(i).node(); 1629 if (node == ignore_this_node) continue; 1630 int node_eats_at_least = node->EatsAtLeast(still_to_find, 1631 recursion_depth + 1); 1632 if (node_eats_at_least < min) min = node_eats_at_least; 1633 } 1634 return min; 1635 } 1636 1637 1638 int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1639 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_); 1640 } 1641 1642 1643 int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) { 1644 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL); 1645 } 1646 1647 1648 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 1649 static inline uint32_t SmearBitsRight(uint32_t v) { 1650 v |= v >> 1; 1651 v |= v >> 2; 1652 v |= v >> 4; 1653 v |= v >> 8; 1654 v |= v >> 16; 1655 return v; 1656 } 1657 1658 1659 bool QuickCheckDetails::Rationalize(bool asc) { 1660 bool found_useful_op = false; 1661 uint32_t char_mask; 1662 if (asc) { 1663 char_mask = String::kMaxAsciiCharCode; 1664 } else { 1665 char_mask = String::kMaxUC16CharCode; 1666 } 1667 mask_ = 0; 1668 value_ = 0; 1669 int char_shift = 0; 1670 for (int i = 0; i < characters_; i++) { 1671 Position* pos = &positions_[i]; 1672 if ((pos->mask & String::kMaxAsciiCharCode) != 0) { 1673 found_useful_op = true; 1674 } 1675 mask_ |= (pos->mask & char_mask) << char_shift; 1676 value_ |= (pos->value & char_mask) << char_shift; 1677 char_shift += asc ? 8 : 16; 1678 } 1679 return found_useful_op; 1680 } 1681 1682 1683 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 1684 Trace* trace, 1685 bool preload_has_checked_bounds, 1686 Label* on_possible_success, 1687 QuickCheckDetails* details, 1688 bool fall_through_on_failure) { 1689 if (details->characters() == 0) return false; 1690 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE); 1691 if (details->cannot_match()) return false; 1692 if (!details->Rationalize(compiler->ascii())) return false; 1693 ASSERT(details->characters() == 1 || 1694 compiler->macro_assembler()->CanReadUnaligned()); 1695 uint32_t mask = details->mask(); 1696 uint32_t value = details->value(); 1697 1698 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1699 1700 if (trace->characters_preloaded() != details->characters()) { 1701 assembler->LoadCurrentCharacter(trace->cp_offset(), 1702 trace->backtrack(), 1703 !preload_has_checked_bounds, 1704 details->characters()); 1705 } 1706 1707 1708 bool need_mask = true; 1709 1710 if (details->characters() == 1) { 1711 // If number of characters preloaded is 1 then we used a byte or 16 bit 1712 // load so the value is already masked down. 1713 uint32_t char_mask; 1714 if (compiler->ascii()) { 1715 char_mask = String::kMaxAsciiCharCode; 1716 } else { 1717 char_mask = String::kMaxUC16CharCode; 1718 } 1719 if ((mask & char_mask) == char_mask) need_mask = false; 1720 mask &= char_mask; 1721 } else { 1722 // For 2-character preloads in ASCII mode we also use a 16 bit load with 1723 // zero extend. 1724 if (details->characters() == 2 && compiler->ascii()) { 1725 if ((mask & 0xffff) == 0xffff) need_mask = false; 1726 } else { 1727 if (mask == 0xffffffff) need_mask = false; 1728 } 1729 } 1730 1731 if (fall_through_on_failure) { 1732 if (need_mask) { 1733 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 1734 } else { 1735 assembler->CheckCharacter(value, on_possible_success); 1736 } 1737 } else { 1738 if (need_mask) { 1739 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 1740 } else { 1741 assembler->CheckNotCharacter(value, trace->backtrack()); 1742 } 1743 } 1744 return true; 1745 } 1746 1747 1748 // Here is the meat of GetQuickCheckDetails (see also the comment on the 1749 // super-class in the .h file). 1750 // 1751 // We iterate along the text object, building up for each character a 1752 // mask and value that can be used to test for a quick failure to match. 1753 // The masks and values for the positions will be combined into a single 1754 // machine word for the current character width in order to be used in 1755 // generating a quick check. 1756 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 1757 RegExpCompiler* compiler, 1758 int characters_filled_in, 1759 bool not_at_start) { 1760 ASSERT(characters_filled_in < details->characters()); 1761 int characters = details->characters(); 1762 int char_mask; 1763 int char_shift; 1764 if (compiler->ascii()) { 1765 char_mask = String::kMaxAsciiCharCode; 1766 char_shift = 8; 1767 } else { 1768 char_mask = String::kMaxUC16CharCode; 1769 char_shift = 16; 1770 } 1771 for (int k = 0; k < elms_->length(); k++) { 1772 TextElement elm = elms_->at(k); 1773 if (elm.type == TextElement::ATOM) { 1774 Vector<const uc16> quarks = elm.data.u_atom->data(); 1775 for (int i = 0; i < characters && i < quarks.length(); i++) { 1776 QuickCheckDetails::Position* pos = 1777 details->positions(characters_filled_in); 1778 uc16 c = quarks[i]; 1779 if (c > char_mask) { 1780 // If we expect a non-ASCII character from an ASCII string, 1781 // there is no way we can match. Not even case independent 1782 // matching can turn an ASCII character into non-ASCII or 1783 // vice versa. 1784 details->set_cannot_match(); 1785 pos->determines_perfectly = false; 1786 return; 1787 } 1788 if (compiler->ignore_case()) { 1789 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1790 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); 1791 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 1792 if (length == 1) { 1793 // This letter has no case equivalents, so it's nice and simple 1794 // and the mask-compare will determine definitely whether we have 1795 // a match at this character position. 1796 pos->mask = char_mask; 1797 pos->value = c; 1798 pos->determines_perfectly = true; 1799 } else { 1800 uint32_t common_bits = char_mask; 1801 uint32_t bits = chars[0]; 1802 for (int j = 1; j < length; j++) { 1803 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 1804 common_bits ^= differing_bits; 1805 bits &= common_bits; 1806 } 1807 // If length is 2 and common bits has only one zero in it then 1808 // our mask and compare instruction will determine definitely 1809 // whether we have a match at this character position. Otherwise 1810 // it can only be an approximate check. 1811 uint32_t one_zero = (common_bits | ~char_mask); 1812 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 1813 pos->determines_perfectly = true; 1814 } 1815 pos->mask = common_bits; 1816 pos->value = bits; 1817 } 1818 } else { 1819 // Don't ignore case. Nice simple case where the mask-compare will 1820 // determine definitely whether we have a match at this character 1821 // position. 1822 pos->mask = char_mask; 1823 pos->value = c; 1824 pos->determines_perfectly = true; 1825 } 1826 characters_filled_in++; 1827 ASSERT(characters_filled_in <= details->characters()); 1828 if (characters_filled_in == details->characters()) { 1829 return; 1830 } 1831 } 1832 } else { 1833 QuickCheckDetails::Position* pos = 1834 details->positions(characters_filled_in); 1835 RegExpCharacterClass* tree = elm.data.u_char_class; 1836 ZoneList<CharacterRange>* ranges = tree->ranges(); 1837 if (tree->is_negated()) { 1838 // A quick check uses multi-character mask and compare. There is no 1839 // useful way to incorporate a negative char class into this scheme 1840 // so we just conservatively create a mask and value that will always 1841 // succeed. 1842 pos->mask = 0; 1843 pos->value = 0; 1844 } else { 1845 int first_range = 0; 1846 while (ranges->at(first_range).from() > char_mask) { 1847 first_range++; 1848 if (first_range == ranges->length()) { 1849 details->set_cannot_match(); 1850 pos->determines_perfectly = false; 1851 return; 1852 } 1853 } 1854 CharacterRange range = ranges->at(first_range); 1855 uc16 from = range.from(); 1856 uc16 to = range.to(); 1857 if (to > char_mask) { 1858 to = char_mask; 1859 } 1860 uint32_t differing_bits = (from ^ to); 1861 // A mask and compare is only perfect if the differing bits form a 1862 // number like 00011111 with one single block of trailing 1s. 1863 if ((differing_bits & (differing_bits + 1)) == 0 && 1864 from + differing_bits == to) { 1865 pos->determines_perfectly = true; 1866 } 1867 uint32_t common_bits = ~SmearBitsRight(differing_bits); 1868 uint32_t bits = (from & common_bits); 1869 for (int i = first_range + 1; i < ranges->length(); i++) { 1870 CharacterRange range = ranges->at(i); 1871 uc16 from = range.from(); 1872 uc16 to = range.to(); 1873 if (from > char_mask) continue; 1874 if (to > char_mask) to = char_mask; 1875 // Here we are combining more ranges into the mask and compare 1876 // value. With each new range the mask becomes more sparse and 1877 // so the chances of a false positive rise. A character class 1878 // with multiple ranges is assumed never to be equivalent to a 1879 // mask and compare operation. 1880 pos->determines_perfectly = false; 1881 uint32_t new_common_bits = (from ^ to); 1882 new_common_bits = ~SmearBitsRight(new_common_bits); 1883 common_bits &= new_common_bits; 1884 bits &= new_common_bits; 1885 uint32_t differing_bits = (from & common_bits) ^ bits; 1886 common_bits ^= differing_bits; 1887 bits &= common_bits; 1888 } 1889 pos->mask = common_bits; 1890 pos->value = bits; 1891 } 1892 characters_filled_in++; 1893 ASSERT(characters_filled_in <= details->characters()); 1894 if (characters_filled_in == details->characters()) { 1895 return; 1896 } 1897 } 1898 } 1899 ASSERT(characters_filled_in != details->characters()); 1900 on_success()-> GetQuickCheckDetails(details, 1901 compiler, 1902 characters_filled_in, 1903 true); 1904 } 1905 1906 1907 void QuickCheckDetails::Clear() { 1908 for (int i = 0; i < characters_; i++) { 1909 positions_[i].mask = 0; 1910 positions_[i].value = 0; 1911 positions_[i].determines_perfectly = false; 1912 } 1913 characters_ = 0; 1914 } 1915 1916 1917 void QuickCheckDetails::Advance(int by, bool ascii) { 1918 ASSERT(by >= 0); 1919 if (by >= characters_) { 1920 Clear(); 1921 return; 1922 } 1923 for (int i = 0; i < characters_ - by; i++) { 1924 positions_[i] = positions_[by + i]; 1925 } 1926 for (int i = characters_ - by; i < characters_; i++) { 1927 positions_[i].mask = 0; 1928 positions_[i].value = 0; 1929 positions_[i].determines_perfectly = false; 1930 } 1931 characters_ -= by; 1932 // We could change mask_ and value_ here but we would never advance unless 1933 // they had already been used in a check and they won't be used again because 1934 // it would gain us nothing. So there's no point. 1935 } 1936 1937 1938 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 1939 ASSERT(characters_ == other->characters_); 1940 if (other->cannot_match_) { 1941 return; 1942 } 1943 if (cannot_match_) { 1944 *this = *other; 1945 return; 1946 } 1947 for (int i = from_index; i < characters_; i++) { 1948 QuickCheckDetails::Position* pos = positions(i); 1949 QuickCheckDetails::Position* other_pos = other->positions(i); 1950 if (pos->mask != other_pos->mask || 1951 pos->value != other_pos->value || 1952 !other_pos->determines_perfectly) { 1953 // Our mask-compare operation will be approximate unless we have the 1954 // exact same operation on both sides of the alternation. 1955 pos->determines_perfectly = false; 1956 } 1957 pos->mask &= other_pos->mask; 1958 pos->value &= pos->mask; 1959 other_pos->value &= pos->mask; 1960 uc16 differing_bits = (pos->value ^ other_pos->value); 1961 pos->mask &= ~differing_bits; 1962 pos->value &= pos->mask; 1963 } 1964 } 1965 1966 1967 class VisitMarker { 1968 public: 1969 explicit VisitMarker(NodeInfo* info) : info_(info) { 1970 ASSERT(!info->visited); 1971 info->visited = true; 1972 } 1973 ~VisitMarker() { 1974 info_->visited = false; 1975 } 1976 private: 1977 NodeInfo* info_; 1978 }; 1979 1980 1981 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 1982 RegExpCompiler* compiler, 1983 int characters_filled_in, 1984 bool not_at_start) { 1985 if (body_can_be_zero_length_ || info()->visited) return; 1986 VisitMarker marker(info()); 1987 return ChoiceNode::GetQuickCheckDetails(details, 1988 compiler, 1989 characters_filled_in, 1990 not_at_start); 1991 } 1992 1993 1994 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 1995 RegExpCompiler* compiler, 1996 int characters_filled_in, 1997 bool not_at_start) { 1998 not_at_start = (not_at_start || not_at_start_); 1999 int choice_count = alternatives_->length(); 2000 ASSERT(choice_count > 0); 2001 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2002 compiler, 2003 characters_filled_in, 2004 not_at_start); 2005 for (int i = 1; i < choice_count; i++) { 2006 QuickCheckDetails new_details(details->characters()); 2007 RegExpNode* node = alternatives_->at(i).node(); 2008 node->GetQuickCheckDetails(&new_details, compiler, 2009 characters_filled_in, 2010 not_at_start); 2011 // Here we merge the quick match details of the two branches. 2012 details->Merge(&new_details, characters_filled_in); 2013 } 2014 } 2015 2016 2017 // Check for [0-9A-Z_a-z]. 2018 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2019 Label* word, 2020 Label* non_word, 2021 bool fall_through_on_word) { 2022 if (assembler->CheckSpecialCharacterClass( 2023 fall_through_on_word ? 'w' : 'W', 2024 fall_through_on_word ? non_word : word)) { 2025 // Optimized implementation available. 2026 return; 2027 } 2028 assembler->CheckCharacterGT('z', non_word); 2029 assembler->CheckCharacterLT('0', non_word); 2030 assembler->CheckCharacterGT('a' - 1, word); 2031 assembler->CheckCharacterLT('9' + 1, word); 2032 assembler->CheckCharacterLT('A', non_word); 2033 assembler->CheckCharacterLT('Z' + 1, word); 2034 if (fall_through_on_word) { 2035 assembler->CheckNotCharacter('_', non_word); 2036 } else { 2037 assembler->CheckCharacter('_', word); 2038 } 2039 } 2040 2041 2042 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2043 // that matches newline or the start of input). 2044 static void EmitHat(RegExpCompiler* compiler, 2045 RegExpNode* on_success, 2046 Trace* trace) { 2047 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2048 // We will be loading the previous character into the current character 2049 // register. 2050 Trace new_trace(*trace); 2051 new_trace.InvalidateCurrentCharacter(); 2052 2053 Label ok; 2054 if (new_trace.cp_offset() == 0) { 2055 // The start of input counts as a newline in this context, so skip to 2056 // ok if we are at the start. 2057 assembler->CheckAtStart(&ok); 2058 } 2059 // We already checked that we are not at the start of input so it must be 2060 // OK to load the previous character. 2061 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 2062 new_trace.backtrack(), 2063 false); 2064 if (!assembler->CheckSpecialCharacterClass('n', 2065 new_trace.backtrack())) { 2066 // Newline means \n, \r, 0x2028 or 0x2029. 2067 if (!compiler->ascii()) { 2068 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 2069 } 2070 assembler->CheckCharacter('\n', &ok); 2071 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2072 } 2073 assembler->Bind(&ok); 2074 on_success->Emit(compiler, &new_trace); 2075 } 2076 2077 2078 // Emit the code to handle \b and \B (word-boundary or non-word-boundary) 2079 // when we know whether the next character must be a word character or not. 2080 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, 2081 RegExpCompiler* compiler, 2082 RegExpNode* on_success, 2083 Trace* trace) { 2084 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2085 Label done; 2086 2087 Trace new_trace(*trace); 2088 2089 bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); 2090 Label* on_word = expect_word_character ? &done : new_trace.backtrack(); 2091 Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; 2092 2093 // Check whether previous character was a word character. 2094 switch (trace->at_start()) { 2095 case Trace::TRUE: 2096 if (expect_word_character) { 2097 assembler->GoTo(on_non_word); 2098 } 2099 break; 2100 case Trace::UNKNOWN: 2101 ASSERT_EQ(0, trace->cp_offset()); 2102 assembler->CheckAtStart(on_non_word); 2103 // Fall through. 2104 case Trace::FALSE: 2105 int prev_char_offset = trace->cp_offset() - 1; 2106 assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); 2107 EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); 2108 // We may or may not have loaded the previous character. 2109 new_trace.InvalidateCurrentCharacter(); 2110 } 2111 2112 assembler->Bind(&done); 2113 2114 on_success->Emit(compiler, &new_trace); 2115 } 2116 2117 2118 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2119 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, 2120 RegExpCompiler* compiler, 2121 RegExpNode* on_success, 2122 Trace* trace) { 2123 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2124 Label before_non_word; 2125 Label before_word; 2126 if (trace->characters_preloaded() != 1) { 2127 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 2128 } 2129 // Fall through on non-word. 2130 EmitWordCheck(assembler, &before_word, &before_non_word, false); 2131 2132 // We will be loading the previous character into the current character 2133 // register. 2134 Trace new_trace(*trace); 2135 new_trace.InvalidateCurrentCharacter(); 2136 2137 Label ok; 2138 Label* boundary; 2139 Label* not_boundary; 2140 if (type == AssertionNode::AT_BOUNDARY) { 2141 boundary = &ok; 2142 not_boundary = new_trace.backtrack(); 2143 } else { 2144 not_boundary = &ok; 2145 boundary = new_trace.backtrack(); 2146 } 2147 2148 // Next character is not a word character. 2149 assembler->Bind(&before_non_word); 2150 if (new_trace.cp_offset() == 0) { 2151 // The start of input counts as a non-word character, so the question is 2152 // decided if we are at the start. 2153 assembler->CheckAtStart(not_boundary); 2154 } 2155 // We already checked that we are not at the start of input so it must be 2156 // OK to load the previous character. 2157 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2158 &ok, // Unused dummy label in this call. 2159 false); 2160 // Fall through on non-word. 2161 EmitWordCheck(assembler, boundary, not_boundary, false); 2162 assembler->GoTo(not_boundary); 2163 2164 // Next character is a word character. 2165 assembler->Bind(&before_word); 2166 if (new_trace.cp_offset() == 0) { 2167 // The start of input counts as a non-word character, so the question is 2168 // decided if we are at the start. 2169 assembler->CheckAtStart(boundary); 2170 } 2171 // We already checked that we are not at the start of input so it must be 2172 // OK to load the previous character. 2173 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, 2174 &ok, // Unused dummy label in this call. 2175 false); 2176 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); 2177 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); 2178 2179 assembler->Bind(&ok); 2180 2181 on_success->Emit(compiler, &new_trace); 2182 } 2183 2184 2185 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 2186 RegExpCompiler* compiler, 2187 int filled_in, 2188 bool not_at_start) { 2189 if (type_ == AT_START && not_at_start) { 2190 details->set_cannot_match(); 2191 return; 2192 } 2193 return on_success()->GetQuickCheckDetails(details, 2194 compiler, 2195 filled_in, 2196 not_at_start); 2197 } 2198 2199 2200 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2201 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2202 switch (type_) { 2203 case AT_END: { 2204 Label ok; 2205 assembler->CheckPosition(trace->cp_offset(), &ok); 2206 assembler->GoTo(trace->backtrack()); 2207 assembler->Bind(&ok); 2208 break; 2209 } 2210 case AT_START: { 2211 if (trace->at_start() == Trace::FALSE) { 2212 assembler->GoTo(trace->backtrack()); 2213 return; 2214 } 2215 if (trace->at_start() == Trace::UNKNOWN) { 2216 assembler->CheckNotAtStart(trace->backtrack()); 2217 Trace at_start_trace = *trace; 2218 at_start_trace.set_at_start(true); 2219 on_success()->Emit(compiler, &at_start_trace); 2220 return; 2221 } 2222 } 2223 break; 2224 case AFTER_NEWLINE: 2225 EmitHat(compiler, on_success(), trace); 2226 return; 2227 case AT_BOUNDARY: 2228 case AT_NON_BOUNDARY: { 2229 EmitBoundaryCheck(type_, compiler, on_success(), trace); 2230 return; 2231 } 2232 case AFTER_WORD_CHARACTER: 2233 case AFTER_NONWORD_CHARACTER: { 2234 EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); 2235 } 2236 } 2237 on_success()->Emit(compiler, trace); 2238 } 2239 2240 2241 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 2242 if (quick_check == NULL) return false; 2243 if (offset >= quick_check->characters()) return false; 2244 return quick_check->positions(offset)->determines_perfectly; 2245 } 2246 2247 2248 static void UpdateBoundsCheck(int index, int* checked_up_to) { 2249 if (index > *checked_up_to) { 2250 *checked_up_to = index; 2251 } 2252 } 2253 2254 2255 // We call this repeatedly to generate code for each pass over the text node. 2256 // The passes are in increasing order of difficulty because we hope one 2257 // of the first passes will fail in which case we are saved the work of the 2258 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 2259 // we will check the '%' in the first pass, the case independent 'a' in the 2260 // second pass and the character class in the last pass. 2261 // 2262 // The passes are done from right to left, so for example to test for /bar/ 2263 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 2264 // and then a 'b' with offset 0. This means we can avoid the end-of-input 2265 // bounds check most of the time. In the example we only need to check for 2266 // end-of-input when loading the putative 'r'. 2267 // 2268 // A slight complication involves the fact that the first character may already 2269 // be fetched into a register by the previous node. In this case we want to 2270 // do the test for that character first. We do this in separate passes. The 2271 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 2272 // pass has been performed then subsequent passes will have true in 2273 // first_element_checked to indicate that that character does not need to be 2274 // checked again. 2275 // 2276 // In addition to all this we are passed a Trace, which can 2277 // contain an AlternativeGeneration object. In this AlternativeGeneration 2278 // object we can see details of any quick check that was already passed in 2279 // order to get to the code we are now generating. The quick check can involve 2280 // loading characters, which means we do not need to recheck the bounds 2281 // up to the limit the quick check already checked. In addition the quick 2282 // check can have involved a mask and compare operation which may simplify 2283 // or obviate the need for further checks at some character positions. 2284 void TextNode::TextEmitPass(RegExpCompiler* compiler, 2285 TextEmitPassType pass, 2286 bool preloaded, 2287 Trace* trace, 2288 bool first_element_checked, 2289 int* checked_up_to) { 2290 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2291 bool ascii = compiler->ascii(); 2292 Label* backtrack = trace->backtrack(); 2293 QuickCheckDetails* quick_check = trace->quick_check_performed(); 2294 int element_count = elms_->length(); 2295 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 2296 TextElement elm = elms_->at(i); 2297 int cp_offset = trace->cp_offset() + elm.cp_offset; 2298 if (elm.type == TextElement::ATOM) { 2299 Vector<const uc16> quarks = elm.data.u_atom->data(); 2300 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 2301 if (first_element_checked && i == 0 && j == 0) continue; 2302 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; 2303 EmitCharacterFunction* emit_function = NULL; 2304 switch (pass) { 2305 case NON_ASCII_MATCH: 2306 ASSERT(ascii); 2307 if (quarks[j] > String::kMaxAsciiCharCode) { 2308 assembler->GoTo(backtrack); 2309 return; 2310 } 2311 break; 2312 case NON_LETTER_CHARACTER_MATCH: 2313 emit_function = &EmitAtomNonLetter; 2314 break; 2315 case SIMPLE_CHARACTER_MATCH: 2316 emit_function = &EmitSimpleCharacter; 2317 break; 2318 case CASE_CHARACTER_MATCH: 2319 emit_function = &EmitAtomLetter; 2320 break; 2321 default: 2322 break; 2323 } 2324 if (emit_function != NULL) { 2325 bool bound_checked = emit_function(compiler, 2326 quarks[j], 2327 backtrack, 2328 cp_offset + j, 2329 *checked_up_to < cp_offset + j, 2330 preloaded); 2331 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 2332 } 2333 } 2334 } else { 2335 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); 2336 if (pass == CHARACTER_CLASS_MATCH) { 2337 if (first_element_checked && i == 0) continue; 2338 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; 2339 RegExpCharacterClass* cc = elm.data.u_char_class; 2340 EmitCharClass(assembler, 2341 cc, 2342 ascii, 2343 backtrack, 2344 cp_offset, 2345 *checked_up_to < cp_offset, 2346 preloaded); 2347 UpdateBoundsCheck(cp_offset, checked_up_to); 2348 } 2349 } 2350 } 2351 } 2352 2353 2354 int TextNode::Length() { 2355 TextElement elm = elms_->last(); 2356 ASSERT(elm.cp_offset >= 0); 2357 if (elm.type == TextElement::ATOM) { 2358 return elm.cp_offset + elm.data.u_atom->data().length(); 2359 } else { 2360 return elm.cp_offset + 1; 2361 } 2362 } 2363 2364 2365 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 2366 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 2367 if (ignore_case) { 2368 return pass == SIMPLE_CHARACTER_MATCH; 2369 } else { 2370 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 2371 } 2372 } 2373 2374 2375 // This generates the code to match a text node. A text node can contain 2376 // straight character sequences (possibly to be matched in a case-independent 2377 // way) and character classes. For efficiency we do not do this in a single 2378 // pass from left to right. Instead we pass over the text node several times, 2379 // emitting code for some character positions every time. See the comment on 2380 // TextEmitPass for details. 2381 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2382 LimitResult limit_result = LimitVersions(compiler, trace); 2383 if (limit_result == DONE) return; 2384 ASSERT(limit_result == CONTINUE); 2385 2386 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 2387 compiler->SetRegExpTooBig(); 2388 return; 2389 } 2390 2391 if (compiler->ascii()) { 2392 int dummy = 0; 2393 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 2394 } 2395 2396 bool first_elt_done = false; 2397 int bound_checked_to = trace->cp_offset() - 1; 2398 bound_checked_to += trace->bound_checked_up_to(); 2399 2400 // If a character is preloaded into the current character register then 2401 // check that now. 2402 if (trace->characters_preloaded() == 1) { 2403 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2404 if (!SkipPass(pass, compiler->ignore_case())) { 2405 TextEmitPass(compiler, 2406 static_cast<TextEmitPassType>(pass), 2407 true, 2408 trace, 2409 false, 2410 &bound_checked_to); 2411 } 2412 } 2413 first_elt_done = true; 2414 } 2415 2416 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 2417 if (!SkipPass(pass, compiler->ignore_case())) { 2418 TextEmitPass(compiler, 2419 static_cast<TextEmitPassType>(pass), 2420 false, 2421 trace, 2422 first_elt_done, 2423 &bound_checked_to); 2424 } 2425 } 2426 2427 Trace successor_trace(*trace); 2428 successor_trace.set_at_start(false); 2429 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 2430 RecursionCheck rc(compiler); 2431 on_success()->Emit(compiler, &successor_trace); 2432 } 2433 2434 2435 void Trace::InvalidateCurrentCharacter() { 2436 characters_preloaded_ = 0; 2437 } 2438 2439 2440 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 2441 ASSERT(by > 0); 2442 // We don't have an instruction for shifting the current character register 2443 // down or for using a shifted value for anything so lets just forget that 2444 // we preloaded any characters into it. 2445 characters_preloaded_ = 0; 2446 // Adjust the offsets of the quick check performed information. This 2447 // information is used to find out what we already determined about the 2448 // characters by means of mask and compare. 2449 quick_check_performed_.Advance(by, compiler->ascii()); 2450 cp_offset_ += by; 2451 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2452 compiler->SetRegExpTooBig(); 2453 cp_offset_ = 0; 2454 } 2455 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 2456 } 2457 2458 2459 void TextNode::MakeCaseIndependent(bool is_ascii) { 2460 int element_count = elms_->length(); 2461 for (int i = 0; i < element_count; i++) { 2462 TextElement elm = elms_->at(i); 2463 if (elm.type == TextElement::CHAR_CLASS) { 2464 RegExpCharacterClass* cc = elm.data.u_char_class; 2465 // None of the standard character classses is different in the case 2466 // independent case and it slows us down if we don't know that. 2467 if (cc->is_standard()) continue; 2468 ZoneList<CharacterRange>* ranges = cc->ranges(); 2469 int range_count = ranges->length(); 2470 for (int j = 0; j < range_count; j++) { 2471 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); 2472 } 2473 } 2474 } 2475 } 2476 2477 2478 int TextNode::GreedyLoopTextLength() { 2479 TextElement elm = elms_->at(elms_->length() - 1); 2480 if (elm.type == TextElement::CHAR_CLASS) { 2481 return elm.cp_offset + 1; 2482 } else { 2483 return elm.cp_offset + elm.data.u_atom->data().length(); 2484 } 2485 } 2486 2487 2488 // Finds the fixed match length of a sequence of nodes that goes from 2489 // this alternative and back to this choice node. If there are variable 2490 // length nodes or other complications in the way then return a sentinel 2491 // value indicating that a greedy loop cannot be constructed. 2492 int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) { 2493 int length = 0; 2494 RegExpNode* node = alternative->node(); 2495 // Later we will generate code for all these text nodes using recursion 2496 // so we have to limit the max number. 2497 int recursion_depth = 0; 2498 while (node != this) { 2499 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 2500 return kNodeIsTooComplexForGreedyLoops; 2501 } 2502 int node_length = node->GreedyLoopTextLength(); 2503 if (node_length == kNodeIsTooComplexForGreedyLoops) { 2504 return kNodeIsTooComplexForGreedyLoops; 2505 } 2506 length += node_length; 2507 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 2508 node = seq_node->on_success(); 2509 } 2510 return length; 2511 } 2512 2513 2514 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 2515 ASSERT_EQ(loop_node_, NULL); 2516 AddAlternative(alt); 2517 loop_node_ = alt.node(); 2518 } 2519 2520 2521 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 2522 ASSERT_EQ(continue_node_, NULL); 2523 AddAlternative(alt); 2524 continue_node_ = alt.node(); 2525 } 2526 2527 2528 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2529 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2530 if (trace->stop_node() == this) { 2531 int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); 2532 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 2533 // Update the counter-based backtracking info on the stack. This is an 2534 // optimization for greedy loops (see below). 2535 ASSERT(trace->cp_offset() == text_length); 2536 macro_assembler->AdvanceCurrentPosition(text_length); 2537 macro_assembler->GoTo(trace->loop_label()); 2538 return; 2539 } 2540 ASSERT(trace->stop_node() == NULL); 2541 if (!trace->is_trivial()) { 2542 trace->Flush(compiler, this); 2543 return; 2544 } 2545 ChoiceNode::Emit(compiler, trace); 2546 } 2547 2548 2549 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) { 2550 int preload_characters = EatsAtLeast(4, 0); 2551 if (compiler->macro_assembler()->CanReadUnaligned()) { 2552 bool ascii = compiler->ascii(); 2553 if (ascii) { 2554 if (preload_characters > 4) preload_characters = 4; 2555 // We can't preload 3 characters because there is no machine instruction 2556 // to do that. We can't just load 4 because we could be reading 2557 // beyond the end of the string, which could cause a memory fault. 2558 if (preload_characters == 3) preload_characters = 2; 2559 } else { 2560 if (preload_characters > 2) preload_characters = 2; 2561 } 2562 } else { 2563 if (preload_characters > 1) preload_characters = 1; 2564 } 2565 return preload_characters; 2566 } 2567 2568 2569 // This class is used when generating the alternatives in a choice node. It 2570 // records the way the alternative is being code generated. 2571 class AlternativeGeneration: public Malloced { 2572 public: 2573 AlternativeGeneration() 2574 : possible_success(), 2575 expects_preload(false), 2576 after(), 2577 quick_check_details() { } 2578 Label possible_success; 2579 bool expects_preload; 2580 Label after; 2581 QuickCheckDetails quick_check_details; 2582 }; 2583 2584 2585 // Creates a list of AlternativeGenerations. If the list has a reasonable 2586 // size then it is on the stack, otherwise the excess is on the heap. 2587 class AlternativeGenerationList { 2588 public: 2589 explicit AlternativeGenerationList(int count) 2590 : alt_gens_(count) { 2591 for (int i = 0; i < count && i < kAFew; i++) { 2592 alt_gens_.Add(a_few_alt_gens_ + i); 2593 } 2594 for (int i = kAFew; i < count; i++) { 2595 alt_gens_.Add(new AlternativeGeneration()); 2596 } 2597 } 2598 ~AlternativeGenerationList() { 2599 for (int i = kAFew; i < alt_gens_.length(); i++) { 2600 delete alt_gens_[i]; 2601 alt_gens_[i] = NULL; 2602 } 2603 } 2604 2605 AlternativeGeneration* at(int i) { 2606 return alt_gens_[i]; 2607 } 2608 private: 2609 static const int kAFew = 10; 2610 ZoneList<AlternativeGeneration*> alt_gens_; 2611 AlternativeGeneration a_few_alt_gens_[kAFew]; 2612 }; 2613 2614 2615 /* Code generation for choice nodes. 2616 * 2617 * We generate quick checks that do a mask and compare to eliminate a 2618 * choice. If the quick check succeeds then it jumps to the continuation to 2619 * do slow checks and check subsequent nodes. If it fails (the common case) 2620 * it falls through to the next choice. 2621 * 2622 * Here is the desired flow graph. Nodes directly below each other imply 2623 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 2624 * 3 doesn't have a quick check so we have to call the slow check. 2625 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 2626 * regexp continuation is generated directly after the Sn node, up to the 2627 * next GoTo if we decide to reuse some already generated code. Some 2628 * nodes expect preload_characters to be preloaded into the current 2629 * character register. R nodes do this preloading. Vertices are marked 2630 * F for failures and S for success (possible success in the case of quick 2631 * nodes). L, V, < and > are used as arrow heads. 2632 * 2633 * ----------> R 2634 * | 2635 * V 2636 * Q1 -----> S1 2637 * | S / 2638 * F| / 2639 * | F/ 2640 * | / 2641 * | R 2642 * | / 2643 * V L 2644 * Q2 -----> S2 2645 * | S / 2646 * F| / 2647 * | F/ 2648 * | / 2649 * | R 2650 * | / 2651 * V L 2652 * S3 2653 * | 2654 * F| 2655 * | 2656 * R 2657 * | 2658 * backtrack V 2659 * <----------Q4 2660 * \ F | 2661 * \ |S 2662 * \ F V 2663 * \-----S4 2664 * 2665 * For greedy loops we reverse our expectation and expect to match rather 2666 * than fail. Therefore we want the loop code to look like this (U is the 2667 * unwind code that steps back in the greedy loop). The following alternatives 2668 * look the same as above. 2669 * _____ 2670 * / \ 2671 * V | 2672 * ----------> S1 | 2673 * /| | 2674 * / |S | 2675 * F/ \_____/ 2676 * / 2677 * |<----------- 2678 * | \ 2679 * V \ 2680 * Q2 ---> S2 \ 2681 * | S / | 2682 * F| / | 2683 * | F/ | 2684 * | / | 2685 * | R | 2686 * | / | 2687 * F VL | 2688 * <------U | 2689 * back |S | 2690 * \______________/ 2691 */ 2692 2693 2694 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2695 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2696 int choice_count = alternatives_->length(); 2697 #ifdef DEBUG 2698 for (int i = 0; i < choice_count - 1; i++) { 2699 GuardedAlternative alternative = alternatives_->at(i); 2700 ZoneList<Guard*>* guards = alternative.guards(); 2701 int guard_count = (guards == NULL) ? 0 : guards->length(); 2702 for (int j = 0; j < guard_count; j++) { 2703 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 2704 } 2705 } 2706 #endif 2707 2708 LimitResult limit_result = LimitVersions(compiler, trace); 2709 if (limit_result == DONE) return; 2710 ASSERT(limit_result == CONTINUE); 2711 2712 int new_flush_budget = trace->flush_budget() / choice_count; 2713 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 2714 trace->Flush(compiler, this); 2715 return; 2716 } 2717 2718 RecursionCheck rc(compiler); 2719 2720 Trace* current_trace = trace; 2721 2722 int text_length = GreedyLoopTextLength(&(alternatives_->at(0))); 2723 bool greedy_loop = false; 2724 Label greedy_loop_label; 2725 Trace counter_backtrack_trace; 2726 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 2727 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 2728 2729 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 2730 // Here we have special handling for greedy loops containing only text nodes 2731 // and other simple nodes. These are handled by pushing the current 2732 // position on the stack and then incrementing the current position each 2733 // time around the switch. On backtrack we decrement the current position 2734 // and check it against the pushed value. This avoids pushing backtrack 2735 // information for each iteration of the loop, which could take up a lot of 2736 // space. 2737 greedy_loop = true; 2738 ASSERT(trace->stop_node() == NULL); 2739 macro_assembler->PushCurrentPosition(); 2740 current_trace = &counter_backtrack_trace; 2741 Label greedy_match_failed; 2742 Trace greedy_match_trace; 2743 if (not_at_start()) greedy_match_trace.set_at_start(false); 2744 greedy_match_trace.set_backtrack(&greedy_match_failed); 2745 Label loop_label; 2746 macro_assembler->Bind(&loop_label); 2747 greedy_match_trace.set_stop_node(this); 2748 greedy_match_trace.set_loop_label(&loop_label); 2749 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 2750 macro_assembler->Bind(&greedy_match_failed); 2751 } 2752 2753 Label second_choice; // For use in greedy matches. 2754 macro_assembler->Bind(&second_choice); 2755 2756 int first_normal_choice = greedy_loop ? 1 : 0; 2757 2758 int preload_characters = CalculatePreloadCharacters(compiler); 2759 bool preload_is_current = 2760 (current_trace->characters_preloaded() == preload_characters); 2761 bool preload_has_checked_bounds = preload_is_current; 2762 2763 AlternativeGenerationList alt_gens(choice_count); 2764 2765 // For now we just call all choices one after the other. The idea ultimately 2766 // is to use the Dispatch table to try only the relevant ones. 2767 for (int i = first_normal_choice; i < choice_count; i++) { 2768 GuardedAlternative alternative = alternatives_->at(i); 2769 AlternativeGeneration* alt_gen = alt_gens.at(i); 2770 alt_gen->quick_check_details.set_characters(preload_characters); 2771 ZoneList<Guard*>* guards = alternative.guards(); 2772 int guard_count = (guards == NULL) ? 0 : guards->length(); 2773 Trace new_trace(*current_trace); 2774 new_trace.set_characters_preloaded(preload_is_current ? 2775 preload_characters : 2776 0); 2777 if (preload_has_checked_bounds) { 2778 new_trace.set_bound_checked_up_to(preload_characters); 2779 } 2780 new_trace.quick_check_performed()->Clear(); 2781 if (not_at_start_) new_trace.set_at_start(Trace::FALSE); 2782 alt_gen->expects_preload = preload_is_current; 2783 bool generate_full_check_inline = false; 2784 if (FLAG_regexp_optimization && 2785 try_to_emit_quick_check_for_alternative(i) && 2786 alternative.node()->EmitQuickCheck(compiler, 2787 &new_trace, 2788 preload_has_checked_bounds, 2789 &alt_gen->possible_success, 2790 &alt_gen->quick_check_details, 2791 i < choice_count - 1)) { 2792 // Quick check was generated for this choice. 2793 preload_is_current = true; 2794 preload_has_checked_bounds = true; 2795 // On the last choice in the ChoiceNode we generated the quick 2796 // check to fall through on possible success. So now we need to 2797 // generate the full check inline. 2798 if (i == choice_count - 1) { 2799 macro_assembler->Bind(&alt_gen->possible_success); 2800 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 2801 new_trace.set_characters_preloaded(preload_characters); 2802 new_trace.set_bound_checked_up_to(preload_characters); 2803 generate_full_check_inline = true; 2804 } 2805 } else if (alt_gen->quick_check_details.cannot_match()) { 2806 if (i == choice_count - 1 && !greedy_loop) { 2807 macro_assembler->GoTo(trace->backtrack()); 2808 } 2809 continue; 2810 } else { 2811 // No quick check was generated. Put the full code here. 2812 // If this is not the first choice then there could be slow checks from 2813 // previous cases that go here when they fail. There's no reason to 2814 // insist that they preload characters since the slow check we are about 2815 // to generate probably can't use it. 2816 if (i != first_normal_choice) { 2817 alt_gen->expects_preload = false; 2818 new_trace.InvalidateCurrentCharacter(); 2819 } 2820 if (i < choice_count - 1) { 2821 new_trace.set_backtrack(&alt_gen->after); 2822 } 2823 generate_full_check_inline = true; 2824 } 2825 if (generate_full_check_inline) { 2826 if (new_trace.actions() != NULL) { 2827 new_trace.set_flush_budget(new_flush_budget); 2828 } 2829 for (int j = 0; j < guard_count; j++) { 2830 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 2831 } 2832 alternative.node()->Emit(compiler, &new_trace); 2833 preload_is_current = false; 2834 } 2835 macro_assembler->Bind(&alt_gen->after); 2836 } 2837 if (greedy_loop) { 2838 macro_assembler->Bind(&greedy_loop_label); 2839 // If we have unwound to the bottom then backtrack. 2840 macro_assembler->CheckGreedyLoop(trace->backtrack()); 2841 // Otherwise try the second priority at an earlier position. 2842 macro_assembler->AdvanceCurrentPosition(-text_length); 2843 macro_assembler->GoTo(&second_choice); 2844 } 2845 2846 // At this point we need to generate slow checks for the alternatives where 2847 // the quick check was inlined. We can recognize these because the associated 2848 // label was bound. 2849 for (int i = first_normal_choice; i < choice_count - 1; i++) { 2850 AlternativeGeneration* alt_gen = alt_gens.at(i); 2851 Trace new_trace(*current_trace); 2852 // If there are actions to be flushed we have to limit how many times 2853 // they are flushed. Take the budget of the parent trace and distribute 2854 // it fairly amongst the children. 2855 if (new_trace.actions() != NULL) { 2856 new_trace.set_flush_budget(new_flush_budget); 2857 } 2858 EmitOutOfLineContinuation(compiler, 2859 &new_trace, 2860 alternatives_->at(i), 2861 alt_gen, 2862 preload_characters, 2863 alt_gens.at(i + 1)->expects_preload); 2864 } 2865 } 2866 2867 2868 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 2869 Trace* trace, 2870 GuardedAlternative alternative, 2871 AlternativeGeneration* alt_gen, 2872 int preload_characters, 2873 bool next_expects_preload) { 2874 if (!alt_gen->possible_success.is_linked()) return; 2875 2876 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2877 macro_assembler->Bind(&alt_gen->possible_success); 2878 Trace out_of_line_trace(*trace); 2879 out_of_line_trace.set_characters_preloaded(preload_characters); 2880 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 2881 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE); 2882 ZoneList<Guard*>* guards = alternative.guards(); 2883 int guard_count = (guards == NULL) ? 0 : guards->length(); 2884 if (next_expects_preload) { 2885 Label reload_current_char; 2886 out_of_line_trace.set_backtrack(&reload_current_char); 2887 for (int j = 0; j < guard_count; j++) { 2888 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 2889 } 2890 alternative.node()->Emit(compiler, &out_of_line_trace); 2891 macro_assembler->Bind(&reload_current_char); 2892 // Reload the current character, since the next quick check expects that. 2893 // We don't need to check bounds here because we only get into this 2894 // code through a quick check which already did the checked load. 2895 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 2896 NULL, 2897 false, 2898 preload_characters); 2899 macro_assembler->GoTo(&(alt_gen->after)); 2900 } else { 2901 out_of_line_trace.set_backtrack(&(alt_gen->after)); 2902 for (int j = 0; j < guard_count; j++) { 2903 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 2904 } 2905 alternative.node()->Emit(compiler, &out_of_line_trace); 2906 } 2907 } 2908 2909 2910 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 2911 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2912 LimitResult limit_result = LimitVersions(compiler, trace); 2913 if (limit_result == DONE) return; 2914 ASSERT(limit_result == CONTINUE); 2915 2916 RecursionCheck rc(compiler); 2917 2918 switch (type_) { 2919 case STORE_POSITION: { 2920 Trace::DeferredCapture 2921 new_capture(data_.u_position_register.reg, 2922 data_.u_position_register.is_capture, 2923 trace); 2924 Trace new_trace = *trace; 2925 new_trace.add_action(&new_capture); 2926 on_success()->Emit(compiler, &new_trace); 2927 break; 2928 } 2929 case INCREMENT_REGISTER: { 2930 Trace::DeferredIncrementRegister 2931 new_increment(data_.u_increment_register.reg); 2932 Trace new_trace = *trace; 2933 new_trace.add_action(&new_increment); 2934 on_success()->Emit(compiler, &new_trace); 2935 break; 2936 } 2937 case SET_REGISTER: { 2938 Trace::DeferredSetRegister 2939 new_set(data_.u_store_register.reg, data_.u_store_register.value); 2940 Trace new_trace = *trace; 2941 new_trace.add_action(&new_set); 2942 on_success()->Emit(compiler, &new_trace); 2943 break; 2944 } 2945 case CLEAR_CAPTURES: { 2946 Trace::DeferredClearCaptures 2947 new_capture(Interval(data_.u_clear_captures.range_from, 2948 data_.u_clear_captures.range_to)); 2949 Trace new_trace = *trace; 2950 new_trace.add_action(&new_capture); 2951 on_success()->Emit(compiler, &new_trace); 2952 break; 2953 } 2954 case BEGIN_SUBMATCH: 2955 if (!trace->is_trivial()) { 2956 trace->Flush(compiler, this); 2957 } else { 2958 assembler->WriteCurrentPositionToRegister( 2959 data_.u_submatch.current_position_register, 0); 2960 assembler->WriteStackPointerToRegister( 2961 data_.u_submatch.stack_pointer_register); 2962 on_success()->Emit(compiler, trace); 2963 } 2964 break; 2965 case EMPTY_MATCH_CHECK: { 2966 int start_pos_reg = data_.u_empty_match_check.start_register; 2967 int stored_pos = 0; 2968 int rep_reg = data_.u_empty_match_check.repetition_register; 2969 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 2970 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 2971 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 2972 // If we know we haven't advanced and there is no minimum we 2973 // can just backtrack immediately. 2974 assembler->GoTo(trace->backtrack()); 2975 } else if (know_dist && stored_pos < trace->cp_offset()) { 2976 // If we know we've advanced we can generate the continuation 2977 // immediately. 2978 on_success()->Emit(compiler, trace); 2979 } else if (!trace->is_trivial()) { 2980 trace->Flush(compiler, this); 2981 } else { 2982 Label skip_empty_check; 2983 // If we have a minimum number of repetitions we check the current 2984 // number first and skip the empty check if it's not enough. 2985 if (has_minimum) { 2986 int limit = data_.u_empty_match_check.repetition_limit; 2987 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 2988 } 2989 // If the match is empty we bail out, otherwise we fall through 2990 // to the on-success continuation. 2991 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 2992 trace->backtrack()); 2993 assembler->Bind(&skip_empty_check); 2994 on_success()->Emit(compiler, trace); 2995 } 2996 break; 2997 } 2998 case POSITIVE_SUBMATCH_SUCCESS: { 2999 if (!trace->is_trivial()) { 3000 trace->Flush(compiler, this); 3001 return; 3002 } 3003 assembler->ReadCurrentPositionFromRegister( 3004 data_.u_submatch.current_position_register); 3005 assembler->ReadStackPointerFromRegister( 3006 data_.u_submatch.stack_pointer_register); 3007 int clear_register_count = data_.u_submatch.clear_register_count; 3008 if (clear_register_count == 0) { 3009 on_success()->Emit(compiler, trace); 3010 return; 3011 } 3012 int clear_registers_from = data_.u_submatch.clear_register_from; 3013 Label clear_registers_backtrack; 3014 Trace new_trace = *trace; 3015 new_trace.set_backtrack(&clear_registers_backtrack); 3016 on_success()->Emit(compiler, &new_trace); 3017 3018 assembler->Bind(&clear_registers_backtrack); 3019 int clear_registers_to = clear_registers_from + clear_register_count - 1; 3020 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 3021 3022 ASSERT(trace->backtrack() == NULL); 3023 assembler->Backtrack(); 3024 return; 3025 } 3026 default: 3027 UNREACHABLE(); 3028 } 3029 } 3030 3031 3032 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3033 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3034 if (!trace->is_trivial()) { 3035 trace->Flush(compiler, this); 3036 return; 3037 } 3038 3039 LimitResult limit_result = LimitVersions(compiler, trace); 3040 if (limit_result == DONE) return; 3041 ASSERT(limit_result == CONTINUE); 3042 3043 RecursionCheck rc(compiler); 3044 3045 ASSERT_EQ(start_reg_ + 1, end_reg_); 3046 if (compiler->ignore_case()) { 3047 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 3048 trace->backtrack()); 3049 } else { 3050 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 3051 } 3052 on_success()->Emit(compiler, trace); 3053 } 3054 3055 3056 // ------------------------------------------------------------------- 3057 // Dot/dotty output 3058 3059 3060 #ifdef DEBUG 3061 3062 3063 class DotPrinter: public NodeVisitor { 3064 public: 3065 explicit DotPrinter(bool ignore_case) 3066 : ignore_case_(ignore_case), 3067 stream_(&alloc_) { } 3068 void PrintNode(const char* label, RegExpNode* node); 3069 void Visit(RegExpNode* node); 3070 void PrintAttributes(RegExpNode* from); 3071 StringStream* stream() { return &stream_; } 3072 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 3073 #define DECLARE_VISIT(Type) \ 3074 virtual void Visit##Type(Type##Node* that); 3075 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 3076 #undef DECLARE_VISIT 3077 private: 3078 bool ignore_case_; 3079 HeapStringAllocator alloc_; 3080 StringStream stream_; 3081 }; 3082 3083 3084 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 3085 stream()->Add("digraph G {\n graph [label=\""); 3086 for (int i = 0; label[i]; i++) { 3087 switch (label[i]) { 3088 case '\\': 3089 stream()->Add("\\\\"); 3090 break; 3091 case '"': 3092 stream()->Add("\""); 3093 break; 3094 default: 3095 stream()->Put(label[i]); 3096 break; 3097 } 3098 } 3099 stream()->Add("\"];\n"); 3100 Visit(node); 3101 stream()->Add("}\n"); 3102 printf("%s", *(stream()->ToCString())); 3103 } 3104 3105 3106 void DotPrinter::Visit(RegExpNode* node) { 3107 if (node->info()->visited) return; 3108 node->info()->visited = true; 3109 node->Accept(this); 3110 } 3111 3112 3113 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 3114 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 3115 Visit(on_failure); 3116 } 3117 3118 3119 class TableEntryBodyPrinter { 3120 public: 3121 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 3122 : stream_(stream), choice_(choice) { } 3123 void Call(uc16 from, DispatchTable::Entry entry) { 3124 OutSet* out_set = entry.out_set(); 3125 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3126 if (out_set->Get(i)) { 3127 stream()->Add(" n%p:s%io%i -> n%p;\n", 3128 choice(), 3129 from, 3130 i, 3131 choice()->alternatives()->at(i).node()); 3132 } 3133 } 3134 } 3135 private: 3136 StringStream* stream() { return stream_; } 3137 ChoiceNode* choice() { return choice_; } 3138 StringStream* stream_; 3139 ChoiceNode* choice_; 3140 }; 3141 3142 3143 class TableEntryHeaderPrinter { 3144 public: 3145 explicit TableEntryHeaderPrinter(StringStream* stream) 3146 : first_(true), stream_(stream) { } 3147 void Call(uc16 from, DispatchTable::Entry entry) { 3148 if (first_) { 3149 first_ = false; 3150 } else { 3151 stream()->Add("|"); 3152 } 3153 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 3154 OutSet* out_set = entry.out_set(); 3155 int priority = 0; 3156 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3157 if (out_set->Get(i)) { 3158 if (priority > 0) stream()->Add("|"); 3159 stream()->Add("<s%io%i> %i", from, i, priority); 3160 priority++; 3161 } 3162 } 3163 stream()->Add("}}"); 3164 } 3165 private: 3166 bool first_; 3167 StringStream* stream() { return stream_; } 3168 StringStream* stream_; 3169 }; 3170 3171 3172 class AttributePrinter { 3173 public: 3174 explicit AttributePrinter(DotPrinter* out) 3175 : out_(out), first_(true) { } 3176 void PrintSeparator() { 3177 if (first_) { 3178 first_ = false; 3179 } else { 3180 out_->stream()->Add("|"); 3181 } 3182 } 3183 void PrintBit(const char* name, bool value) { 3184 if (!value) return; 3185 PrintSeparator(); 3186 out_->stream()->Add("{%s}", name); 3187 } 3188 void PrintPositive(const char* name, int value) { 3189 if (value < 0) return; 3190 PrintSeparator(); 3191 out_->stream()->Add("{%s|%x}", name, value); 3192 } 3193 private: 3194 DotPrinter* out_; 3195 bool first_; 3196 }; 3197 3198 3199 void DotPrinter::PrintAttributes(RegExpNode* that) { 3200 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 3201 "margin=0.1, fontsize=10, label=\"{", 3202 that); 3203 AttributePrinter printer(this); 3204 NodeInfo* info = that->info(); 3205 printer.PrintBit("NI", info->follows_newline_interest); 3206 printer.PrintBit("WI", info->follows_word_interest); 3207 printer.PrintBit("SI", info->follows_start_interest); 3208 Label* label = that->label(); 3209 if (label->is_bound()) 3210 printer.PrintPositive("@", label->pos()); 3211 stream()->Add("}\"];\n"); 3212 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 3213 "arrowhead=none];\n", that, that); 3214 } 3215 3216 3217 static const bool kPrintDispatchTable = false; 3218 void DotPrinter::VisitChoice(ChoiceNode* that) { 3219 if (kPrintDispatchTable) { 3220 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 3221 TableEntryHeaderPrinter header_printer(stream()); 3222 that->GetTable(ignore_case_)->ForEach(&header_printer); 3223 stream()->Add("\"]\n", that); 3224 PrintAttributes(that); 3225 TableEntryBodyPrinter body_printer(stream(), that); 3226 that->GetTable(ignore_case_)->ForEach(&body_printer); 3227 } else { 3228 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 3229 for (int i = 0; i < that->alternatives()->length(); i++) { 3230 GuardedAlternative alt = that->alternatives()->at(i); 3231 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 3232 } 3233 } 3234 for (int i = 0; i < that->alternatives()->length(); i++) { 3235 GuardedAlternative alt = that->alternatives()->at(i); 3236 alt.node()->Accept(this); 3237 } 3238 } 3239 3240 3241 void DotPrinter::VisitText(TextNode* that) { 3242 stream()->Add(" n%p [label=\"", that); 3243 for (int i = 0; i < that->elements()->length(); i++) { 3244 if (i > 0) stream()->Add(" "); 3245 TextElement elm = that->elements()->at(i); 3246 switch (elm.type) { 3247 case TextElement::ATOM: { 3248 stream()->Add("'%w'", elm.data.u_atom->data()); 3249 break; 3250 } 3251 case TextElement::CHAR_CLASS: { 3252 RegExpCharacterClass* node = elm.data.u_char_class; 3253 stream()->Add("["); 3254 if (node->is_negated()) 3255 stream()->Add("^"); 3256 for (int j = 0; j < node->ranges()->length(); j++) { 3257 CharacterRange range = node->ranges()->at(j); 3258 stream()->Add("%k-%k", range.from(), range.to()); 3259 } 3260 stream()->Add("]"); 3261 break; 3262 } 3263 default: 3264 UNREACHABLE(); 3265 } 3266 } 3267 stream()->Add("\", shape=box, peripheries=2];\n"); 3268 PrintAttributes(that); 3269 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3270 Visit(that->on_success()); 3271 } 3272 3273 3274 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 3275 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 3276 that, 3277 that->start_register(), 3278 that->end_register()); 3279 PrintAttributes(that); 3280 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 3281 Visit(that->on_success()); 3282 } 3283 3284 3285 void DotPrinter::VisitEnd(EndNode* that) { 3286 stream()->Add(" n%p [style=bold, shape=point];\n", that); 3287 PrintAttributes(that); 3288 } 3289 3290 3291 void DotPrinter::VisitAssertion(AssertionNode* that) { 3292 stream()->Add(" n%p [", that); 3293 switch (that->type()) { 3294 case AssertionNode::AT_END: 3295 stream()->Add("label=\"$\", shape=septagon"); 3296 break; 3297 case AssertionNode::AT_START: 3298 stream()->Add("label=\"^\", shape=septagon"); 3299 break; 3300 case AssertionNode::AT_BOUNDARY: 3301 stream()->Add("label=\"\\b\", shape=septagon"); 3302 break; 3303 case AssertionNode::AT_NON_BOUNDARY: 3304 stream()->Add("label=\"\\B\", shape=septagon"); 3305 break; 3306 case AssertionNode::AFTER_NEWLINE: 3307 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 3308 break; 3309 case AssertionNode::AFTER_WORD_CHARACTER: 3310 stream()->Add("label=\"(?<=\\w)\", shape=septagon"); 3311 break; 3312 case AssertionNode::AFTER_NONWORD_CHARACTER: 3313 stream()->Add("label=\"(?<=\\W)\", shape=septagon"); 3314 break; 3315 } 3316 stream()->Add("];\n"); 3317 PrintAttributes(that); 3318 RegExpNode* successor = that->on_success(); 3319 stream()->Add(" n%p -> n%p;\n", that, successor); 3320 Visit(successor); 3321 } 3322 3323 3324 void DotPrinter::VisitAction(ActionNode* that) { 3325 stream()->Add(" n%p [", that); 3326 switch (that->type_) { 3327 case ActionNode::SET_REGISTER: 3328 stream()->Add("label=\"$%i:=%i\", shape=octagon", 3329 that->data_.u_store_register.reg, 3330 that->data_.u_store_register.value); 3331 break; 3332 case ActionNode::INCREMENT_REGISTER: 3333 stream()->Add("label=\"$%i++\", shape=octagon", 3334 that->data_.u_increment_register.reg); 3335 break; 3336 case ActionNode::STORE_POSITION: 3337 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 3338 that->data_.u_position_register.reg); 3339 break; 3340 case ActionNode::BEGIN_SUBMATCH: 3341 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 3342 that->data_.u_submatch.current_position_register); 3343 break; 3344 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 3345 stream()->Add("label=\"escape\", shape=septagon"); 3346 break; 3347 case ActionNode::EMPTY_MATCH_CHECK: 3348 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 3349 that->data_.u_empty_match_check.start_register, 3350 that->data_.u_empty_match_check.repetition_register, 3351 that->data_.u_empty_match_check.repetition_limit); 3352 break; 3353 case ActionNode::CLEAR_CAPTURES: { 3354 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 3355 that->data_.u_clear_captures.range_from, 3356 that->data_.u_clear_captures.range_to); 3357 break; 3358 } 3359 } 3360 stream()->Add("];\n"); 3361 PrintAttributes(that); 3362 RegExpNode* successor = that->on_success(); 3363 stream()->Add(" n%p -> n%p;\n", that, successor); 3364 Visit(successor); 3365 } 3366 3367 3368 class DispatchTableDumper { 3369 public: 3370 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 3371 void Call(uc16 key, DispatchTable::Entry entry); 3372 StringStream* stream() { return stream_; } 3373 private: 3374 StringStream* stream_; 3375 }; 3376 3377 3378 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 3379 stream()->Add("[%k-%k]: {", key, entry.to()); 3380 OutSet* set = entry.out_set(); 3381 bool first = true; 3382 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 3383 if (set->Get(i)) { 3384 if (first) { 3385 first = false; 3386 } else { 3387 stream()->Add(", "); 3388 } 3389 stream()->Add("%i", i); 3390 } 3391 } 3392 stream()->Add("}\n"); 3393 } 3394 3395 3396 void DispatchTable::Dump() { 3397 HeapStringAllocator alloc; 3398 StringStream stream(&alloc); 3399 DispatchTableDumper dumper(&stream); 3400 tree()->ForEach(&dumper); 3401 OS::PrintError("%s", *stream.ToCString()); 3402 } 3403 3404 3405 void RegExpEngine::DotPrint(const char* label, 3406 RegExpNode* node, 3407 bool ignore_case) { 3408 DotPrinter printer(ignore_case); 3409 printer.PrintNode(label, node); 3410 } 3411 3412 3413 #endif // DEBUG 3414 3415 3416 // ------------------------------------------------------------------- 3417 // Tree to graph conversion 3418 3419 static const int kSpaceRangeCount = 20; 3420 static const int kSpaceRangeAsciiCount = 4; 3421 static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020, 3422 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 3423 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 }; 3424 3425 static const int kWordRangeCount = 8; 3426 static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_', 3427 '_', 'a', 'z' }; 3428 3429 static const int kDigitRangeCount = 2; 3430 static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' }; 3431 3432 static const int kLineTerminatorRangeCount = 6; 3433 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A, 3434 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 }; 3435 3436 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 3437 RegExpNode* on_success) { 3438 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 3439 elms->Add(TextElement::Atom(this)); 3440 return new TextNode(elms, on_success); 3441 } 3442 3443 3444 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 3445 RegExpNode* on_success) { 3446 return new TextNode(elements(), on_success); 3447 } 3448 3449 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 3450 const uc16* special_class, 3451 int length) { 3452 ASSERT(ranges->length() != 0); 3453 ASSERT(length != 0); 3454 ASSERT(special_class[0] != 0); 3455 if (ranges->length() != (length >> 1) + 1) { 3456 return false; 3457 } 3458 CharacterRange range = ranges->at(0); 3459 if (range.from() != 0) { 3460 return false; 3461 } 3462 for (int i = 0; i < length; i += 2) { 3463 if (special_class[i] != (range.to() + 1)) { 3464 return false; 3465 } 3466 range = ranges->at((i >> 1) + 1); 3467 if (special_class[i+1] != range.from() - 1) { 3468 return false; 3469 } 3470 } 3471 if (range.to() != 0xffff) { 3472 return false; 3473 } 3474 return true; 3475 } 3476 3477 3478 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 3479 const uc16* special_class, 3480 int length) { 3481 if (ranges->length() * 2 != length) { 3482 return false; 3483 } 3484 for (int i = 0; i < length; i += 2) { 3485 CharacterRange range = ranges->at(i >> 1); 3486 if (range.from() != special_class[i] || range.to() != special_class[i+1]) { 3487 return false; 3488 } 3489 } 3490 return true; 3491 } 3492 3493 3494 bool RegExpCharacterClass::is_standard() { 3495 // TODO(lrn): Remove need for this function, by not throwing away information 3496 // along the way. 3497 if (is_negated_) { 3498 return false; 3499 } 3500 if (set_.is_standard()) { 3501 return true; 3502 } 3503 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3504 set_.set_standard_set_type('s'); 3505 return true; 3506 } 3507 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) { 3508 set_.set_standard_set_type('S'); 3509 return true; 3510 } 3511 if (CompareInverseRanges(set_.ranges(), 3512 kLineTerminatorRanges, 3513 kLineTerminatorRangeCount)) { 3514 set_.set_standard_set_type('.'); 3515 return true; 3516 } 3517 if (CompareRanges(set_.ranges(), 3518 kLineTerminatorRanges, 3519 kLineTerminatorRangeCount)) { 3520 set_.set_standard_set_type('n'); 3521 return true; 3522 } 3523 if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 3524 set_.set_standard_set_type('w'); 3525 return true; 3526 } 3527 if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) { 3528 set_.set_standard_set_type('W'); 3529 return true; 3530 } 3531 return false; 3532 } 3533 3534 3535 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 3536 RegExpNode* on_success) { 3537 return new TextNode(this, on_success); 3538 } 3539 3540 3541 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 3542 RegExpNode* on_success) { 3543 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 3544 int length = alternatives->length(); 3545 ChoiceNode* result = new ChoiceNode(length); 3546 for (int i = 0; i < length; i++) { 3547 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 3548 on_success)); 3549 result->AddAlternative(alternative); 3550 } 3551 return result; 3552 } 3553 3554 3555 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 3556 RegExpNode* on_success) { 3557 return ToNode(min(), 3558 max(), 3559 is_greedy(), 3560 body(), 3561 compiler, 3562 on_success); 3563 } 3564 3565 3566 RegExpNode* RegExpQuantifier::ToNode(int min, 3567 int max, 3568 bool is_greedy, 3569 RegExpTree* body, 3570 RegExpCompiler* compiler, 3571 RegExpNode* on_success, 3572 bool not_at_start) { 3573 // x{f, t} becomes this: 3574 // 3575 // (r++)<-. 3576 // | ` 3577 // | (x) 3578 // v ^ 3579 // (r=0)-->(?)---/ [if r < t] 3580 // | 3581 // [if r >= f] \----> ... 3582 // 3583 3584 // 15.10.2.5 RepeatMatcher algorithm. 3585 // The parser has already eliminated the case where max is 0. In the case 3586 // where max_match is zero the parser has removed the quantifier if min was 3587 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 3588 3589 // If we know that we cannot match zero length then things are a little 3590 // simpler since we don't need to make the special zero length match check 3591 // from step 2.1. If the min and max are small we can unroll a little in 3592 // this case. 3593 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 3594 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 3595 if (max == 0) return on_success; // This can happen due to recursion. 3596 bool body_can_be_empty = (body->min_match() == 0); 3597 int body_start_reg = RegExpCompiler::kNoRegister; 3598 Interval capture_registers = body->CaptureRegisters(); 3599 bool needs_capture_clearing = !capture_registers.is_empty(); 3600 if (body_can_be_empty) { 3601 body_start_reg = compiler->AllocateRegister(); 3602 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 3603 // Only unroll if there are no captures and the body can't be 3604 // empty. 3605 if (min > 0 && min <= kMaxUnrolledMinMatches) { 3606 int new_max = (max == kInfinity) ? max : max - min; 3607 // Recurse once to get the loop or optional matches after the fixed ones. 3608 RegExpNode* answer = ToNode( 3609 0, new_max, is_greedy, body, compiler, on_success, true); 3610 // Unroll the forced matches from 0 to min. This can cause chains of 3611 // TextNodes (which the parser does not generate). These should be 3612 // combined if it turns out they hinder good code generation. 3613 for (int i = 0; i < min; i++) { 3614 answer = body->ToNode(compiler, answer); 3615 } 3616 return answer; 3617 } 3618 if (max <= kMaxUnrolledMaxMatches) { 3619 ASSERT(min == 0); 3620 // Unroll the optional matches up to max. 3621 RegExpNode* answer = on_success; 3622 for (int i = 0; i < max; i++) { 3623 ChoiceNode* alternation = new ChoiceNode(2); 3624 if (is_greedy) { 3625 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, 3626 answer))); 3627 alternation->AddAlternative(GuardedAlternative(on_success)); 3628 } else { 3629 alternation->AddAlternative(GuardedAlternative(on_success)); 3630 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, 3631 answer))); 3632 } 3633 answer = alternation; 3634 if (not_at_start) alternation->set_not_at_start(); 3635 } 3636 return answer; 3637 } 3638 } 3639 bool has_min = min > 0; 3640 bool has_max = max < RegExpTree::kInfinity; 3641 bool needs_counter = has_min || has_max; 3642 int reg_ctr = needs_counter 3643 ? compiler->AllocateRegister() 3644 : RegExpCompiler::kNoRegister; 3645 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0); 3646 if (not_at_start) center->set_not_at_start(); 3647 RegExpNode* loop_return = needs_counter 3648 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 3649 : static_cast<RegExpNode*>(center); 3650 if (body_can_be_empty) { 3651 // If the body can be empty we need to check if it was and then 3652 // backtrack. 3653 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 3654 reg_ctr, 3655 min, 3656 loop_return); 3657 } 3658 RegExpNode* body_node = body->ToNode(compiler, loop_return); 3659 if (body_can_be_empty) { 3660 // If the body can be empty we need to store the start position 3661 // so we can bail out if it was empty. 3662 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 3663 } 3664 if (needs_capture_clearing) { 3665 // Before entering the body of this loop we need to clear captures. 3666 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 3667 } 3668 GuardedAlternative body_alt(body_node); 3669 if (has_max) { 3670 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); 3671 body_alt.AddGuard(body_guard); 3672 } 3673 GuardedAlternative rest_alt(on_success); 3674 if (has_min) { 3675 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min); 3676 rest_alt.AddGuard(rest_guard); 3677 } 3678 if (is_greedy) { 3679 center->AddLoopAlternative(body_alt); 3680 center->AddContinueAlternative(rest_alt); 3681 } else { 3682 center->AddContinueAlternative(rest_alt); 3683 center->AddLoopAlternative(body_alt); 3684 } 3685 if (needs_counter) { 3686 return ActionNode::SetRegister(reg_ctr, 0, center); 3687 } else { 3688 return center; 3689 } 3690 } 3691 3692 3693 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 3694 RegExpNode* on_success) { 3695 NodeInfo info; 3696 switch (type()) { 3697 case START_OF_LINE: 3698 return AssertionNode::AfterNewline(on_success); 3699 case START_OF_INPUT: 3700 return AssertionNode::AtStart(on_success); 3701 case BOUNDARY: 3702 return AssertionNode::AtBoundary(on_success); 3703 case NON_BOUNDARY: 3704 return AssertionNode::AtNonBoundary(on_success); 3705 case END_OF_INPUT: 3706 return AssertionNode::AtEnd(on_success); 3707 case END_OF_LINE: { 3708 // Compile $ in multiline regexps as an alternation with a positive 3709 // lookahead in one side and an end-of-input on the other side. 3710 // We need two registers for the lookahead. 3711 int stack_pointer_register = compiler->AllocateRegister(); 3712 int position_register = compiler->AllocateRegister(); 3713 // The ChoiceNode to distinguish between a newline and end-of-input. 3714 ChoiceNode* result = new ChoiceNode(2); 3715 // Create a newline atom. 3716 ZoneList<CharacterRange>* newline_ranges = 3717 new ZoneList<CharacterRange>(3); 3718 CharacterRange::AddClassEscape('n', newline_ranges); 3719 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n'); 3720 TextNode* newline_matcher = new TextNode( 3721 newline_atom, 3722 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 3723 position_register, 3724 0, // No captures inside. 3725 -1, // Ignored if no captures. 3726 on_success)); 3727 // Create an end-of-input matcher. 3728 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 3729 stack_pointer_register, 3730 position_register, 3731 newline_matcher); 3732 // Add the two alternatives to the ChoiceNode. 3733 GuardedAlternative eol_alternative(end_of_line); 3734 result->AddAlternative(eol_alternative); 3735 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 3736 result->AddAlternative(end_alternative); 3737 return result; 3738 } 3739 default: 3740 UNREACHABLE(); 3741 } 3742 return on_success; 3743 } 3744 3745 3746 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 3747 RegExpNode* on_success) { 3748 return new BackReferenceNode(RegExpCapture::StartRegister(index()), 3749 RegExpCapture::EndRegister(index()), 3750 on_success); 3751 } 3752 3753 3754 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 3755 RegExpNode* on_success) { 3756 return on_success; 3757 } 3758 3759 3760 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 3761 RegExpNode* on_success) { 3762 int stack_pointer_register = compiler->AllocateRegister(); 3763 int position_register = compiler->AllocateRegister(); 3764 3765 const int registers_per_capture = 2; 3766 const int register_of_first_capture = 2; 3767 int register_count = capture_count_ * registers_per_capture; 3768 int register_start = 3769 register_of_first_capture + capture_from_ * registers_per_capture; 3770 3771 RegExpNode* success; 3772 if (is_positive()) { 3773 RegExpNode* node = ActionNode::BeginSubmatch( 3774 stack_pointer_register, 3775 position_register, 3776 body()->ToNode( 3777 compiler, 3778 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 3779 position_register, 3780 register_count, 3781 register_start, 3782 on_success))); 3783 return node; 3784 } else { 3785 // We use a ChoiceNode for a negative lookahead because it has most of 3786 // the characteristics we need. It has the body of the lookahead as its 3787 // first alternative and the expression after the lookahead of the second 3788 // alternative. If the first alternative succeeds then the 3789 // NegativeSubmatchSuccess will unwind the stack including everything the 3790 // choice node set up and backtrack. If the first alternative fails then 3791 // the second alternative is tried, which is exactly the desired result 3792 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 3793 // ChoiceNode that knows to ignore the first exit when calculating quick 3794 // checks. 3795 GuardedAlternative body_alt( 3796 body()->ToNode( 3797 compiler, 3798 success = new NegativeSubmatchSuccess(stack_pointer_register, 3799 position_register, 3800 register_count, 3801 register_start))); 3802 ChoiceNode* choice_node = 3803 new NegativeLookaheadChoiceNode(body_alt, 3804 GuardedAlternative(on_success)); 3805 return ActionNode::BeginSubmatch(stack_pointer_register, 3806 position_register, 3807 choice_node); 3808 } 3809 } 3810 3811 3812 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 3813 RegExpNode* on_success) { 3814 return ToNode(body(), index(), compiler, on_success); 3815 } 3816 3817 3818 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 3819 int index, 3820 RegExpCompiler* compiler, 3821 RegExpNode* on_success) { 3822 int start_reg = RegExpCapture::StartRegister(index); 3823 int end_reg = RegExpCapture::EndRegister(index); 3824 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 3825 RegExpNode* body_node = body->ToNode(compiler, store_end); 3826 return ActionNode::StorePosition(start_reg, true, body_node); 3827 } 3828 3829 3830 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 3831 RegExpNode* on_success) { 3832 ZoneList<RegExpTree*>* children = nodes(); 3833 RegExpNode* current = on_success; 3834 for (int i = children->length() - 1; i >= 0; i--) { 3835 current = children->at(i)->ToNode(compiler, current); 3836 } 3837 return current; 3838 } 3839 3840 3841 static void AddClass(const uc16* elmv, 3842 int elmc, 3843 ZoneList<CharacterRange>* ranges) { 3844 for (int i = 0; i < elmc; i += 2) { 3845 ASSERT(elmv[i] <= elmv[i + 1]); 3846 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 3847 } 3848 } 3849 3850 3851 static void AddClassNegated(const uc16 *elmv, 3852 int elmc, 3853 ZoneList<CharacterRange>* ranges) { 3854 ASSERT(elmv[0] != 0x0000); 3855 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode); 3856 uc16 last = 0x0000; 3857 for (int i = 0; i < elmc; i += 2) { 3858 ASSERT(last <= elmv[i] - 1); 3859 ASSERT(elmv[i] <= elmv[i + 1]); 3860 ranges->Add(CharacterRange(last, elmv[i] - 1)); 3861 last = elmv[i + 1] + 1; 3862 } 3863 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode)); 3864 } 3865 3866 3867 void CharacterRange::AddClassEscape(uc16 type, 3868 ZoneList<CharacterRange>* ranges) { 3869 switch (type) { 3870 case 's': 3871 AddClass(kSpaceRanges, kSpaceRangeCount, ranges); 3872 break; 3873 case 'S': 3874 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); 3875 break; 3876 case 'w': 3877 AddClass(kWordRanges, kWordRangeCount, ranges); 3878 break; 3879 case 'W': 3880 AddClassNegated(kWordRanges, kWordRangeCount, ranges); 3881 break; 3882 case 'd': 3883 AddClass(kDigitRanges, kDigitRangeCount, ranges); 3884 break; 3885 case 'D': 3886 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); 3887 break; 3888 case '.': 3889 AddClassNegated(kLineTerminatorRanges, 3890 kLineTerminatorRangeCount, 3891 ranges); 3892 break; 3893 // This is not a character range as defined by the spec but a 3894 // convenient shorthand for a character class that matches any 3895 // character. 3896 case '*': 3897 ranges->Add(CharacterRange::Everything()); 3898 break; 3899 // This is the set of characters matched by the $ and ^ symbols 3900 // in multiline mode. 3901 case 'n': 3902 AddClass(kLineTerminatorRanges, 3903 kLineTerminatorRangeCount, 3904 ranges); 3905 break; 3906 default: 3907 UNREACHABLE(); 3908 } 3909 } 3910 3911 3912 Vector<const uc16> CharacterRange::GetWordBounds() { 3913 return Vector<const uc16>(kWordRanges, kWordRangeCount); 3914 } 3915 3916 3917 class CharacterRangeSplitter { 3918 public: 3919 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 3920 ZoneList<CharacterRange>** excluded) 3921 : included_(included), 3922 excluded_(excluded) { } 3923 void Call(uc16 from, DispatchTable::Entry entry); 3924 3925 static const int kInBase = 0; 3926 static const int kInOverlay = 1; 3927 3928 private: 3929 ZoneList<CharacterRange>** included_; 3930 ZoneList<CharacterRange>** excluded_; 3931 }; 3932 3933 3934 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 3935 if (!entry.out_set()->Get(kInBase)) return; 3936 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 3937 ? included_ 3938 : excluded_; 3939 if (*target == NULL) *target = new ZoneList<CharacterRange>(2); 3940 (*target)->Add(CharacterRange(entry.from(), entry.to())); 3941 } 3942 3943 3944 void CharacterRange::Split(ZoneList<CharacterRange>* base, 3945 Vector<const uc16> overlay, 3946 ZoneList<CharacterRange>** included, 3947 ZoneList<CharacterRange>** excluded) { 3948 ASSERT_EQ(NULL, *included); 3949 ASSERT_EQ(NULL, *excluded); 3950 DispatchTable table; 3951 for (int i = 0; i < base->length(); i++) 3952 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 3953 for (int i = 0; i < overlay.length(); i += 2) { 3954 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 3955 CharacterRangeSplitter::kInOverlay); 3956 } 3957 CharacterRangeSplitter callback(included, excluded); 3958 table.ForEach(&callback); 3959 } 3960 3961 3962 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, 3963 int bottom, 3964 int top); 3965 3966 3967 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 3968 bool is_ascii) { 3969 uc16 bottom = from(); 3970 uc16 top = to(); 3971 if (is_ascii) { 3972 if (bottom > String::kMaxAsciiCharCode) return; 3973 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; 3974 } 3975 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3976 if (top == bottom) { 3977 // If this is a singleton we just expand the one character. 3978 int length = uncanonicalize.get(bottom, '\0', chars); 3979 for (int i = 0; i < length; i++) { 3980 uc32 chr = chars[i]; 3981 if (chr != bottom) { 3982 ranges->Add(CharacterRange::Singleton(chars[i])); 3983 } 3984 } 3985 } else if (bottom <= kRangeCanonicalizeMax && 3986 top <= kRangeCanonicalizeMax) { 3987 // If this is a range we expand the characters block by block, 3988 // expanding contiguous subranges (blocks) one at a time. 3989 // The approach is as follows. For a given start character we 3990 // look up the block that contains it, for instance 'a' if the 3991 // start character is 'c'. A block is characterized by the property 3992 // that all characters uncanonicalize in the same way as the first 3993 // element, except that each entry in the result is incremented 3994 // by the distance from the first element. So a-z is a block 3995 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter 3996 // uncanonicalizes to ['a' + k, 'A' + k]. 3997 // Once we've found the start point we look up its uncanonicalization 3998 // and produce a range for each element. For instance for [c-f] 3999 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only 4000 // add a range if it is not already contained in the input, so [c-f] 4001 // will be skipped but [C-F] will be added. If this range is not 4002 // completely contained in a block we do this for all the blocks 4003 // covered by the range. 4004 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4005 // First, look up the block that contains the 'bottom' character. 4006 int length = canonrange.get(bottom, '\0', range); 4007 if (length == 0) { 4008 range[0] = bottom; 4009 } else { 4010 ASSERT_EQ(1, length); 4011 } 4012 int pos = bottom; 4013 // The start of the current block. Note that except for the first 4014 // iteration 'start' is always equal to 'pos'. 4015 int start; 4016 // If it is not the start point of a block the entry contains the 4017 // offset of the character from the start point. 4018 if ((range[0] & kStartMarker) == 0) { 4019 start = pos - range[0]; 4020 } else { 4021 start = pos; 4022 } 4023 // Then we add the ranges one at a time, incrementing the current 4024 // position to be after the last block each time. The position 4025 // always points to the start of a block. 4026 while (pos < top) { 4027 length = canonrange.get(start, '\0', range); 4028 if (length == 0) { 4029 range[0] = start; 4030 } else { 4031 ASSERT_EQ(1, length); 4032 } 4033 ASSERT((range[0] & kStartMarker) != 0); 4034 // The start point of a block contains the distance to the end 4035 // of the range. 4036 int block_end = start + (range[0] & kPayloadMask) - 1; 4037 int end = (block_end > top) ? top : block_end; 4038 length = uncanonicalize.get(start, '\0', range); 4039 for (int i = 0; i < length; i++) { 4040 uc32 c = range[i]; 4041 uc16 range_from = c + (pos - start); 4042 uc16 range_to = c + (end - start); 4043 if (!(bottom <= range_from && range_to <= top)) { 4044 ranges->Add(CharacterRange(range_from, range_to)); 4045 } 4046 } 4047 start = pos = block_end + 1; 4048 } 4049 } else { 4050 // Unibrow ranges don't work for high characters due to the "2^11 bug". 4051 // Therefore we do something dumber for these ranges. 4052 AddUncanonicals(ranges, bottom, top); 4053 } 4054 } 4055 4056 4057 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 4058 ASSERT_NOT_NULL(ranges); 4059 int n = ranges->length(); 4060 if (n <= 1) return true; 4061 int max = ranges->at(0).to(); 4062 for (int i = 1; i < n; i++) { 4063 CharacterRange next_range = ranges->at(i); 4064 if (next_range.from() <= max + 1) return false; 4065 max = next_range.to(); 4066 } 4067 return true; 4068 } 4069 4070 SetRelation CharacterRange::WordCharacterRelation( 4071 ZoneList<CharacterRange>* range) { 4072 ASSERT(IsCanonical(range)); 4073 int i = 0; // Word character range index. 4074 int j = 0; // Argument range index. 4075 ASSERT_NE(0, kWordRangeCount); 4076 SetRelation result; 4077 if (range->length() == 0) { 4078 result.SetElementsInSecondSet(); 4079 return result; 4080 } 4081 CharacterRange argument_range = range->at(0); 4082 CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); 4083 while (i < kWordRangeCount && j < range->length()) { 4084 // Check the two ranges for the five cases: 4085 // - no overlap. 4086 // - partial overlap (there are elements in both ranges that isn't 4087 // in the other, and there are also elements that are in both). 4088 // - argument range entirely inside word range. 4089 // - word range entirely inside argument range. 4090 // - ranges are completely equal. 4091 4092 // First check for no overlap. The earlier range is not in the other set. 4093 if (argument_range.from() > word_range.to()) { 4094 // Ranges are disjoint. The earlier word range contains elements that 4095 // cannot be in the argument set. 4096 result.SetElementsInSecondSet(); 4097 } else if (word_range.from() > argument_range.to()) { 4098 // Ranges are disjoint. The earlier argument range contains elements that 4099 // cannot be in the word set. 4100 result.SetElementsInFirstSet(); 4101 } else if (word_range.from() <= argument_range.from() && 4102 word_range.to() >= argument_range.from()) { 4103 result.SetElementsInBothSets(); 4104 // argument range completely inside word range. 4105 if (word_range.from() < argument_range.from() || 4106 word_range.to() > argument_range.from()) { 4107 result.SetElementsInSecondSet(); 4108 } 4109 } else if (word_range.from() >= argument_range.from() && 4110 word_range.to() <= argument_range.from()) { 4111 result.SetElementsInBothSets(); 4112 result.SetElementsInFirstSet(); 4113 } else { 4114 // There is overlap, and neither is a subrange of the other 4115 result.SetElementsInFirstSet(); 4116 result.SetElementsInSecondSet(); 4117 result.SetElementsInBothSets(); 4118 } 4119 if (result.NonTrivialIntersection()) { 4120 // The result is as (im)precise as we can possibly make it. 4121 return result; 4122 } 4123 // Progress the range(s) with minimal to-character. 4124 uc16 word_to = word_range.to(); 4125 uc16 argument_to = argument_range.to(); 4126 if (argument_to <= word_to) { 4127 j++; 4128 if (j < range->length()) { 4129 argument_range = range->at(j); 4130 } 4131 } 4132 if (word_to <= argument_to) { 4133 i += 2; 4134 if (i < kWordRangeCount) { 4135 word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); 4136 } 4137 } 4138 } 4139 // Check if anything wasn't compared in the loop. 4140 if (i < kWordRangeCount) { 4141 // word range contains something not in argument range. 4142 result.SetElementsInSecondSet(); 4143 } else if (j < range->length()) { 4144 // Argument range contains something not in word range. 4145 result.SetElementsInFirstSet(); 4146 } 4147 4148 return result; 4149 } 4150 4151 4152 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, 4153 int bottom, 4154 int top) { 4155 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4156 // Zones with no case mappings. There is a DEBUG-mode loop to assert that 4157 // this table is correct. 4158 // 0x0600 - 0x0fff 4159 // 0x1100 - 0x1cff 4160 // 0x2000 - 0x20ff 4161 // 0x2200 - 0x23ff 4162 // 0x2500 - 0x2bff 4163 // 0x2e00 - 0xa5ff 4164 // 0xa800 - 0xfaff 4165 // 0xfc00 - 0xfeff 4166 const int boundary_count = 18; 4167 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this 4168 // array. This is to split up big ranges and not because they actually denote 4169 // a case-mapping-free-zone. 4170 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); 4171 const int kFirstRealCaselessZoneIndex = 2; 4172 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, 4173 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, 4174 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; 4175 4176 // Special ASCII rule from spec can save us some work here. 4177 if (bottom == 0x80 && top == 0xffff) return; 4178 4179 // We have optimized support for this range. 4180 if (top <= CharacterRange::kRangeCanonicalizeMax) { 4181 CharacterRange range(bottom, top); 4182 range.AddCaseEquivalents(ranges, false); 4183 return; 4184 } 4185 4186 // Split up very large ranges. This helps remove ranges where there are no 4187 // case mappings. 4188 for (int i = 0; i < boundary_count; i++) { 4189 if (bottom < boundaries[i] && top >= boundaries[i]) { 4190 AddUncanonicals(ranges, bottom, boundaries[i] - 1); 4191 AddUncanonicals(ranges, boundaries[i], top); 4192 return; 4193 } 4194 } 4195 4196 // If we are completely in a zone with no case mappings then we are done. 4197 // We start at 2 so as not to except the ASCII range from mappings. 4198 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { 4199 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { 4200 #ifdef DEBUG 4201 for (int j = bottom; j <= top; j++) { 4202 unsigned current_char = j; 4203 int length = uncanonicalize.get(current_char, '\0', chars); 4204 for (int k = 0; k < length; k++) { 4205 ASSERT(chars[k] == current_char); 4206 } 4207 } 4208 #endif 4209 return; 4210 } 4211 } 4212 4213 // Step through the range finding equivalent characters. 4214 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); 4215 for (int i = bottom; i <= top; i++) { 4216 int length = uncanonicalize.get(i, '\0', chars); 4217 for (int j = 0; j < length; j++) { 4218 uc32 chr = chars[j]; 4219 if (chr != i && (chr < bottom || chr > top)) { 4220 characters->Add(chr); 4221 } 4222 } 4223 } 4224 4225 // Step through the equivalent characters finding simple ranges and 4226 // adding ranges to the character class. 4227 if (characters->length() > 0) { 4228 int new_from = characters->at(0); 4229 int new_to = new_from; 4230 for (int i = 1; i < characters->length(); i++) { 4231 int chr = characters->at(i); 4232 if (chr == new_to + 1) { 4233 new_to++; 4234 } else { 4235 if (new_to == new_from) { 4236 ranges->Add(CharacterRange::Singleton(new_from)); 4237 } else { 4238 ranges->Add(CharacterRange(new_from, new_to)); 4239 } 4240 new_from = new_to = chr; 4241 } 4242 } 4243 if (new_to == new_from) { 4244 ranges->Add(CharacterRange::Singleton(new_from)); 4245 } else { 4246 ranges->Add(CharacterRange(new_from, new_to)); 4247 } 4248 } 4249 } 4250 4251 4252 ZoneList<CharacterRange>* CharacterSet::ranges() { 4253 if (ranges_ == NULL) { 4254 ranges_ = new ZoneList<CharacterRange>(2); 4255 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 4256 } 4257 return ranges_; 4258 } 4259 4260 4261 // Move a number of elements in a zonelist to another position 4262 // in the same list. Handles overlapping source and target areas. 4263 static void MoveRanges(ZoneList<CharacterRange>* list, 4264 int from, 4265 int to, 4266 int count) { 4267 // Ranges are potentially overlapping. 4268 if (from < to) { 4269 for (int i = count - 1; i >= 0; i--) { 4270 list->at(to + i) = list->at(from + i); 4271 } 4272 } else { 4273 for (int i = 0; i < count; i++) { 4274 list->at(to + i) = list->at(from + i); 4275 } 4276 } 4277 } 4278 4279 4280 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 4281 int count, 4282 CharacterRange insert) { 4283 // Inserts a range into list[0..count[, which must be sorted 4284 // by from value and non-overlapping and non-adjacent, using at most 4285 // list[0..count] for the result. Returns the number of resulting 4286 // canonicalized ranges. Inserting a range may collapse existing ranges into 4287 // fewer ranges, so the return value can be anything in the range 1..count+1. 4288 uc16 from = insert.from(); 4289 uc16 to = insert.to(); 4290 int start_pos = 0; 4291 int end_pos = count; 4292 for (int i = count - 1; i >= 0; i--) { 4293 CharacterRange current = list->at(i); 4294 if (current.from() > to + 1) { 4295 end_pos = i; 4296 } else if (current.to() + 1 < from) { 4297 start_pos = i + 1; 4298 break; 4299 } 4300 } 4301 4302 // Inserted range overlaps, or is adjacent to, ranges at positions 4303 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 4304 // not affected by the insertion. 4305 // If start_pos == end_pos, the range must be inserted before start_pos. 4306 // if start_pos < end_pos, the entire range from start_pos to end_pos 4307 // must be merged with the insert range. 4308 4309 if (start_pos == end_pos) { 4310 // Insert between existing ranges at position start_pos. 4311 if (start_pos < count) { 4312 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 4313 } 4314 list->at(start_pos) = insert; 4315 return count + 1; 4316 } 4317 if (start_pos + 1 == end_pos) { 4318 // Replace single existing range at position start_pos. 4319 CharacterRange to_replace = list->at(start_pos); 4320 int new_from = Min(to_replace.from(), from); 4321 int new_to = Max(to_replace.to(), to); 4322 list->at(start_pos) = CharacterRange(new_from, new_to); 4323 return count; 4324 } 4325 // Replace a number of existing ranges from start_pos to end_pos - 1. 4326 // Move the remaining ranges down. 4327 4328 int new_from = Min(list->at(start_pos).from(), from); 4329 int new_to = Max(list->at(end_pos - 1).to(), to); 4330 if (end_pos < count) { 4331 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 4332 } 4333 list->at(start_pos) = CharacterRange(new_from, new_to); 4334 return count - (end_pos - start_pos) + 1; 4335 } 4336 4337 4338 void CharacterSet::Canonicalize() { 4339 // Special/default classes are always considered canonical. The result 4340 // of calling ranges() will be sorted. 4341 if (ranges_ == NULL) return; 4342 CharacterRange::Canonicalize(ranges_); 4343 } 4344 4345 4346 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 4347 if (character_ranges->length() <= 1) return; 4348 // Check whether ranges are already canonical (increasing, non-overlapping, 4349 // non-adjacent). 4350 int n = character_ranges->length(); 4351 int max = character_ranges->at(0).to(); 4352 int i = 1; 4353 while (i < n) { 4354 CharacterRange current = character_ranges->at(i); 4355 if (current.from() <= max + 1) { 4356 break; 4357 } 4358 max = current.to(); 4359 i++; 4360 } 4361 // Canonical until the i'th range. If that's all of them, we are done. 4362 if (i == n) return; 4363 4364 // The ranges at index i and forward are not canonicalized. Make them so by 4365 // doing the equivalent of insertion sort (inserting each into the previous 4366 // list, in order). 4367 // Notice that inserting a range can reduce the number of ranges in the 4368 // result due to combining of adjacent and overlapping ranges. 4369 int read = i; // Range to insert. 4370 int num_canonical = i; // Length of canonicalized part of list. 4371 do { 4372 num_canonical = InsertRangeInCanonicalList(character_ranges, 4373 num_canonical, 4374 character_ranges->at(read)); 4375 read++; 4376 } while (read < n); 4377 character_ranges->Rewind(num_canonical); 4378 4379 ASSERT(CharacterRange::IsCanonical(character_ranges)); 4380 } 4381 4382 4383 // Utility function for CharacterRange::Merge. Adds a range at the end of 4384 // a canonicalized range list, if necessary merging the range with the last 4385 // range of the list. 4386 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) { 4387 if (set == NULL) return; 4388 ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); 4389 int n = set->length(); 4390 if (n > 0) { 4391 CharacterRange lastRange = set->at(n - 1); 4392 if (lastRange.to() == range.from() - 1) { 4393 set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); 4394 return; 4395 } 4396 } 4397 set->Add(range); 4398 } 4399 4400 4401 static void AddRangeToSelectedSet(int selector, 4402 ZoneList<CharacterRange>* first_set, 4403 ZoneList<CharacterRange>* second_set, 4404 ZoneList<CharacterRange>* intersection_set, 4405 CharacterRange range) { 4406 switch (selector) { 4407 case kInsideFirst: 4408 AddRangeToSet(first_set, range); 4409 break; 4410 case kInsideSecond: 4411 AddRangeToSet(second_set, range); 4412 break; 4413 case kInsideBoth: 4414 AddRangeToSet(intersection_set, range); 4415 break; 4416 } 4417 } 4418 4419 4420 4421 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set, 4422 ZoneList<CharacterRange>* second_set, 4423 ZoneList<CharacterRange>* first_set_only_out, 4424 ZoneList<CharacterRange>* second_set_only_out, 4425 ZoneList<CharacterRange>* both_sets_out) { 4426 // Inputs are canonicalized. 4427 ASSERT(CharacterRange::IsCanonical(first_set)); 4428 ASSERT(CharacterRange::IsCanonical(second_set)); 4429 // Outputs are empty, if applicable. 4430 ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); 4431 ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); 4432 ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); 4433 4434 // Merge sets by iterating through the lists in order of lowest "from" value, 4435 // and putting intervals into one of three sets. 4436 4437 if (first_set->length() == 0) { 4438 second_set_only_out->AddAll(*second_set); 4439 return; 4440 } 4441 if (second_set->length() == 0) { 4442 first_set_only_out->AddAll(*first_set); 4443 return; 4444 } 4445 // Indices into input lists. 4446 int i1 = 0; 4447 int i2 = 0; 4448 // Cache length of input lists. 4449 int n1 = first_set->length(); 4450 int n2 = second_set->length(); 4451 // Current range. May be invalid if state is kInsideNone. 4452 int from = 0; 4453 int to = -1; 4454 // Where current range comes from. 4455 int state = kInsideNone; 4456 4457 while (i1 < n1 || i2 < n2) { 4458 CharacterRange next_range; 4459 int range_source; 4460 if (i2 == n2 || 4461 (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { 4462 // Next smallest element is in first set. 4463 next_range = first_set->at(i1++); 4464 range_source = kInsideFirst; 4465 } else { 4466 // Next smallest element is in second set. 4467 next_range = second_set->at(i2++); 4468 range_source = kInsideSecond; 4469 } 4470 if (to < next_range.from()) { 4471 // Ranges disjoint: |current| |next| 4472 AddRangeToSelectedSet(state, 4473 first_set_only_out, 4474 second_set_only_out, 4475 both_sets_out, 4476 CharacterRange(from, to)); 4477 from = next_range.from(); 4478 to = next_range.to(); 4479 state = range_source; 4480 } else { 4481 if (from < next_range.from()) { 4482 AddRangeToSelectedSet(state, 4483 first_set_only_out, 4484 second_set_only_out, 4485 both_sets_out, 4486 CharacterRange(from, next_range.from()-1)); 4487 } 4488 if (to < next_range.to()) { 4489 // Ranges overlap: |current| 4490 // |next| 4491 AddRangeToSelectedSet(state | range_source, 4492 first_set_only_out, 4493 second_set_only_out, 4494 both_sets_out, 4495 CharacterRange(next_range.from(), to)); 4496 from = to + 1; 4497 to = next_range.to(); 4498 state = range_source; 4499 } else { 4500 // Range included: |current| , possibly ending at same character. 4501 // |next| 4502 AddRangeToSelectedSet( 4503 state | range_source, 4504 first_set_only_out, 4505 second_set_only_out, 4506 both_sets_out, 4507 CharacterRange(next_range.from(), next_range.to())); 4508 from = next_range.to() + 1; 4509 // If ranges end at same character, both ranges are consumed completely. 4510 if (next_range.to() == to) state = kInsideNone; 4511 } 4512 } 4513 } 4514 AddRangeToSelectedSet(state, 4515 first_set_only_out, 4516 second_set_only_out, 4517 both_sets_out, 4518 CharacterRange(from, to)); 4519 } 4520 4521 4522 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 4523 ZoneList<CharacterRange>* negated_ranges) { 4524 ASSERT(CharacterRange::IsCanonical(ranges)); 4525 ASSERT_EQ(0, negated_ranges->length()); 4526 int range_count = ranges->length(); 4527 uc16 from = 0; 4528 int i = 0; 4529 if (range_count > 0 && ranges->at(0).from() == 0) { 4530 from = ranges->at(0).to(); 4531 i = 1; 4532 } 4533 while (i < range_count) { 4534 CharacterRange range = ranges->at(i); 4535 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1)); 4536 from = range.to(); 4537 i++; 4538 } 4539 if (from < String::kMaxUC16CharCode) { 4540 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode)); 4541 } 4542 } 4543 4544 4545 4546 // ------------------------------------------------------------------- 4547 // Interest propagation 4548 4549 4550 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) { 4551 for (int i = 0; i < siblings_.length(); i++) { 4552 RegExpNode* sibling = siblings_.Get(i); 4553 if (sibling->info()->Matches(info)) 4554 return sibling; 4555 } 4556 return NULL; 4557 } 4558 4559 4560 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) { 4561 ASSERT_EQ(false, *cloned); 4562 siblings_.Ensure(this); 4563 RegExpNode* result = TryGetSibling(info); 4564 if (result != NULL) return result; 4565 result = this->Clone(); 4566 NodeInfo* new_info = result->info(); 4567 new_info->ResetCompilationState(); 4568 new_info->AddFromPreceding(info); 4569 AddSibling(result); 4570 *cloned = true; 4571 return result; 4572 } 4573 4574 4575 template <class C> 4576 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) { 4577 NodeInfo full_info(*node->info()); 4578 full_info.AddFromPreceding(info); 4579 bool cloned = false; 4580 return RegExpNode::EnsureSibling(node, &full_info, &cloned); 4581 } 4582 4583 4584 // ------------------------------------------------------------------- 4585 // Splay tree 4586 4587 4588 OutSet* OutSet::Extend(unsigned value) { 4589 if (Get(value)) 4590 return this; 4591 if (successors() != NULL) { 4592 for (int i = 0; i < successors()->length(); i++) { 4593 OutSet* successor = successors()->at(i); 4594 if (successor->Get(value)) 4595 return successor; 4596 } 4597 } else { 4598 successors_ = new ZoneList<OutSet*>(2); 4599 } 4600 OutSet* result = new OutSet(first_, remaining_); 4601 result->Set(value); 4602 successors()->Add(result); 4603 return result; 4604 } 4605 4606 4607 void OutSet::Set(unsigned value) { 4608 if (value < kFirstLimit) { 4609 first_ |= (1 << value); 4610 } else { 4611 if (remaining_ == NULL) 4612 remaining_ = new ZoneList<unsigned>(1); 4613 if (remaining_->is_empty() || !remaining_->Contains(value)) 4614 remaining_->Add(value); 4615 } 4616 } 4617 4618 4619 bool OutSet::Get(unsigned value) { 4620 if (value < kFirstLimit) { 4621 return (first_ & (1 << value)) != 0; 4622 } else if (remaining_ == NULL) { 4623 return false; 4624 } else { 4625 return remaining_->Contains(value); 4626 } 4627 } 4628 4629 4630 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 4631 const DispatchTable::Entry DispatchTable::Config::kNoValue; 4632 4633 4634 void DispatchTable::AddRange(CharacterRange full_range, int value) { 4635 CharacterRange current = full_range; 4636 if (tree()->is_empty()) { 4637 // If this is the first range we just insert into the table. 4638 ZoneSplayTree<Config>::Locator loc; 4639 ASSERT_RESULT(tree()->Insert(current.from(), &loc)); 4640 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); 4641 return; 4642 } 4643 // First see if there is a range to the left of this one that 4644 // overlaps. 4645 ZoneSplayTree<Config>::Locator loc; 4646 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 4647 Entry* entry = &loc.value(); 4648 // If we've found a range that overlaps with this one, and it 4649 // starts strictly to the left of this one, we have to fix it 4650 // because the following code only handles ranges that start on 4651 // or after the start point of the range we're adding. 4652 if (entry->from() < current.from() && entry->to() >= current.from()) { 4653 // Snap the overlapping range in half around the start point of 4654 // the range we're adding. 4655 CharacterRange left(entry->from(), current.from() - 1); 4656 CharacterRange right(current.from(), entry->to()); 4657 // The left part of the overlapping range doesn't overlap. 4658 // Truncate the whole entry to be just the left part. 4659 entry->set_to(left.to()); 4660 // The right part is the one that overlaps. We add this part 4661 // to the map and let the next step deal with merging it with 4662 // the range we're adding. 4663 ZoneSplayTree<Config>::Locator loc; 4664 ASSERT_RESULT(tree()->Insert(right.from(), &loc)); 4665 loc.set_value(Entry(right.from(), 4666 right.to(), 4667 entry->out_set())); 4668 } 4669 } 4670 while (current.is_valid()) { 4671 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 4672 (loc.value().from() <= current.to()) && 4673 (loc.value().to() >= current.from())) { 4674 Entry* entry = &loc.value(); 4675 // We have overlap. If there is space between the start point of 4676 // the range we're adding and where the overlapping range starts 4677 // then we have to add a range covering just that space. 4678 if (current.from() < entry->from()) { 4679 ZoneSplayTree<Config>::Locator ins; 4680 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4681 ins.set_value(Entry(current.from(), 4682 entry->from() - 1, 4683 empty()->Extend(value))); 4684 current.set_from(entry->from()); 4685 } 4686 ASSERT_EQ(current.from(), entry->from()); 4687 // If the overlapping range extends beyond the one we want to add 4688 // we have to snap the right part off and add it separately. 4689 if (entry->to() > current.to()) { 4690 ZoneSplayTree<Config>::Locator ins; 4691 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); 4692 ins.set_value(Entry(current.to() + 1, 4693 entry->to(), 4694 entry->out_set())); 4695 entry->set_to(current.to()); 4696 } 4697 ASSERT(entry->to() <= current.to()); 4698 // The overlapping range is now completely contained by the range 4699 // we're adding so we can just update it and move the start point 4700 // of the range we're adding just past it. 4701 entry->AddValue(value); 4702 // Bail out if the last interval ended at 0xFFFF since otherwise 4703 // adding 1 will wrap around to 0. 4704 if (entry->to() == String::kMaxUC16CharCode) 4705 break; 4706 ASSERT(entry->to() + 1 > current.from()); 4707 current.set_from(entry->to() + 1); 4708 } else { 4709 // There is no overlap so we can just add the range 4710 ZoneSplayTree<Config>::Locator ins; 4711 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 4712 ins.set_value(Entry(current.from(), 4713 current.to(), 4714 empty()->Extend(value))); 4715 break; 4716 } 4717 } 4718 } 4719 4720 4721 OutSet* DispatchTable::Get(uc16 value) { 4722 ZoneSplayTree<Config>::Locator loc; 4723 if (!tree()->FindGreatestLessThan(value, &loc)) 4724 return empty(); 4725 Entry* entry = &loc.value(); 4726 if (value <= entry->to()) 4727 return entry->out_set(); 4728 else 4729 return empty(); 4730 } 4731 4732 4733 // ------------------------------------------------------------------- 4734 // Analysis 4735 4736 4737 void Analysis::EnsureAnalyzed(RegExpNode* that) { 4738 StackLimitCheck check; 4739 if (check.HasOverflowed()) { 4740 fail("Stack overflow"); 4741 return; 4742 } 4743 if (that->info()->been_analyzed || that->info()->being_analyzed) 4744 return; 4745 that->info()->being_analyzed = true; 4746 that->Accept(this); 4747 that->info()->being_analyzed = false; 4748 that->info()->been_analyzed = true; 4749 } 4750 4751 4752 void Analysis::VisitEnd(EndNode* that) { 4753 // nothing to do 4754 } 4755 4756 4757 void TextNode::CalculateOffsets() { 4758 int element_count = elements()->length(); 4759 // Set up the offsets of the elements relative to the start. This is a fixed 4760 // quantity since a TextNode can only contain fixed-width things. 4761 int cp_offset = 0; 4762 for (int i = 0; i < element_count; i++) { 4763 TextElement& elm = elements()->at(i); 4764 elm.cp_offset = cp_offset; 4765 if (elm.type == TextElement::ATOM) { 4766 cp_offset += elm.data.u_atom->data().length(); 4767 } else { 4768 cp_offset++; 4769 Vector<const uc16> quarks = elm.data.u_atom->data(); 4770 } 4771 } 4772 } 4773 4774 4775 void Analysis::VisitText(TextNode* that) { 4776 if (ignore_case_) { 4777 that->MakeCaseIndependent(is_ascii_); 4778 } 4779 EnsureAnalyzed(that->on_success()); 4780 if (!has_failed()) { 4781 that->CalculateOffsets(); 4782 } 4783 } 4784 4785 4786 void Analysis::VisitAction(ActionNode* that) { 4787 RegExpNode* target = that->on_success(); 4788 EnsureAnalyzed(target); 4789 if (!has_failed()) { 4790 // If the next node is interested in what it follows then this node 4791 // has to be interested too so it can pass the information on. 4792 that->info()->AddFromFollowing(target->info()); 4793 } 4794 } 4795 4796 4797 void Analysis::VisitChoice(ChoiceNode* that) { 4798 NodeInfo* info = that->info(); 4799 for (int i = 0; i < that->alternatives()->length(); i++) { 4800 RegExpNode* node = that->alternatives()->at(i).node(); 4801 EnsureAnalyzed(node); 4802 if (has_failed()) return; 4803 // Anything the following nodes need to know has to be known by 4804 // this node also, so it can pass it on. 4805 info->AddFromFollowing(node->info()); 4806 } 4807 } 4808 4809 4810 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 4811 NodeInfo* info = that->info(); 4812 for (int i = 0; i < that->alternatives()->length(); i++) { 4813 RegExpNode* node = that->alternatives()->at(i).node(); 4814 if (node != that->loop_node()) { 4815 EnsureAnalyzed(node); 4816 if (has_failed()) return; 4817 info->AddFromFollowing(node->info()); 4818 } 4819 } 4820 // Check the loop last since it may need the value of this node 4821 // to get a correct result. 4822 EnsureAnalyzed(that->loop_node()); 4823 if (!has_failed()) { 4824 info->AddFromFollowing(that->loop_node()->info()); 4825 } 4826 } 4827 4828 4829 void Analysis::VisitBackReference(BackReferenceNode* that) { 4830 EnsureAnalyzed(that->on_success()); 4831 } 4832 4833 4834 void Analysis::VisitAssertion(AssertionNode* that) { 4835 EnsureAnalyzed(that->on_success()); 4836 AssertionNode::AssertionNodeType type = that->type(); 4837 if (type == AssertionNode::AT_BOUNDARY || 4838 type == AssertionNode::AT_NON_BOUNDARY) { 4839 // Check if the following character is known to be a word character 4840 // or known to not be a word character. 4841 ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet(); 4842 4843 CharacterRange::Canonicalize(following_chars); 4844 4845 SetRelation word_relation = 4846 CharacterRange::WordCharacterRelation(following_chars); 4847 if (word_relation.Disjoint()) { 4848 // Includes the case where following_chars is empty (e.g., end-of-input). 4849 // Following character is definitely *not* a word character. 4850 type = (type == AssertionNode::AT_BOUNDARY) ? 4851 AssertionNode::AFTER_WORD_CHARACTER : 4852 AssertionNode::AFTER_NONWORD_CHARACTER; 4853 that->set_type(type); 4854 } else if (word_relation.ContainedIn()) { 4855 // Following character is definitely a word character. 4856 type = (type == AssertionNode::AT_BOUNDARY) ? 4857 AssertionNode::AFTER_NONWORD_CHARACTER : 4858 AssertionNode::AFTER_WORD_CHARACTER; 4859 that->set_type(type); 4860 } 4861 } 4862 } 4863 4864 4865 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() { 4866 if (first_character_set_ == NULL) { 4867 if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { 4868 // If we can't find an exact solution within the budget, we 4869 // set the value to the set of every character, i.e., all characters 4870 // are possible. 4871 ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1); 4872 all_set->Add(CharacterRange::Everything()); 4873 first_character_set_ = all_set; 4874 } 4875 } 4876 return first_character_set_; 4877 } 4878 4879 4880 int RegExpNode::ComputeFirstCharacterSet(int budget) { 4881 // Default behavior is to not be able to determine the first character. 4882 return kComputeFirstCharacterSetFail; 4883 } 4884 4885 4886 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { 4887 budget--; 4888 if (budget >= 0) { 4889 // Find loop min-iteration. It's the value of the guarded choice node 4890 // with a GEQ guard, if any. 4891 int min_repetition = 0; 4892 4893 for (int i = 0; i <= 1; i++) { 4894 GuardedAlternative alternative = alternatives()->at(i); 4895 ZoneList<Guard*>* guards = alternative.guards(); 4896 if (guards != NULL && guards->length() > 0) { 4897 Guard* guard = guards->at(0); 4898 if (guard->op() == Guard::GEQ) { 4899 min_repetition = guard->value(); 4900 break; 4901 } 4902 } 4903 } 4904 4905 budget = loop_node()->ComputeFirstCharacterSet(budget); 4906 if (budget >= 0) { 4907 ZoneList<CharacterRange>* character_set = 4908 loop_node()->first_character_set(); 4909 if (body_can_be_zero_length() || min_repetition == 0) { 4910 budget = continue_node()->ComputeFirstCharacterSet(budget); 4911 if (budget < 0) return budget; 4912 ZoneList<CharacterRange>* body_set = 4913 continue_node()->first_character_set(); 4914 ZoneList<CharacterRange>* union_set = 4915 new ZoneList<CharacterRange>(Max(character_set->length(), 4916 body_set->length())); 4917 CharacterRange::Merge(character_set, 4918 body_set, 4919 union_set, 4920 union_set, 4921 union_set); 4922 character_set = union_set; 4923 } 4924 set_first_character_set(character_set); 4925 } 4926 } 4927 return budget; 4928 } 4929 4930 4931 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { 4932 budget--; 4933 if (budget >= 0) { 4934 GuardedAlternative successor = this->alternatives()->at(1); 4935 RegExpNode* successor_node = successor.node(); 4936 budget = successor_node->ComputeFirstCharacterSet(budget); 4937 if (budget >= 0) { 4938 set_first_character_set(successor_node->first_character_set()); 4939 } 4940 } 4941 return budget; 4942 } 4943 4944 4945 // The first character set of an EndNode is unknowable. Just use the 4946 // default implementation that fails and returns all characters as possible. 4947 4948 4949 int AssertionNode::ComputeFirstCharacterSet(int budget) { 4950 budget -= 1; 4951 if (budget >= 0) { 4952 switch (type_) { 4953 case AT_END: { 4954 set_first_character_set(new ZoneList<CharacterRange>(0)); 4955 break; 4956 } 4957 case AT_START: 4958 case AT_BOUNDARY: 4959 case AT_NON_BOUNDARY: 4960 case AFTER_NEWLINE: 4961 case AFTER_NONWORD_CHARACTER: 4962 case AFTER_WORD_CHARACTER: { 4963 ASSERT_NOT_NULL(on_success()); 4964 budget = on_success()->ComputeFirstCharacterSet(budget); 4965 set_first_character_set(on_success()->first_character_set()); 4966 break; 4967 } 4968 } 4969 } 4970 return budget; 4971 } 4972 4973 4974 int ActionNode::ComputeFirstCharacterSet(int budget) { 4975 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; 4976 budget--; 4977 if (budget >= 0) { 4978 ASSERT_NOT_NULL(on_success()); 4979 budget = on_success()->ComputeFirstCharacterSet(budget); 4980 if (budget >= 0) { 4981 set_first_character_set(on_success()->first_character_set()); 4982 } 4983 } 4984 return budget; 4985 } 4986 4987 4988 int BackReferenceNode::ComputeFirstCharacterSet(int budget) { 4989 // We don't know anything about the first character of a backreference 4990 // at this point. 4991 return kComputeFirstCharacterSetFail; 4992 } 4993 4994 4995 int TextNode::ComputeFirstCharacterSet(int budget) { 4996 budget--; 4997 if (budget >= 0) { 4998 ASSERT_NE(0, elements()->length()); 4999 TextElement text = elements()->at(0); 5000 if (text.type == TextElement::ATOM) { 5001 RegExpAtom* atom = text.data.u_atom; 5002 ASSERT_NE(0, atom->length()); 5003 uc16 first_char = atom->data()[0]; 5004 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); 5005 range->Add(CharacterRange(first_char, first_char)); 5006 set_first_character_set(range); 5007 } else { 5008 ASSERT(text.type == TextElement::CHAR_CLASS); 5009 RegExpCharacterClass* char_class = text.data.u_char_class; 5010 if (char_class->is_negated()) { 5011 ZoneList<CharacterRange>* ranges = char_class->ranges(); 5012 int length = ranges->length(); 5013 int new_length = length + 1; 5014 if (length > 0) { 5015 if (ranges->at(0).from() == 0) new_length--; 5016 if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) { 5017 new_length--; 5018 } 5019 } 5020 ZoneList<CharacterRange>* negated_ranges = 5021 new ZoneList<CharacterRange>(new_length); 5022 CharacterRange::Negate(ranges, negated_ranges); 5023 set_first_character_set(negated_ranges); 5024 } else { 5025 set_first_character_set(char_class->ranges()); 5026 } 5027 } 5028 } 5029 return budget; 5030 } 5031 5032 5033 5034 // ------------------------------------------------------------------- 5035 // Dispatch table construction 5036 5037 5038 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5039 AddRange(CharacterRange::Everything()); 5040 } 5041 5042 5043 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5044 node->set_being_calculated(true); 5045 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5046 for (int i = 0; i < alternatives->length(); i++) { 5047 set_choice_index(i); 5048 alternatives->at(i).node()->Accept(this); 5049 } 5050 node->set_being_calculated(false); 5051 } 5052 5053 5054 class AddDispatchRange { 5055 public: 5056 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5057 : constructor_(constructor) { } 5058 void Call(uc32 from, DispatchTable::Entry entry); 5059 private: 5060 DispatchTableConstructor* constructor_; 5061 }; 5062 5063 5064 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5065 CharacterRange range(from, entry.to()); 5066 constructor_->AddRange(range); 5067 } 5068 5069 5070 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5071 if (node->being_calculated()) 5072 return; 5073 DispatchTable* table = node->GetTable(ignore_case_); 5074 AddDispatchRange adder(this); 5075 table->ForEach(&adder); 5076 } 5077 5078 5079 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5080 // TODO(160): Find the node that we refer back to and propagate its start 5081 // set back to here. For now we just accept anything. 5082 AddRange(CharacterRange::Everything()); 5083 } 5084 5085 5086 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5087 RegExpNode* target = that->on_success(); 5088 target->Accept(this); 5089 } 5090 5091 5092 static int CompareRangeByFrom(const CharacterRange* a, 5093 const CharacterRange* b) { 5094 return Compare<uc16>(a->from(), b->from()); 5095 } 5096 5097 5098 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5099 ranges->Sort(CompareRangeByFrom); 5100 uc16 last = 0; 5101 for (int i = 0; i < ranges->length(); i++) { 5102 CharacterRange range = ranges->at(i); 5103 if (last < range.from()) 5104 AddRange(CharacterRange(last, range.from() - 1)); 5105 if (range.to() >= last) { 5106 if (range.to() == String::kMaxUC16CharCode) { 5107 return; 5108 } else { 5109 last = range.to() + 1; 5110 } 5111 } 5112 } 5113 AddRange(CharacterRange(last, String::kMaxUC16CharCode)); 5114 } 5115 5116 5117 void DispatchTableConstructor::VisitText(TextNode* that) { 5118 TextElement elm = that->elements()->at(0); 5119 switch (elm.type) { 5120 case TextElement::ATOM: { 5121 uc16 c = elm.data.u_atom->data()[0]; 5122 AddRange(CharacterRange(c, c)); 5123 break; 5124 } 5125 case TextElement::CHAR_CLASS: { 5126 RegExpCharacterClass* tree = elm.data.u_char_class; 5127 ZoneList<CharacterRange>* ranges = tree->ranges(); 5128 if (tree->is_negated()) { 5129 AddInverse(ranges); 5130 } else { 5131 for (int i = 0; i < ranges->length(); i++) 5132 AddRange(ranges->at(i)); 5133 } 5134 break; 5135 } 5136 default: { 5137 UNIMPLEMENTED(); 5138 } 5139 } 5140 } 5141 5142 5143 void DispatchTableConstructor::VisitAction(ActionNode* that) { 5144 RegExpNode* target = that->on_success(); 5145 target->Accept(this); 5146 } 5147 5148 5149 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, 5150 bool ignore_case, 5151 bool is_multiline, 5152 Handle<String> pattern, 5153 bool is_ascii) { 5154 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 5155 return IrregexpRegExpTooBig(); 5156 } 5157 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); 5158 // Wrap the body of the regexp in capture #0. 5159 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 5160 0, 5161 &compiler, 5162 compiler.accept()); 5163 RegExpNode* node = captured_body; 5164 if (!data->tree->IsAnchored()) { 5165 // Add a .*? at the beginning, outside the body capture, unless 5166 // this expression is anchored at the beginning. 5167 RegExpNode* loop_node = 5168 RegExpQuantifier::ToNode(0, 5169 RegExpTree::kInfinity, 5170 false, 5171 new RegExpCharacterClass('*'), 5172 &compiler, 5173 captured_body, 5174 data->contains_anchor); 5175 5176 if (data->contains_anchor) { 5177 // Unroll loop once, to take care of the case that might start 5178 // at the start of input. 5179 ChoiceNode* first_step_node = new ChoiceNode(2); 5180 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 5181 first_step_node->AddAlternative(GuardedAlternative( 5182 new TextNode(new RegExpCharacterClass('*'), loop_node))); 5183 node = first_step_node; 5184 } else { 5185 node = loop_node; 5186 } 5187 } 5188 data->node = node; 5189 Analysis analysis(ignore_case, is_ascii); 5190 analysis.EnsureAnalyzed(node); 5191 if (analysis.has_failed()) { 5192 const char* error_message = analysis.error_message(); 5193 return CompilationResult(error_message); 5194 } 5195 5196 NodeInfo info = *node->info(); 5197 5198 // Create the correct assembler for the architecture. 5199 #ifdef V8_NATIVE_REGEXP 5200 // Native regexp implementation. 5201 5202 NativeRegExpMacroAssembler::Mode mode = 5203 is_ascii ? NativeRegExpMacroAssembler::ASCII 5204 : NativeRegExpMacroAssembler::UC16; 5205 5206 #if V8_TARGET_ARCH_IA32 5207 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2); 5208 #elif V8_TARGET_ARCH_X64 5209 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2); 5210 #elif V8_TARGET_ARCH_ARM 5211 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2); 5212 #endif 5213 5214 #else // ! V8_NATIVE_REGEXP 5215 // Interpreted regexp implementation. 5216 EmbeddedVector<byte, 1024> codes; 5217 RegExpMacroAssemblerIrregexp macro_assembler(codes); 5218 #endif 5219 5220 return compiler.Assemble(¯o_assembler, 5221 node, 5222 data->capture_count, 5223 pattern); 5224 } 5225 5226 5227 int OffsetsVector::static_offsets_vector_[ 5228 OffsetsVector::kStaticOffsetsVectorSize]; 5229 5230 }} // namespace v8::internal 5231