1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "compiler.h" 32 #include "execution.h" 33 #include "factory.h" 34 #include "jsregexp.h" 35 #include "jsregexp-inl.h" 36 #include "platform.h" 37 #include "string-search.h" 38 #include "runtime.h" 39 #include "compilation-cache.h" 40 #include "string-stream.h" 41 #include "parser.h" 42 #include "regexp-macro-assembler.h" 43 #include "regexp-macro-assembler-tracer.h" 44 #include "regexp-macro-assembler-irregexp.h" 45 #include "regexp-stack.h" 46 47 #ifndef V8_INTERPRETED_REGEXP 48 #if V8_TARGET_ARCH_IA32 49 #include "ia32/regexp-macro-assembler-ia32.h" 50 #elif V8_TARGET_ARCH_X64 51 #include "x64/regexp-macro-assembler-x64.h" 52 #elif V8_TARGET_ARCH_ARM 53 #include "arm/regexp-macro-assembler-arm.h" 54 #elif V8_TARGET_ARCH_MIPS 55 #include "mips/regexp-macro-assembler-mips.h" 56 #else 57 #error Unsupported target architecture. 58 #endif 59 #endif 60 61 #include "interpreter-irregexp.h" 62 63 64 namespace v8 { 65 namespace internal { 66 67 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor, 68 Handle<String> pattern, 69 Handle<String> flags, 70 bool* has_pending_exception) { 71 // Call the construct code with 2 arguments. 72 Handle<Object> argv[] = { pattern, flags }; 73 return Execution::New(constructor, ARRAY_SIZE(argv), argv, 74 has_pending_exception); 75 } 76 77 78 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 79 int flags = JSRegExp::NONE; 80 for (int i = 0; i < str->length(); i++) { 81 switch (str->Get(i)) { 82 case 'i': 83 flags |= JSRegExp::IGNORE_CASE; 84 break; 85 case 'g': 86 flags |= JSRegExp::GLOBAL; 87 break; 88 case 'm': 89 flags |= JSRegExp::MULTILINE; 90 break; 91 } 92 } 93 return JSRegExp::Flags(flags); 94 } 95 96 97 static inline void ThrowRegExpException(Handle<JSRegExp> re, 98 Handle<String> pattern, 99 Handle<String> error_text, 100 const char* message) { 101 Isolate* isolate = re->GetIsolate(); 102 Factory* factory = isolate->factory(); 103 Handle<FixedArray> elements = factory->NewFixedArray(2); 104 elements->set(0, *pattern); 105 elements->set(1, *error_text); 106 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 107 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 108 isolate->Throw(*regexp_err); 109 } 110 111 112 ContainedInLattice AddRange(ContainedInLattice containment, 113 const int* ranges, 114 int ranges_length, 115 Interval new_range) { 116 ASSERT((ranges_length & 1) == 1); 117 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); 118 if (containment == kLatticeUnknown) return containment; 119 bool inside = false; 120 int last = 0; 121 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 122 // Consider the range from last to ranges[i]. 123 // We haven't got to the new range yet. 124 if (ranges[i] <= new_range.from()) continue; 125 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 126 // inclusive, but the values in ranges are not. 127 if (last <= new_range.from() && new_range.to() < ranges[i]) { 128 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 129 } 130 return kLatticeUnknown; 131 } 132 return containment; 133 } 134 135 136 // More makes code generation slower, less makes V8 benchmark score lower. 137 const int kMaxLookaheadForBoyerMoore = 8; 138 // In a 3-character pattern you can maximally step forwards 3 characters 139 // at a time, which is not always enough to pay for the extra logic. 140 const int kPatternTooShortForBoyerMoore = 2; 141 142 143 // Identifies the sort of regexps where the regexp engine is faster 144 // than the code used for atom matches. 145 static bool HasFewDifferentCharacters(Handle<String> pattern) { 146 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 147 if (length <= kPatternTooShortForBoyerMoore) return false; 148 const int kMod = 128; 149 bool character_found[kMod]; 150 int different = 0; 151 memset(&character_found[0], 0, sizeof(character_found)); 152 for (int i = 0; i < length; i++) { 153 int ch = (pattern->Get(i) & (kMod - 1)); 154 if (!character_found[ch]) { 155 character_found[ch] = true; 156 different++; 157 // We declare a regexp low-alphabet if it has at least 3 times as many 158 // characters as it has different characters. 159 if (different * 3 > length) return false; 160 } 161 } 162 return true; 163 } 164 165 166 // Generic RegExp methods. Dispatches to implementation specific methods. 167 168 169 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 170 Handle<String> pattern, 171 Handle<String> flag_str) { 172 Isolate* isolate = re->GetIsolate(); 173 Zone zone(isolate); 174 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 175 CompilationCache* compilation_cache = isolate->compilation_cache(); 176 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags); 177 bool in_cache = !cached.is_null(); 178 LOG(isolate, RegExpCompileEvent(re, in_cache)); 179 180 Handle<Object> result; 181 if (in_cache) { 182 re->set_data(*cached); 183 return re; 184 } 185 pattern = FlattenGetString(pattern); 186 PostponeInterruptsScope postpone(isolate); 187 RegExpCompileData parse_result; 188 FlatStringReader reader(isolate, pattern); 189 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 190 &parse_result, &zone)) { 191 // Throw an exception if we fail to parse the pattern. 192 ThrowRegExpException(re, 193 pattern, 194 parse_result.error, 195 "malformed_regexp"); 196 return Handle<Object>::null(); 197 } 198 199 bool has_been_compiled = false; 200 201 if (parse_result.simple && 202 !flags.is_ignore_case() && 203 !HasFewDifferentCharacters(pattern)) { 204 // Parse-tree is a single atom that is equal to the pattern. 205 AtomCompile(re, pattern, flags, pattern); 206 has_been_compiled = true; 207 } else if (parse_result.tree->IsAtom() && 208 !flags.is_ignore_case() && 209 parse_result.capture_count == 0) { 210 RegExpAtom* atom = parse_result.tree->AsAtom(); 211 Vector<const uc16> atom_pattern = atom->data(); 212 Handle<String> atom_string = 213 isolate->factory()->NewStringFromTwoByte(atom_pattern); 214 if (!HasFewDifferentCharacters(atom_string)) { 215 AtomCompile(re, pattern, flags, atom_string); 216 has_been_compiled = true; 217 } 218 } 219 if (!has_been_compiled) { 220 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 221 } 222 ASSERT(re->data()->IsFixedArray()); 223 // Compilation succeeded so the data is set on the regexp 224 // and we can store it in the cache. 225 Handle<FixedArray> data(FixedArray::cast(re->data())); 226 compilation_cache->PutRegExp(pattern, flags, data); 227 228 return re; 229 } 230 231 232 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 233 Handle<String> subject, 234 int index, 235 Handle<JSArray> last_match_info) { 236 switch (regexp->TypeTag()) { 237 case JSRegExp::ATOM: 238 return AtomExec(regexp, subject, index, last_match_info); 239 case JSRegExp::IRREGEXP: { 240 Handle<Object> result = 241 IrregexpExec(regexp, subject, index, last_match_info); 242 ASSERT(!result.is_null() || 243 regexp->GetIsolate()->has_pending_exception()); 244 return result; 245 } 246 default: 247 UNREACHABLE(); 248 return Handle<Object>::null(); 249 } 250 } 251 252 253 // RegExp Atom implementation: Simple string search using indexOf. 254 255 256 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 257 Handle<String> pattern, 258 JSRegExp::Flags flags, 259 Handle<String> match_pattern) { 260 re->GetIsolate()->factory()->SetRegExpAtomData(re, 261 JSRegExp::ATOM, 262 pattern, 263 flags, 264 match_pattern); 265 } 266 267 268 static void SetAtomLastCapture(FixedArray* array, 269 String* subject, 270 int from, 271 int to) { 272 SealHandleScope shs(array->GetIsolate()); 273 RegExpImpl::SetLastCaptureCount(array, 2); 274 RegExpImpl::SetLastSubject(array, subject); 275 RegExpImpl::SetLastInput(array, subject); 276 RegExpImpl::SetCapture(array, 0, from); 277 RegExpImpl::SetCapture(array, 1, to); 278 } 279 280 281 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 282 Handle<String> subject, 283 int index, 284 int32_t* output, 285 int output_size) { 286 Isolate* isolate = regexp->GetIsolate(); 287 288 ASSERT(0 <= index); 289 ASSERT(index <= subject->length()); 290 291 if (!subject->IsFlat()) FlattenString(subject); 292 DisallowHeapAllocation no_gc; // ensure vectors stay valid 293 294 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 295 int needle_len = needle->length(); 296 ASSERT(needle->IsFlat()); 297 ASSERT_LT(0, needle_len); 298 299 if (index + needle_len > subject->length()) { 300 return RegExpImpl::RE_FAILURE; 301 } 302 303 for (int i = 0; i < output_size; i += 2) { 304 String::FlatContent needle_content = needle->GetFlatContent(); 305 String::FlatContent subject_content = subject->GetFlatContent(); 306 ASSERT(needle_content.IsFlat()); 307 ASSERT(subject_content.IsFlat()); 308 // dispatch on type of strings 309 index = (needle_content.IsAscii() 310 ? (subject_content.IsAscii() 311 ? SearchString(isolate, 312 subject_content.ToOneByteVector(), 313 needle_content.ToOneByteVector(), 314 index) 315 : SearchString(isolate, 316 subject_content.ToUC16Vector(), 317 needle_content.ToOneByteVector(), 318 index)) 319 : (subject_content.IsAscii() 320 ? SearchString(isolate, 321 subject_content.ToOneByteVector(), 322 needle_content.ToUC16Vector(), 323 index) 324 : SearchString(isolate, 325 subject_content.ToUC16Vector(), 326 needle_content.ToUC16Vector(), 327 index))); 328 if (index == -1) { 329 return i / 2; // Return number of matches. 330 } else { 331 output[i] = index; 332 output[i+1] = index + needle_len; 333 index += needle_len; 334 } 335 } 336 return output_size / 2; 337 } 338 339 340 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 341 Handle<String> subject, 342 int index, 343 Handle<JSArray> last_match_info) { 344 Isolate* isolate = re->GetIsolate(); 345 346 static const int kNumRegisters = 2; 347 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 348 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 349 350 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 351 352 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 353 354 ASSERT_EQ(res, RegExpImpl::RE_SUCCESS); 355 SealHandleScope shs(isolate); 356 FixedArray* array = FixedArray::cast(last_match_info->elements()); 357 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); 358 return last_match_info; 359 } 360 361 362 // Irregexp implementation. 363 364 // Ensures that the regexp object contains a compiled version of the 365 // source for either ASCII or non-ASCII strings. 366 // If the compiled version doesn't already exist, it is compiled 367 // from the source pattern. 368 // If compilation fails, an exception is thrown and this function 369 // returns false. 370 bool RegExpImpl::EnsureCompiledIrregexp( 371 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { 372 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 373 #ifdef V8_INTERPRETED_REGEXP 374 if (compiled_code->IsByteArray()) return true; 375 #else // V8_INTERPRETED_REGEXP (RegExp native code) 376 if (compiled_code->IsCode()) return true; 377 #endif 378 // We could potentially have marked this as flushable, but have kept 379 // a saved version if we did not flush it yet. 380 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 381 if (saved_code->IsCode()) { 382 // Reinstate the code in the original place. 383 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); 384 ASSERT(compiled_code->IsSmi()); 385 return true; 386 } 387 return CompileIrregexp(re, sample_subject, is_ascii); 388 } 389 390 391 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 392 bool is_ascii, 393 Handle<String> error_message, 394 Isolate* isolate) { 395 Factory* factory = isolate->factory(); 396 Handle<FixedArray> elements = factory->NewFixedArray(2); 397 elements->set(0, re->Pattern()); 398 elements->set(1, *error_message); 399 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 400 Handle<Object> regexp_err = 401 factory->NewSyntaxError("malformed_regexp", array); 402 isolate->Throw(*regexp_err); 403 return false; 404 } 405 406 407 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 408 Handle<String> sample_subject, 409 bool is_ascii) { 410 // Compile the RegExp. 411 Isolate* isolate = re->GetIsolate(); 412 Zone zone(isolate); 413 PostponeInterruptsScope postpone(isolate); 414 // If we had a compilation error the last time this is saved at the 415 // saved code index. 416 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 417 // When arriving here entry can only be a smi, either representing an 418 // uncompiled regexp, a previous compilation error, or code that has 419 // been flushed. 420 ASSERT(entry->IsSmi()); 421 int entry_value = Smi::cast(entry)->value(); 422 ASSERT(entry_value == JSRegExp::kUninitializedValue || 423 entry_value == JSRegExp::kCompilationErrorValue || 424 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 425 426 if (entry_value == JSRegExp::kCompilationErrorValue) { 427 // A previous compilation failed and threw an error which we store in 428 // the saved code index (we store the error message, not the actual 429 // error). Recreate the error object and throw it. 430 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 431 ASSERT(error_string->IsString()); 432 Handle<String> error_message(String::cast(error_string)); 433 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 434 return false; 435 } 436 437 JSRegExp::Flags flags = re->GetFlags(); 438 439 Handle<String> pattern(re->Pattern()); 440 if (!pattern->IsFlat()) FlattenString(pattern); 441 RegExpCompileData compile_data; 442 FlatStringReader reader(isolate, pattern); 443 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 444 &compile_data, 445 &zone)) { 446 // Throw an exception if we fail to parse the pattern. 447 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 448 ThrowRegExpException(re, 449 pattern, 450 compile_data.error, 451 "malformed_regexp"); 452 return false; 453 } 454 RegExpEngine::CompilationResult result = 455 RegExpEngine::Compile(&compile_data, 456 flags.is_ignore_case(), 457 flags.is_global(), 458 flags.is_multiline(), 459 pattern, 460 sample_subject, 461 is_ascii, 462 &zone); 463 if (result.error_message != NULL) { 464 // Unable to compile regexp. 465 Handle<String> error_message = 466 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); 467 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 468 return false; 469 } 470 471 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 472 data->set(JSRegExp::code_index(is_ascii), result.code); 473 int register_max = IrregexpMaxRegisterCount(*data); 474 if (result.num_registers > register_max) { 475 SetIrregexpMaxRegisterCount(*data, result.num_registers); 476 } 477 478 return true; 479 } 480 481 482 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 483 return Smi::cast( 484 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 485 } 486 487 488 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 489 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 490 } 491 492 493 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 494 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 495 } 496 497 498 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 499 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 500 } 501 502 503 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 504 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 505 } 506 507 508 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 509 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 510 } 511 512 513 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 514 Handle<String> pattern, 515 JSRegExp::Flags flags, 516 int capture_count) { 517 // Initialize compiled code entries to null. 518 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 519 JSRegExp::IRREGEXP, 520 pattern, 521 flags, 522 capture_count); 523 } 524 525 526 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 527 Handle<String> subject) { 528 if (!subject->IsFlat()) FlattenString(subject); 529 530 // Check the asciiness of the underlying storage. 531 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); 532 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; 533 534 #ifdef V8_INTERPRETED_REGEXP 535 // Byte-code regexp needs space allocated for all its registers. 536 // The result captures are copied to the start of the registers array 537 // if the match succeeds. This way those registers are not clobbered 538 // when we set the last match info from last successful match. 539 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 540 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 541 #else // V8_INTERPRETED_REGEXP 542 // Native regexp only needs room to output captures. Registers are handled 543 // internally. 544 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 545 #endif // V8_INTERPRETED_REGEXP 546 } 547 548 549 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 550 Handle<String> subject, 551 int index, 552 int32_t* output, 553 int output_size) { 554 Isolate* isolate = regexp->GetIsolate(); 555 556 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 557 558 ASSERT(index >= 0); 559 ASSERT(index <= subject->length()); 560 ASSERT(subject->IsFlat()); 561 562 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); 563 564 #ifndef V8_INTERPRETED_REGEXP 565 ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 566 do { 567 EnsureCompiledIrregexp(regexp, subject, is_ascii); 568 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 569 // The stack is used to allocate registers for the compiled regexp code. 570 // This means that in case of failure, the output registers array is left 571 // untouched and contains the capture results from the previous successful 572 // match. We can use that to set the last match info lazily. 573 NativeRegExpMacroAssembler::Result res = 574 NativeRegExpMacroAssembler::Match(code, 575 subject, 576 output, 577 output_size, 578 index, 579 isolate); 580 if (res != NativeRegExpMacroAssembler::RETRY) { 581 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 582 isolate->has_pending_exception()); 583 STATIC_ASSERT( 584 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 585 STATIC_ASSERT( 586 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 587 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 588 == RE_EXCEPTION); 589 return static_cast<IrregexpResult>(res); 590 } 591 // If result is RETRY, the string has changed representation, and we 592 // must restart from scratch. 593 // In this case, it means we must make sure we are prepared to handle 594 // the, potentially, different subject (the string can switch between 595 // being internal and external, and even between being ASCII and UC16, 596 // but the characters are always the same). 597 IrregexpPrepare(regexp, subject); 598 is_ascii = subject->IsOneByteRepresentationUnderneath(); 599 } while (true); 600 UNREACHABLE(); 601 return RE_EXCEPTION; 602 #else // V8_INTERPRETED_REGEXP 603 604 ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp)); 605 // We must have done EnsureCompiledIrregexp, so we can get the number of 606 // registers. 607 int number_of_capture_registers = 608 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 609 int32_t* raw_output = &output[number_of_capture_registers]; 610 // We do not touch the actual capture result registers until we know there 611 // has been a match so that we can use those capture results to set the 612 // last match info. 613 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 614 raw_output[i] = -1; 615 } 616 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); 617 618 IrregexpResult result = IrregexpInterpreter::Match(isolate, 619 byte_codes, 620 subject, 621 raw_output, 622 index); 623 if (result == RE_SUCCESS) { 624 // Copy capture results to the start of the registers array. 625 OS::MemCopy( 626 output, raw_output, number_of_capture_registers * sizeof(int32_t)); 627 } 628 if (result == RE_EXCEPTION) { 629 ASSERT(!isolate->has_pending_exception()); 630 isolate->StackOverflow(); 631 } 632 return result; 633 #endif // V8_INTERPRETED_REGEXP 634 } 635 636 637 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, 638 Handle<String> subject, 639 int previous_index, 640 Handle<JSArray> last_match_info) { 641 Isolate* isolate = regexp->GetIsolate(); 642 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 643 644 // Prepare space for the return values. 645 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 646 if (FLAG_trace_regexp_bytecodes) { 647 String* pattern = regexp->Pattern(); 648 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); 649 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString())); 650 } 651 #endif 652 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 653 if (required_registers < 0) { 654 // Compiling failed with an exception. 655 ASSERT(isolate->has_pending_exception()); 656 return Handle<Object>::null(); 657 } 658 659 int32_t* output_registers = NULL; 660 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 661 output_registers = NewArray<int32_t>(required_registers); 662 } 663 SmartArrayPointer<int32_t> auto_release(output_registers); 664 if (output_registers == NULL) { 665 output_registers = isolate->jsregexp_static_offsets_vector(); 666 } 667 668 int res = RegExpImpl::IrregexpExecRaw( 669 regexp, subject, previous_index, output_registers, required_registers); 670 if (res == RE_SUCCESS) { 671 int capture_count = 672 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 673 return SetLastMatchInfo( 674 last_match_info, subject, capture_count, output_registers); 675 } 676 if (res == RE_EXCEPTION) { 677 ASSERT(isolate->has_pending_exception()); 678 return Handle<Object>::null(); 679 } 680 ASSERT(res == RE_FAILURE); 681 return isolate->factory()->null_value(); 682 } 683 684 685 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, 686 Handle<String> subject, 687 int capture_count, 688 int32_t* match) { 689 ASSERT(last_match_info->HasFastObjectElements()); 690 int capture_register_count = (capture_count + 1) * 2; 691 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead); 692 DisallowHeapAllocation no_allocation; 693 FixedArray* array = FixedArray::cast(last_match_info->elements()); 694 if (match != NULL) { 695 for (int i = 0; i < capture_register_count; i += 2) { 696 SetCapture(array, i, match[i]); 697 SetCapture(array, i + 1, match[i + 1]); 698 } 699 } 700 SetLastCaptureCount(array, capture_register_count); 701 SetLastSubject(array, *subject); 702 SetLastInput(array, *subject); 703 return last_match_info; 704 } 705 706 707 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 708 Handle<String> subject, 709 bool is_global, 710 Isolate* isolate) 711 : register_array_(NULL), 712 register_array_size_(0), 713 regexp_(regexp), 714 subject_(subject) { 715 #ifdef V8_INTERPRETED_REGEXP 716 bool interpreted = true; 717 #else 718 bool interpreted = false; 719 #endif // V8_INTERPRETED_REGEXP 720 721 if (regexp_->TypeTag() == JSRegExp::ATOM) { 722 static const int kAtomRegistersPerMatch = 2; 723 registers_per_match_ = kAtomRegistersPerMatch; 724 // There is no distinction between interpreted and native for atom regexps. 725 interpreted = false; 726 } else { 727 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 728 if (registers_per_match_ < 0) { 729 num_matches_ = -1; // Signal exception. 730 return; 731 } 732 } 733 734 if (is_global && !interpreted) { 735 register_array_size_ = 736 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 737 max_matches_ = register_array_size_ / registers_per_match_; 738 } else { 739 // Global loop in interpreted regexp is not implemented. We choose 740 // the size of the offsets vector so that it can only store one match. 741 register_array_size_ = registers_per_match_; 742 max_matches_ = 1; 743 } 744 745 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 746 register_array_ = NewArray<int32_t>(register_array_size_); 747 } else { 748 register_array_ = isolate->jsregexp_static_offsets_vector(); 749 } 750 751 // Set state so that fetching the results the first time triggers a call 752 // to the compiled regexp. 753 current_match_index_ = max_matches_ - 1; 754 num_matches_ = max_matches_; 755 ASSERT(registers_per_match_ >= 2); // Each match has at least one capture. 756 ASSERT_GE(register_array_size_, registers_per_match_); 757 int32_t* last_match = 758 ®ister_array_[current_match_index_ * registers_per_match_]; 759 last_match[0] = -1; 760 last_match[1] = 0; 761 } 762 763 764 // ------------------------------------------------------------------- 765 // Implementation of the Irregexp regular expression engine. 766 // 767 // The Irregexp regular expression engine is intended to be a complete 768 // implementation of ECMAScript regular expressions. It generates either 769 // bytecodes or native code. 770 771 // The Irregexp regexp engine is structured in three steps. 772 // 1) The parser generates an abstract syntax tree. See ast.cc. 773 // 2) From the AST a node network is created. The nodes are all 774 // subclasses of RegExpNode. The nodes represent states when 775 // executing a regular expression. Several optimizations are 776 // performed on the node network. 777 // 3) From the nodes we generate either byte codes or native code 778 // that can actually execute the regular expression (perform 779 // the search). The code generation step is described in more 780 // detail below. 781 782 // Code generation. 783 // 784 // The nodes are divided into four main categories. 785 // * Choice nodes 786 // These represent places where the regular expression can 787 // match in more than one way. For example on entry to an 788 // alternation (foo|bar) or a repetition (*, +, ? or {}). 789 // * Action nodes 790 // These represent places where some action should be 791 // performed. Examples include recording the current position 792 // in the input string to a register (in order to implement 793 // captures) or other actions on register for example in order 794 // to implement the counters needed for {} repetitions. 795 // * Matching nodes 796 // These attempt to match some element part of the input string. 797 // Examples of elements include character classes, plain strings 798 // or back references. 799 // * End nodes 800 // These are used to implement the actions required on finding 801 // a successful match or failing to find a match. 802 // 803 // The code generated (whether as byte codes or native code) maintains 804 // some state as it runs. This consists of the following elements: 805 // 806 // * The capture registers. Used for string captures. 807 // * Other registers. Used for counters etc. 808 // * The current position. 809 // * The stack of backtracking information. Used when a matching node 810 // fails to find a match and needs to try an alternative. 811 // 812 // Conceptual regular expression execution model: 813 // 814 // There is a simple conceptual model of regular expression execution 815 // which will be presented first. The actual code generated is a more 816 // efficient simulation of the simple conceptual model: 817 // 818 // * Choice nodes are implemented as follows: 819 // For each choice except the last { 820 // push current position 821 // push backtrack code location 822 // <generate code to test for choice> 823 // backtrack code location: 824 // pop current position 825 // } 826 // <generate code to test for last choice> 827 // 828 // * Actions nodes are generated as follows 829 // <push affected registers on backtrack stack> 830 // <generate code to perform action> 831 // push backtrack code location 832 // <generate code to test for following nodes> 833 // backtrack code location: 834 // <pop affected registers to restore their state> 835 // <pop backtrack location from stack and go to it> 836 // 837 // * Matching nodes are generated as follows: 838 // if input string matches at current position 839 // update current position 840 // <generate code to test for following nodes> 841 // else 842 // <pop backtrack location from stack and go to it> 843 // 844 // Thus it can be seen that the current position is saved and restored 845 // by the choice nodes, whereas the registers are saved and restored by 846 // by the action nodes that manipulate them. 847 // 848 // The other interesting aspect of this model is that nodes are generated 849 // at the point where they are needed by a recursive call to Emit(). If 850 // the node has already been code generated then the Emit() call will 851 // generate a jump to the previously generated code instead. In order to 852 // limit recursion it is possible for the Emit() function to put the node 853 // on a work list for later generation and instead generate a jump. The 854 // destination of the jump is resolved later when the code is generated. 855 // 856 // Actual regular expression code generation. 857 // 858 // Code generation is actually more complicated than the above. In order 859 // to improve the efficiency of the generated code some optimizations are 860 // performed 861 // 862 // * Choice nodes have 1-character lookahead. 863 // A choice node looks at the following character and eliminates some of 864 // the choices immediately based on that character. This is not yet 865 // implemented. 866 // * Simple greedy loops store reduced backtracking information. 867 // A quantifier like /.*foo/m will greedily match the whole input. It will 868 // then need to backtrack to a point where it can match "foo". The naive 869 // implementation of this would push each character position onto the 870 // backtracking stack, then pop them off one by one. This would use space 871 // proportional to the length of the input string. However since the "." 872 // can only match in one way and always has a constant length (in this case 873 // of 1) it suffices to store the current position on the top of the stack 874 // once. Matching now becomes merely incrementing the current position and 875 // backtracking becomes decrementing the current position and checking the 876 // result against the stored current position. This is faster and saves 877 // space. 878 // * The current state is virtualized. 879 // This is used to defer expensive operations until it is clear that they 880 // are needed and to generate code for a node more than once, allowing 881 // specialized an efficient versions of the code to be created. This is 882 // explained in the section below. 883 // 884 // Execution state virtualization. 885 // 886 // Instead of emitting code, nodes that manipulate the state can record their 887 // manipulation in an object called the Trace. The Trace object can record a 888 // current position offset, an optional backtrack code location on the top of 889 // the virtualized backtrack stack and some register changes. When a node is 890 // to be emitted it can flush the Trace or update it. Flushing the Trace 891 // will emit code to bring the actual state into line with the virtual state. 892 // Avoiding flushing the state can postpone some work (e.g. updates of capture 893 // registers). Postponing work can save time when executing the regular 894 // expression since it may be found that the work never has to be done as a 895 // failure to match can occur. In addition it is much faster to jump to a 896 // known backtrack code location than it is to pop an unknown backtrack 897 // location from the stack and jump there. 898 // 899 // The virtual state found in the Trace affects code generation. For example 900 // the virtual state contains the difference between the actual current 901 // position and the virtual current position, and matching code needs to use 902 // this offset to attempt a match in the correct location of the input 903 // string. Therefore code generated for a non-trivial trace is specialized 904 // to that trace. The code generator therefore has the ability to generate 905 // code for each node several times. In order to limit the size of the 906 // generated code there is an arbitrary limit on how many specialized sets of 907 // code may be generated for a given node. If the limit is reached, the 908 // trace is flushed and a generic version of the code for a node is emitted. 909 // This is subsequently used for that node. The code emitted for non-generic 910 // trace is not recorded in the node and so it cannot currently be reused in 911 // the event that code generation is requested for an identical trace. 912 913 914 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 915 UNREACHABLE(); 916 } 917 918 919 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 920 text->AddElement(TextElement::Atom(this), zone); 921 } 922 923 924 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 925 text->AddElement(TextElement::CharClass(this), zone); 926 } 927 928 929 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 930 for (int i = 0; i < elements()->length(); i++) 931 text->AddElement(elements()->at(i), zone); 932 } 933 934 935 TextElement TextElement::Atom(RegExpAtom* atom) { 936 return TextElement(ATOM, atom); 937 } 938 939 940 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 941 return TextElement(CHAR_CLASS, char_class); 942 } 943 944 945 int TextElement::length() const { 946 switch (text_type()) { 947 case ATOM: 948 return atom()->length(); 949 950 case CHAR_CLASS: 951 return 1; 952 } 953 UNREACHABLE(); 954 return 0; 955 } 956 957 958 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 959 if (table_ == NULL) { 960 table_ = new(zone()) DispatchTable(zone()); 961 DispatchTableConstructor cons(table_, ignore_case, zone()); 962 cons.BuildTable(this); 963 } 964 return table_; 965 } 966 967 968 class FrequencyCollator { 969 public: 970 FrequencyCollator() : total_samples_(0) { 971 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 972 frequencies_[i] = CharacterFrequency(i); 973 } 974 } 975 976 void CountCharacter(int character) { 977 int index = (character & RegExpMacroAssembler::kTableMask); 978 frequencies_[index].Increment(); 979 total_samples_++; 980 } 981 982 // Does not measure in percent, but rather per-128 (the table size from the 983 // regexp macro assembler). 984 int Frequency(int in_character) { 985 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); 986 if (total_samples_ < 1) return 1; // Division by zero. 987 int freq_in_per128 = 988 (frequencies_[in_character].counter() * 128) / total_samples_; 989 return freq_in_per128; 990 } 991 992 private: 993 class CharacterFrequency { 994 public: 995 CharacterFrequency() : counter_(0), character_(-1) { } 996 explicit CharacterFrequency(int character) 997 : counter_(0), character_(character) { } 998 999 void Increment() { counter_++; } 1000 int counter() { return counter_; } 1001 int character() { return character_; } 1002 1003 private: 1004 int counter_; 1005 int character_; 1006 }; 1007 1008 1009 private: 1010 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 1011 int total_samples_; 1012 }; 1013 1014 1015 class RegExpCompiler { 1016 public: 1017 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, 1018 Zone* zone); 1019 1020 int AllocateRegister() { 1021 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 1022 reg_exp_too_big_ = true; 1023 return next_register_; 1024 } 1025 return next_register_++; 1026 } 1027 1028 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 1029 RegExpNode* start, 1030 int capture_count, 1031 Handle<String> pattern); 1032 1033 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 1034 1035 static const int kImplementationOffset = 0; 1036 static const int kNumberOfRegistersOffset = 0; 1037 static const int kCodeOffset = 1; 1038 1039 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1040 EndNode* accept() { return accept_; } 1041 1042 static const int kMaxRecursion = 100; 1043 inline int recursion_depth() { return recursion_depth_; } 1044 inline void IncrementRecursionDepth() { recursion_depth_++; } 1045 inline void DecrementRecursionDepth() { recursion_depth_--; } 1046 1047 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1048 1049 inline bool ignore_case() { return ignore_case_; } 1050 inline bool ascii() { return ascii_; } 1051 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1052 1053 int current_expansion_factor() { return current_expansion_factor_; } 1054 void set_current_expansion_factor(int value) { 1055 current_expansion_factor_ = value; 1056 } 1057 1058 Zone* zone() const { return zone_; } 1059 1060 static const int kNoRegister = -1; 1061 1062 private: 1063 EndNode* accept_; 1064 int next_register_; 1065 List<RegExpNode*>* work_list_; 1066 int recursion_depth_; 1067 RegExpMacroAssembler* macro_assembler_; 1068 bool ignore_case_; 1069 bool ascii_; 1070 bool reg_exp_too_big_; 1071 int current_expansion_factor_; 1072 FrequencyCollator frequency_collator_; 1073 Zone* zone_; 1074 }; 1075 1076 1077 class RecursionCheck { 1078 public: 1079 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1080 compiler->IncrementRecursionDepth(); 1081 } 1082 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1083 private: 1084 RegExpCompiler* compiler_; 1085 }; 1086 1087 1088 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1089 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1090 } 1091 1092 1093 // Attempts to compile the regexp using an Irregexp code generator. Returns 1094 // a fixed array or a null handle depending on whether it succeeded. 1095 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii, 1096 Zone* zone) 1097 : next_register_(2 * (capture_count + 1)), 1098 work_list_(NULL), 1099 recursion_depth_(0), 1100 ignore_case_(ignore_case), 1101 ascii_(ascii), 1102 reg_exp_too_big_(false), 1103 current_expansion_factor_(1), 1104 frequency_collator_(), 1105 zone_(zone) { 1106 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1107 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1108 } 1109 1110 1111 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1112 RegExpMacroAssembler* macro_assembler, 1113 RegExpNode* start, 1114 int capture_count, 1115 Handle<String> pattern) { 1116 Heap* heap = pattern->GetHeap(); 1117 1118 bool use_slow_safe_regexp_compiler = false; 1119 if (heap->total_regexp_code_generated() > 1120 RegExpImpl::kRegWxpCompiledLimit && 1121 heap->isolate()->memory_allocator()->SizeExecutable() > 1122 RegExpImpl::kRegExpExecutableMemoryLimit) { 1123 use_slow_safe_regexp_compiler = true; 1124 } 1125 1126 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 1127 1128 #ifdef DEBUG 1129 if (FLAG_trace_regexp_assembler) 1130 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 1131 else 1132 #endif 1133 macro_assembler_ = macro_assembler; 1134 1135 List <RegExpNode*> work_list(0); 1136 work_list_ = &work_list; 1137 Label fail; 1138 macro_assembler_->PushBacktrack(&fail); 1139 Trace new_trace; 1140 start->Emit(this, &new_trace); 1141 macro_assembler_->Bind(&fail); 1142 macro_assembler_->Fail(); 1143 while (!work_list.is_empty()) { 1144 work_list.RemoveLast()->Emit(this, &new_trace); 1145 } 1146 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate()); 1147 1148 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1149 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1150 work_list_ = NULL; 1151 #ifdef DEBUG 1152 if (FLAG_print_code) { 1153 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer()); 1154 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString(), 1155 trace_scope.file()); 1156 } 1157 if (FLAG_trace_regexp_assembler) { 1158 delete macro_assembler_; 1159 } 1160 #endif 1161 return RegExpEngine::CompilationResult(*code, next_register_); 1162 } 1163 1164 1165 bool Trace::DeferredAction::Mentions(int that) { 1166 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1167 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1168 return range.Contains(that); 1169 } else { 1170 return reg() == that; 1171 } 1172 } 1173 1174 1175 bool Trace::mentions_reg(int reg) { 1176 for (DeferredAction* action = actions_; 1177 action != NULL; 1178 action = action->next()) { 1179 if (action->Mentions(reg)) 1180 return true; 1181 } 1182 return false; 1183 } 1184 1185 1186 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1187 ASSERT_EQ(0, *cp_offset); 1188 for (DeferredAction* action = actions_; 1189 action != NULL; 1190 action = action->next()) { 1191 if (action->Mentions(reg)) { 1192 if (action->action_type() == ActionNode::STORE_POSITION) { 1193 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1194 return true; 1195 } else { 1196 return false; 1197 } 1198 } 1199 } 1200 return false; 1201 } 1202 1203 1204 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1205 Zone* zone) { 1206 int max_register = RegExpCompiler::kNoRegister; 1207 for (DeferredAction* action = actions_; 1208 action != NULL; 1209 action = action->next()) { 1210 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1211 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1212 for (int i = range.from(); i <= range.to(); i++) 1213 affected_registers->Set(i, zone); 1214 if (range.to() > max_register) max_register = range.to(); 1215 } else { 1216 affected_registers->Set(action->reg(), zone); 1217 if (action->reg() > max_register) max_register = action->reg(); 1218 } 1219 } 1220 return max_register; 1221 } 1222 1223 1224 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1225 int max_register, 1226 OutSet& registers_to_pop, 1227 OutSet& registers_to_clear) { 1228 for (int reg = max_register; reg >= 0; reg--) { 1229 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg); 1230 else if (registers_to_clear.Get(reg)) { 1231 int clear_to = reg; 1232 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1233 reg--; 1234 } 1235 assembler->ClearRegisters(reg, clear_to); 1236 } 1237 } 1238 } 1239 1240 1241 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1242 int max_register, 1243 OutSet& affected_registers, 1244 OutSet* registers_to_pop, 1245 OutSet* registers_to_clear, 1246 Zone* zone) { 1247 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1248 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1249 1250 // Count pushes performed to force a stack limit check occasionally. 1251 int pushes = 0; 1252 1253 for (int reg = 0; reg <= max_register; reg++) { 1254 if (!affected_registers.Get(reg)) { 1255 continue; 1256 } 1257 1258 // The chronologically first deferred action in the trace 1259 // is used to infer the action needed to restore a register 1260 // to its previous state (or not, if it's safe to ignore it). 1261 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1262 DeferredActionUndoType undo_action = IGNORE; 1263 1264 int value = 0; 1265 bool absolute = false; 1266 bool clear = false; 1267 int store_position = -1; 1268 // This is a little tricky because we are scanning the actions in reverse 1269 // historical order (newest first). 1270 for (DeferredAction* action = actions_; 1271 action != NULL; 1272 action = action->next()) { 1273 if (action->Mentions(reg)) { 1274 switch (action->action_type()) { 1275 case ActionNode::SET_REGISTER: { 1276 Trace::DeferredSetRegister* psr = 1277 static_cast<Trace::DeferredSetRegister*>(action); 1278 if (!absolute) { 1279 value += psr->value(); 1280 absolute = true; 1281 } 1282 // SET_REGISTER is currently only used for newly introduced loop 1283 // counters. They can have a significant previous value if they 1284 // occour in a loop. TODO(lrn): Propagate this information, so 1285 // we can set undo_action to IGNORE if we know there is no value to 1286 // restore. 1287 undo_action = RESTORE; 1288 ASSERT_EQ(store_position, -1); 1289 ASSERT(!clear); 1290 break; 1291 } 1292 case ActionNode::INCREMENT_REGISTER: 1293 if (!absolute) { 1294 value++; 1295 } 1296 ASSERT_EQ(store_position, -1); 1297 ASSERT(!clear); 1298 undo_action = RESTORE; 1299 break; 1300 case ActionNode::STORE_POSITION: { 1301 Trace::DeferredCapture* pc = 1302 static_cast<Trace::DeferredCapture*>(action); 1303 if (!clear && store_position == -1) { 1304 store_position = pc->cp_offset(); 1305 } 1306 1307 // For captures we know that stores and clears alternate. 1308 // Other register, are never cleared, and if the occur 1309 // inside a loop, they might be assigned more than once. 1310 if (reg <= 1) { 1311 // Registers zero and one, aka "capture zero", is 1312 // always set correctly if we succeed. There is no 1313 // need to undo a setting on backtrack, because we 1314 // will set it again or fail. 1315 undo_action = IGNORE; 1316 } else { 1317 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1318 } 1319 ASSERT(!absolute); 1320 ASSERT_EQ(value, 0); 1321 break; 1322 } 1323 case ActionNode::CLEAR_CAPTURES: { 1324 // Since we're scanning in reverse order, if we've already 1325 // set the position we have to ignore historically earlier 1326 // clearing operations. 1327 if (store_position == -1) { 1328 clear = true; 1329 } 1330 undo_action = RESTORE; 1331 ASSERT(!absolute); 1332 ASSERT_EQ(value, 0); 1333 break; 1334 } 1335 default: 1336 UNREACHABLE(); 1337 break; 1338 } 1339 } 1340 } 1341 // Prepare for the undo-action (e.g., push if it's going to be popped). 1342 if (undo_action == RESTORE) { 1343 pushes++; 1344 RegExpMacroAssembler::StackCheckFlag stack_check = 1345 RegExpMacroAssembler::kNoStackLimitCheck; 1346 if (pushes == push_limit) { 1347 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1348 pushes = 0; 1349 } 1350 1351 assembler->PushRegister(reg, stack_check); 1352 registers_to_pop->Set(reg, zone); 1353 } else if (undo_action == CLEAR) { 1354 registers_to_clear->Set(reg, zone); 1355 } 1356 // Perform the chronologically last action (or accumulated increment) 1357 // for the register. 1358 if (store_position != -1) { 1359 assembler->WriteCurrentPositionToRegister(reg, store_position); 1360 } else if (clear) { 1361 assembler->ClearRegisters(reg, reg); 1362 } else if (absolute) { 1363 assembler->SetRegister(reg, value); 1364 } else if (value != 0) { 1365 assembler->AdvanceRegister(reg, value); 1366 } 1367 } 1368 } 1369 1370 1371 // This is called as we come into a loop choice node and some other tricky 1372 // nodes. It normalizes the state of the code generator to ensure we can 1373 // generate generic code. 1374 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1375 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1376 1377 ASSERT(!is_trivial()); 1378 1379 if (actions_ == NULL && backtrack() == NULL) { 1380 // Here we just have some deferred cp advances to fix and we are back to 1381 // a normal situation. We may also have to forget some information gained 1382 // through a quick check that was already performed. 1383 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1384 // Create a new trivial state and generate the node with that. 1385 Trace new_state; 1386 successor->Emit(compiler, &new_state); 1387 return; 1388 } 1389 1390 // Generate deferred actions here along with code to undo them again. 1391 OutSet affected_registers; 1392 1393 if (backtrack() != NULL) { 1394 // Here we have a concrete backtrack location. These are set up by choice 1395 // nodes and so they indicate that we have a deferred save of the current 1396 // position which we may need to emit here. 1397 assembler->PushCurrentPosition(); 1398 } 1399 1400 int max_register = FindAffectedRegisters(&affected_registers, 1401 compiler->zone()); 1402 OutSet registers_to_pop; 1403 OutSet registers_to_clear; 1404 PerformDeferredActions(assembler, 1405 max_register, 1406 affected_registers, 1407 ®isters_to_pop, 1408 ®isters_to_clear, 1409 compiler->zone()); 1410 if (cp_offset_ != 0) { 1411 assembler->AdvanceCurrentPosition(cp_offset_); 1412 } 1413 1414 // Create a new trivial state and generate the node with that. 1415 Label undo; 1416 assembler->PushBacktrack(&undo); 1417 Trace new_state; 1418 successor->Emit(compiler, &new_state); 1419 1420 // On backtrack we need to restore state. 1421 assembler->Bind(&undo); 1422 RestoreAffectedRegisters(assembler, 1423 max_register, 1424 registers_to_pop, 1425 registers_to_clear); 1426 if (backtrack() == NULL) { 1427 assembler->Backtrack(); 1428 } else { 1429 assembler->PopCurrentPosition(); 1430 assembler->GoTo(backtrack()); 1431 } 1432 } 1433 1434 1435 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1436 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1437 1438 // Omit flushing the trace. We discard the entire stack frame anyway. 1439 1440 if (!label()->is_bound()) { 1441 // We are completely independent of the trace, since we ignore it, 1442 // so this code can be used as the generic version. 1443 assembler->Bind(label()); 1444 } 1445 1446 // Throw away everything on the backtrack stack since the start 1447 // of the negative submatch and restore the character position. 1448 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1449 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1450 if (clear_capture_count_ > 0) { 1451 // Clear any captures that might have been performed during the success 1452 // of the body of the negative look-ahead. 1453 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1454 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1455 } 1456 // Now that we have unwound the stack we find at the top of the stack the 1457 // backtrack that the BeginSubmatch node got. 1458 assembler->Backtrack(); 1459 } 1460 1461 1462 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1463 if (!trace->is_trivial()) { 1464 trace->Flush(compiler, this); 1465 return; 1466 } 1467 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1468 if (!label()->is_bound()) { 1469 assembler->Bind(label()); 1470 } 1471 switch (action_) { 1472 case ACCEPT: 1473 assembler->Succeed(); 1474 return; 1475 case BACKTRACK: 1476 assembler->GoTo(trace->backtrack()); 1477 return; 1478 case NEGATIVE_SUBMATCH_SUCCESS: 1479 // This case is handled in a different virtual method. 1480 UNREACHABLE(); 1481 } 1482 UNIMPLEMENTED(); 1483 } 1484 1485 1486 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1487 if (guards_ == NULL) 1488 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1489 guards_->Add(guard, zone); 1490 } 1491 1492 1493 ActionNode* ActionNode::SetRegister(int reg, 1494 int val, 1495 RegExpNode* on_success) { 1496 ActionNode* result = 1497 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1498 result->data_.u_store_register.reg = reg; 1499 result->data_.u_store_register.value = val; 1500 return result; 1501 } 1502 1503 1504 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1505 ActionNode* result = 1506 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1507 result->data_.u_increment_register.reg = reg; 1508 return result; 1509 } 1510 1511 1512 ActionNode* ActionNode::StorePosition(int reg, 1513 bool is_capture, 1514 RegExpNode* on_success) { 1515 ActionNode* result = 1516 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1517 result->data_.u_position_register.reg = reg; 1518 result->data_.u_position_register.is_capture = is_capture; 1519 return result; 1520 } 1521 1522 1523 ActionNode* ActionNode::ClearCaptures(Interval range, 1524 RegExpNode* on_success) { 1525 ActionNode* result = 1526 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1527 result->data_.u_clear_captures.range_from = range.from(); 1528 result->data_.u_clear_captures.range_to = range.to(); 1529 return result; 1530 } 1531 1532 1533 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1534 int position_reg, 1535 RegExpNode* on_success) { 1536 ActionNode* result = 1537 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1538 result->data_.u_submatch.stack_pointer_register = stack_reg; 1539 result->data_.u_submatch.current_position_register = position_reg; 1540 return result; 1541 } 1542 1543 1544 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1545 int position_reg, 1546 int clear_register_count, 1547 int clear_register_from, 1548 RegExpNode* on_success) { 1549 ActionNode* result = 1550 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1551 result->data_.u_submatch.stack_pointer_register = stack_reg; 1552 result->data_.u_submatch.current_position_register = position_reg; 1553 result->data_.u_submatch.clear_register_count = clear_register_count; 1554 result->data_.u_submatch.clear_register_from = clear_register_from; 1555 return result; 1556 } 1557 1558 1559 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1560 int repetition_register, 1561 int repetition_limit, 1562 RegExpNode* on_success) { 1563 ActionNode* result = 1564 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1565 result->data_.u_empty_match_check.start_register = start_register; 1566 result->data_.u_empty_match_check.repetition_register = repetition_register; 1567 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1568 return result; 1569 } 1570 1571 1572 #define DEFINE_ACCEPT(Type) \ 1573 void Type##Node::Accept(NodeVisitor* visitor) { \ 1574 visitor->Visit##Type(this); \ 1575 } 1576 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1577 #undef DEFINE_ACCEPT 1578 1579 1580 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1581 visitor->VisitLoopChoice(this); 1582 } 1583 1584 1585 // ------------------------------------------------------------------- 1586 // Emit code. 1587 1588 1589 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1590 Guard* guard, 1591 Trace* trace) { 1592 switch (guard->op()) { 1593 case Guard::LT: 1594 ASSERT(!trace->mentions_reg(guard->reg())); 1595 macro_assembler->IfRegisterGE(guard->reg(), 1596 guard->value(), 1597 trace->backtrack()); 1598 break; 1599 case Guard::GEQ: 1600 ASSERT(!trace->mentions_reg(guard->reg())); 1601 macro_assembler->IfRegisterLT(guard->reg(), 1602 guard->value(), 1603 trace->backtrack()); 1604 break; 1605 } 1606 } 1607 1608 1609 // Returns the number of characters in the equivalence class, omitting those 1610 // that cannot occur in the source string because it is ASCII. 1611 static int GetCaseIndependentLetters(Isolate* isolate, 1612 uc16 character, 1613 bool ascii_subject, 1614 unibrow::uchar* letters) { 1615 int length = 1616 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1617 // Unibrow returns 0 or 1 for characters where case independence is 1618 // trivial. 1619 if (length == 0) { 1620 letters[0] = character; 1621 length = 1; 1622 } 1623 if (!ascii_subject || character <= String::kMaxOneByteCharCode) { 1624 return length; 1625 } 1626 // The standard requires that non-ASCII characters cannot have ASCII 1627 // character codes in their equivalence class. 1628 return 0; 1629 } 1630 1631 1632 static inline bool EmitSimpleCharacter(Isolate* isolate, 1633 RegExpCompiler* compiler, 1634 uc16 c, 1635 Label* on_failure, 1636 int cp_offset, 1637 bool check, 1638 bool preloaded) { 1639 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1640 bool bound_checked = false; 1641 if (!preloaded) { 1642 assembler->LoadCurrentCharacter( 1643 cp_offset, 1644 on_failure, 1645 check); 1646 bound_checked = true; 1647 } 1648 assembler->CheckNotCharacter(c, on_failure); 1649 return bound_checked; 1650 } 1651 1652 1653 // Only emits non-letters (things that don't have case). Only used for case 1654 // independent matches. 1655 static inline bool EmitAtomNonLetter(Isolate* isolate, 1656 RegExpCompiler* compiler, 1657 uc16 c, 1658 Label* on_failure, 1659 int cp_offset, 1660 bool check, 1661 bool preloaded) { 1662 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1663 bool ascii = compiler->ascii(); 1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1665 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1666 if (length < 1) { 1667 // This can't match. Must be an ASCII subject and a non-ASCII character. 1668 // We do not need to do anything since the ASCII pass already handled this. 1669 return false; // Bounds not checked. 1670 } 1671 bool checked = false; 1672 // We handle the length > 1 case in a later pass. 1673 if (length == 1) { 1674 if (ascii && c > String::kMaxOneByteCharCodeU) { 1675 // Can't match - see above. 1676 return false; // Bounds not checked. 1677 } 1678 if (!preloaded) { 1679 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1680 checked = check; 1681 } 1682 macro_assembler->CheckNotCharacter(c, on_failure); 1683 } 1684 return checked; 1685 } 1686 1687 1688 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1689 bool ascii, 1690 uc16 c1, 1691 uc16 c2, 1692 Label* on_failure) { 1693 uc16 char_mask; 1694 if (ascii) { 1695 char_mask = String::kMaxOneByteCharCode; 1696 } else { 1697 char_mask = String::kMaxUtf16CodeUnit; 1698 } 1699 uc16 exor = c1 ^ c2; 1700 // Check whether exor has only one bit set. 1701 if (((exor - 1) & exor) == 0) { 1702 // If c1 and c2 differ only by one bit. 1703 // Ecma262UnCanonicalize always gives the highest number last. 1704 ASSERT(c2 > c1); 1705 uc16 mask = char_mask ^ exor; 1706 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1707 return true; 1708 } 1709 ASSERT(c2 > c1); 1710 uc16 diff = c2 - c1; 1711 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1712 // If the characters differ by 2^n but don't differ by one bit then 1713 // subtract the difference from the found character, then do the or 1714 // trick. We avoid the theoretical case where negative numbers are 1715 // involved in order to simplify code generation. 1716 uc16 mask = char_mask ^ diff; 1717 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1718 diff, 1719 mask, 1720 on_failure); 1721 return true; 1722 } 1723 return false; 1724 } 1725 1726 1727 typedef bool EmitCharacterFunction(Isolate* isolate, 1728 RegExpCompiler* compiler, 1729 uc16 c, 1730 Label* on_failure, 1731 int cp_offset, 1732 bool check, 1733 bool preloaded); 1734 1735 // Only emits letters (things that have case). Only used for case independent 1736 // matches. 1737 static inline bool EmitAtomLetter(Isolate* isolate, 1738 RegExpCompiler* compiler, 1739 uc16 c, 1740 Label* on_failure, 1741 int cp_offset, 1742 bool check, 1743 bool preloaded) { 1744 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1745 bool ascii = compiler->ascii(); 1746 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1747 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1748 if (length <= 1) return false; 1749 // We may not need to check against the end of the input string 1750 // if this character lies before a character that matched. 1751 if (!preloaded) { 1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1753 } 1754 Label ok; 1755 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1756 switch (length) { 1757 case 2: { 1758 if (ShortCutEmitCharacterPair(macro_assembler, 1759 ascii, 1760 chars[0], 1761 chars[1], 1762 on_failure)) { 1763 } else { 1764 macro_assembler->CheckCharacter(chars[0], &ok); 1765 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1766 macro_assembler->Bind(&ok); 1767 } 1768 break; 1769 } 1770 case 4: 1771 macro_assembler->CheckCharacter(chars[3], &ok); 1772 // Fall through! 1773 case 3: 1774 macro_assembler->CheckCharacter(chars[0], &ok); 1775 macro_assembler->CheckCharacter(chars[1], &ok); 1776 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1777 macro_assembler->Bind(&ok); 1778 break; 1779 default: 1780 UNREACHABLE(); 1781 break; 1782 } 1783 return true; 1784 } 1785 1786 1787 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1788 int border, 1789 Label* fall_through, 1790 Label* above_or_equal, 1791 Label* below) { 1792 if (below != fall_through) { 1793 masm->CheckCharacterLT(border, below); 1794 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1795 } else { 1796 masm->CheckCharacterGT(border - 1, above_or_equal); 1797 } 1798 } 1799 1800 1801 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1802 int first, 1803 int last, 1804 Label* fall_through, 1805 Label* in_range, 1806 Label* out_of_range) { 1807 if (in_range == fall_through) { 1808 if (first == last) { 1809 masm->CheckNotCharacter(first, out_of_range); 1810 } else { 1811 masm->CheckCharacterNotInRange(first, last, out_of_range); 1812 } 1813 } else { 1814 if (first == last) { 1815 masm->CheckCharacter(first, in_range); 1816 } else { 1817 masm->CheckCharacterInRange(first, last, in_range); 1818 } 1819 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1820 } 1821 } 1822 1823 1824 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1825 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1826 static void EmitUseLookupTable( 1827 RegExpMacroAssembler* masm, 1828 ZoneList<int>* ranges, 1829 int start_index, 1830 int end_index, 1831 int min_char, 1832 Label* fall_through, 1833 Label* even_label, 1834 Label* odd_label) { 1835 static const int kSize = RegExpMacroAssembler::kTableSize; 1836 static const int kMask = RegExpMacroAssembler::kTableMask; 1837 1838 int base = (min_char & ~kMask); 1839 USE(base); 1840 1841 // Assert that everything is on one kTableSize page. 1842 for (int i = start_index; i <= end_index; i++) { 1843 ASSERT_EQ(ranges->at(i) & ~kMask, base); 1844 } 1845 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1846 1847 char templ[kSize]; 1848 Label* on_bit_set; 1849 Label* on_bit_clear; 1850 int bit; 1851 if (even_label == fall_through) { 1852 on_bit_set = odd_label; 1853 on_bit_clear = even_label; 1854 bit = 1; 1855 } else { 1856 on_bit_set = even_label; 1857 on_bit_clear = odd_label; 1858 bit = 0; 1859 } 1860 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1861 templ[i] = bit; 1862 } 1863 int j = 0; 1864 bit ^= 1; 1865 for (int i = start_index; i < end_index; i++) { 1866 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1867 templ[j] = bit; 1868 } 1869 bit ^= 1; 1870 } 1871 for (int i = j; i < kSize; i++) { 1872 templ[i] = bit; 1873 } 1874 Factory* factory = masm->zone()->isolate()->factory(); 1875 // TODO(erikcorry): Cache these. 1876 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1877 for (int i = 0; i < kSize; i++) { 1878 ba->set(i, templ[i]); 1879 } 1880 masm->CheckBitInTable(ba, on_bit_set); 1881 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1882 } 1883 1884 1885 static void CutOutRange(RegExpMacroAssembler* masm, 1886 ZoneList<int>* ranges, 1887 int start_index, 1888 int end_index, 1889 int cut_index, 1890 Label* even_label, 1891 Label* odd_label) { 1892 bool odd = (((cut_index - start_index) & 1) == 1); 1893 Label* in_range_label = odd ? odd_label : even_label; 1894 Label dummy; 1895 EmitDoubleBoundaryTest(masm, 1896 ranges->at(cut_index), 1897 ranges->at(cut_index + 1) - 1, 1898 &dummy, 1899 in_range_label, 1900 &dummy); 1901 ASSERT(!dummy.is_linked()); 1902 // Cut out the single range by rewriting the array. This creates a new 1903 // range that is a merger of the two ranges on either side of the one we 1904 // are cutting out. The oddity of the labels is preserved. 1905 for (int j = cut_index; j > start_index; j--) { 1906 ranges->at(j) = ranges->at(j - 1); 1907 } 1908 for (int j = cut_index + 1; j < end_index; j++) { 1909 ranges->at(j) = ranges->at(j + 1); 1910 } 1911 } 1912 1913 1914 // Unicode case. Split the search space into kSize spaces that are handled 1915 // with recursion. 1916 static void SplitSearchSpace(ZoneList<int>* ranges, 1917 int start_index, 1918 int end_index, 1919 int* new_start_index, 1920 int* new_end_index, 1921 int* border) { 1922 static const int kSize = RegExpMacroAssembler::kTableSize; 1923 static const int kMask = RegExpMacroAssembler::kTableMask; 1924 1925 int first = ranges->at(start_index); 1926 int last = ranges->at(end_index) - 1; 1927 1928 *new_start_index = start_index; 1929 *border = (ranges->at(start_index) & ~kMask) + kSize; 1930 while (*new_start_index < end_index) { 1931 if (ranges->at(*new_start_index) > *border) break; 1932 (*new_start_index)++; 1933 } 1934 // new_start_index is the index of the first edge that is beyond the 1935 // current kSize space. 1936 1937 // For very large search spaces we do a binary chop search of the non-ASCII 1938 // space instead of just going to the end of the current kSize space. The 1939 // heuristics are complicated a little by the fact that any 128-character 1940 // encoding space can be quickly tested with a table lookup, so we don't 1941 // wish to do binary chop search at a smaller granularity than that. A 1942 // 128-character space can take up a lot of space in the ranges array if, 1943 // for example, we only want to match every second character (eg. the lower 1944 // case characters on some Unicode pages). 1945 int binary_chop_index = (end_index + start_index) / 2; 1946 // The first test ensures that we get to the code that handles the ASCII 1947 // range with a single not-taken branch, speeding up this important 1948 // character range (even non-ASCII charset-based text has spaces and 1949 // punctuation). 1950 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. 1951 end_index - start_index > (*new_start_index - start_index) * 2 && 1952 last - first > kSize * 2 && 1953 binary_chop_index > *new_start_index && 1954 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1955 int scan_forward_for_section_border = binary_chop_index;; 1956 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1957 1958 while (scan_forward_for_section_border < end_index) { 1959 if (ranges->at(scan_forward_for_section_border) > new_border) { 1960 *new_start_index = scan_forward_for_section_border; 1961 *border = new_border; 1962 break; 1963 } 1964 scan_forward_for_section_border++; 1965 } 1966 } 1967 1968 ASSERT(*new_start_index > start_index); 1969 *new_end_index = *new_start_index - 1; 1970 if (ranges->at(*new_end_index) == *border) { 1971 (*new_end_index)--; 1972 } 1973 if (*border >= ranges->at(end_index)) { 1974 *border = ranges->at(end_index); 1975 *new_start_index = end_index; // Won't be used. 1976 *new_end_index = end_index - 1; 1977 } 1978 } 1979 1980 1981 // Gets a series of segment boundaries representing a character class. If the 1982 // character is in the range between an even and an odd boundary (counting from 1983 // start_index) then go to even_label, otherwise go to odd_label. We already 1984 // know that the character is in the range of min_char to max_char inclusive. 1985 // Either label can be NULL indicating backtracking. Either label can also be 1986 // equal to the fall_through label. 1987 static void GenerateBranches(RegExpMacroAssembler* masm, 1988 ZoneList<int>* ranges, 1989 int start_index, 1990 int end_index, 1991 uc16 min_char, 1992 uc16 max_char, 1993 Label* fall_through, 1994 Label* even_label, 1995 Label* odd_label) { 1996 int first = ranges->at(start_index); 1997 int last = ranges->at(end_index) - 1; 1998 1999 ASSERT_LT(min_char, first); 2000 2001 // Just need to test if the character is before or on-or-after 2002 // a particular character. 2003 if (start_index == end_index) { 2004 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 2005 return; 2006 } 2007 2008 // Another almost trivial case: There is one interval in the middle that is 2009 // different from the end intervals. 2010 if (start_index + 1 == end_index) { 2011 EmitDoubleBoundaryTest( 2012 masm, first, last, fall_through, even_label, odd_label); 2013 return; 2014 } 2015 2016 // It's not worth using table lookup if there are very few intervals in the 2017 // character class. 2018 if (end_index - start_index <= 6) { 2019 // It is faster to test for individual characters, so we look for those 2020 // first, then try arbitrary ranges in the second round. 2021 static int kNoCutIndex = -1; 2022 int cut = kNoCutIndex; 2023 for (int i = start_index; i < end_index; i++) { 2024 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2025 cut = i; 2026 break; 2027 } 2028 } 2029 if (cut == kNoCutIndex) cut = start_index; 2030 CutOutRange( 2031 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2032 ASSERT_GE(end_index - start_index, 2); 2033 GenerateBranches(masm, 2034 ranges, 2035 start_index + 1, 2036 end_index - 1, 2037 min_char, 2038 max_char, 2039 fall_through, 2040 even_label, 2041 odd_label); 2042 return; 2043 } 2044 2045 // If there are a lot of intervals in the regexp, then we will use tables to 2046 // determine whether the character is inside or outside the character class. 2047 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2048 2049 if ((max_char >> kBits) == (min_char >> kBits)) { 2050 EmitUseLookupTable(masm, 2051 ranges, 2052 start_index, 2053 end_index, 2054 min_char, 2055 fall_through, 2056 even_label, 2057 odd_label); 2058 return; 2059 } 2060 2061 if ((min_char >> kBits) != (first >> kBits)) { 2062 masm->CheckCharacterLT(first, odd_label); 2063 GenerateBranches(masm, 2064 ranges, 2065 start_index + 1, 2066 end_index, 2067 first, 2068 max_char, 2069 fall_through, 2070 odd_label, 2071 even_label); 2072 return; 2073 } 2074 2075 int new_start_index = 0; 2076 int new_end_index = 0; 2077 int border = 0; 2078 2079 SplitSearchSpace(ranges, 2080 start_index, 2081 end_index, 2082 &new_start_index, 2083 &new_end_index, 2084 &border); 2085 2086 Label handle_rest; 2087 Label* above = &handle_rest; 2088 if (border == last + 1) { 2089 // We didn't find any section that started after the limit, so everything 2090 // above the border is one of the terminal labels. 2091 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2092 ASSERT(new_end_index == end_index - 1); 2093 } 2094 2095 ASSERT_LE(start_index, new_end_index); 2096 ASSERT_LE(new_start_index, end_index); 2097 ASSERT_LT(start_index, new_start_index); 2098 ASSERT_LT(new_end_index, end_index); 2099 ASSERT(new_end_index + 1 == new_start_index || 2100 (new_end_index + 2 == new_start_index && 2101 border == ranges->at(new_end_index + 1))); 2102 ASSERT_LT(min_char, border - 1); 2103 ASSERT_LT(border, max_char); 2104 ASSERT_LT(ranges->at(new_end_index), border); 2105 ASSERT(border < ranges->at(new_start_index) || 2106 (border == ranges->at(new_start_index) && 2107 new_start_index == end_index && 2108 new_end_index == end_index - 1 && 2109 border == last + 1)); 2110 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2111 2112 masm->CheckCharacterGT(border - 1, above); 2113 Label dummy; 2114 GenerateBranches(masm, 2115 ranges, 2116 start_index, 2117 new_end_index, 2118 min_char, 2119 border - 1, 2120 &dummy, 2121 even_label, 2122 odd_label); 2123 if (handle_rest.is_linked()) { 2124 masm->Bind(&handle_rest); 2125 bool flip = (new_start_index & 1) != (start_index & 1); 2126 GenerateBranches(masm, 2127 ranges, 2128 new_start_index, 2129 end_index, 2130 border, 2131 max_char, 2132 &dummy, 2133 flip ? odd_label : even_label, 2134 flip ? even_label : odd_label); 2135 } 2136 } 2137 2138 2139 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2140 RegExpCharacterClass* cc, 2141 bool ascii, 2142 Label* on_failure, 2143 int cp_offset, 2144 bool check_offset, 2145 bool preloaded, 2146 Zone* zone) { 2147 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2148 if (!CharacterRange::IsCanonical(ranges)) { 2149 CharacterRange::Canonicalize(ranges); 2150 } 2151 2152 int max_char; 2153 if (ascii) { 2154 max_char = String::kMaxOneByteCharCode; 2155 } else { 2156 max_char = String::kMaxUtf16CodeUnit; 2157 } 2158 2159 int range_count = ranges->length(); 2160 2161 int last_valid_range = range_count - 1; 2162 while (last_valid_range >= 0) { 2163 CharacterRange& range = ranges->at(last_valid_range); 2164 if (range.from() <= max_char) { 2165 break; 2166 } 2167 last_valid_range--; 2168 } 2169 2170 if (last_valid_range < 0) { 2171 if (!cc->is_negated()) { 2172 macro_assembler->GoTo(on_failure); 2173 } 2174 if (check_offset) { 2175 macro_assembler->CheckPosition(cp_offset, on_failure); 2176 } 2177 return; 2178 } 2179 2180 if (last_valid_range == 0 && 2181 ranges->at(0).IsEverything(max_char)) { 2182 if (cc->is_negated()) { 2183 macro_assembler->GoTo(on_failure); 2184 } else { 2185 // This is a common case hit by non-anchored expressions. 2186 if (check_offset) { 2187 macro_assembler->CheckPosition(cp_offset, on_failure); 2188 } 2189 } 2190 return; 2191 } 2192 if (last_valid_range == 0 && 2193 !cc->is_negated() && 2194 ranges->at(0).IsEverything(max_char)) { 2195 // This is a common case hit by non-anchored expressions. 2196 if (check_offset) { 2197 macro_assembler->CheckPosition(cp_offset, on_failure); 2198 } 2199 return; 2200 } 2201 2202 if (!preloaded) { 2203 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2204 } 2205 2206 if (cc->is_standard(zone) && 2207 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2208 on_failure)) { 2209 return; 2210 } 2211 2212 2213 // A new list with ascending entries. Each entry is a code unit 2214 // where there is a boundary between code units that are part of 2215 // the class and code units that are not. Normally we insert an 2216 // entry at zero which goes to the failure label, but if there 2217 // was already one there we fall through for success on that entry. 2218 // Subsequent entries have alternating meaning (success/failure). 2219 ZoneList<int>* range_boundaries = 2220 new(zone) ZoneList<int>(last_valid_range, zone); 2221 2222 bool zeroth_entry_is_failure = !cc->is_negated(); 2223 2224 for (int i = 0; i <= last_valid_range; i++) { 2225 CharacterRange& range = ranges->at(i); 2226 if (range.from() == 0) { 2227 ASSERT_EQ(i, 0); 2228 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2229 } else { 2230 range_boundaries->Add(range.from(), zone); 2231 } 2232 range_boundaries->Add(range.to() + 1, zone); 2233 } 2234 int end_index = range_boundaries->length() - 1; 2235 if (range_boundaries->at(end_index) > max_char) { 2236 end_index--; 2237 } 2238 2239 Label fall_through; 2240 GenerateBranches(macro_assembler, 2241 range_boundaries, 2242 0, // start_index. 2243 end_index, 2244 0, // min_char. 2245 max_char, 2246 &fall_through, 2247 zeroth_entry_is_failure ? &fall_through : on_failure, 2248 zeroth_entry_is_failure ? on_failure : &fall_through); 2249 macro_assembler->Bind(&fall_through); 2250 } 2251 2252 2253 RegExpNode::~RegExpNode() { 2254 } 2255 2256 2257 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2258 Trace* trace) { 2259 // If we are generating a greedy loop then don't stop and don't reuse code. 2260 if (trace->stop_node() != NULL) { 2261 return CONTINUE; 2262 } 2263 2264 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2265 if (trace->is_trivial()) { 2266 if (label_.is_bound()) { 2267 // We are being asked to generate a generic version, but that's already 2268 // been done so just go to it. 2269 macro_assembler->GoTo(&label_); 2270 return DONE; 2271 } 2272 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 2273 // To avoid too deep recursion we push the node to the work queue and just 2274 // generate a goto here. 2275 compiler->AddWork(this); 2276 macro_assembler->GoTo(&label_); 2277 return DONE; 2278 } 2279 // Generate generic version of the node and bind the label for later use. 2280 macro_assembler->Bind(&label_); 2281 return CONTINUE; 2282 } 2283 2284 // We are being asked to make a non-generic version. Keep track of how many 2285 // non-generic versions we generate so as not to overdo it. 2286 trace_count_++; 2287 if (FLAG_regexp_optimization && 2288 trace_count_ < kMaxCopiesCodeGenerated && 2289 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 2290 return CONTINUE; 2291 } 2292 2293 // If we get here code has been generated for this node too many times or 2294 // recursion is too deep. Time to switch to a generic version. The code for 2295 // generic versions above can handle deep recursion properly. 2296 trace->Flush(compiler, this); 2297 return DONE; 2298 } 2299 2300 2301 int ActionNode::EatsAtLeast(int still_to_find, 2302 int budget, 2303 bool not_at_start) { 2304 if (budget <= 0) return 0; 2305 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2306 return on_success()->EatsAtLeast(still_to_find, 2307 budget - 1, 2308 not_at_start); 2309 } 2310 2311 2312 void ActionNode::FillInBMInfo(int offset, 2313 int budget, 2314 BoyerMooreLookahead* bm, 2315 bool not_at_start) { 2316 if (action_type_ == BEGIN_SUBMATCH) { 2317 bm->SetRest(offset); 2318 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2319 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2320 } 2321 SaveBMInfo(bm, not_at_start, offset); 2322 } 2323 2324 2325 int AssertionNode::EatsAtLeast(int still_to_find, 2326 int budget, 2327 bool not_at_start) { 2328 if (budget <= 0) return 0; 2329 // If we know we are not at the start and we are asked "how many characters 2330 // will you match if you succeed?" then we can answer anything since false 2331 // implies false. So lets just return the max answer (still_to_find) since 2332 // that won't prevent us from preloading a lot of characters for the other 2333 // branches in the node graph. 2334 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2335 return on_success()->EatsAtLeast(still_to_find, 2336 budget - 1, 2337 not_at_start); 2338 } 2339 2340 2341 void AssertionNode::FillInBMInfo(int offset, 2342 int budget, 2343 BoyerMooreLookahead* bm, 2344 bool not_at_start) { 2345 // Match the behaviour of EatsAtLeast on this node. 2346 if (assertion_type() == AT_START && not_at_start) return; 2347 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2348 SaveBMInfo(bm, not_at_start, offset); 2349 } 2350 2351 2352 int BackReferenceNode::EatsAtLeast(int still_to_find, 2353 int budget, 2354 bool not_at_start) { 2355 if (budget <= 0) return 0; 2356 return on_success()->EatsAtLeast(still_to_find, 2357 budget - 1, 2358 not_at_start); 2359 } 2360 2361 2362 int TextNode::EatsAtLeast(int still_to_find, 2363 int budget, 2364 bool not_at_start) { 2365 int answer = Length(); 2366 if (answer >= still_to_find) return answer; 2367 if (budget <= 0) return answer; 2368 // We are not at start after this node so we set the last argument to 'true'. 2369 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2370 budget - 1, 2371 true); 2372 } 2373 2374 2375 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 2376 int budget, 2377 bool not_at_start) { 2378 if (budget <= 0) return 0; 2379 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2380 // afterwards. 2381 RegExpNode* node = alternatives_->at(1).node(); 2382 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2383 } 2384 2385 2386 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 2387 QuickCheckDetails* details, 2388 RegExpCompiler* compiler, 2389 int filled_in, 2390 bool not_at_start) { 2391 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2392 // afterwards. 2393 RegExpNode* node = alternatives_->at(1).node(); 2394 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2395 } 2396 2397 2398 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2399 int budget, 2400 RegExpNode* ignore_this_node, 2401 bool not_at_start) { 2402 if (budget <= 0) return 0; 2403 int min = 100; 2404 int choice_count = alternatives_->length(); 2405 budget = (budget - 1) / choice_count; 2406 for (int i = 0; i < choice_count; i++) { 2407 RegExpNode* node = alternatives_->at(i).node(); 2408 if (node == ignore_this_node) continue; 2409 int node_eats_at_least = 2410 node->EatsAtLeast(still_to_find, budget, not_at_start); 2411 if (node_eats_at_least < min) min = node_eats_at_least; 2412 if (min == 0) return 0; 2413 } 2414 return min; 2415 } 2416 2417 2418 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2419 int budget, 2420 bool not_at_start) { 2421 return EatsAtLeastHelper(still_to_find, 2422 budget - 1, 2423 loop_node_, 2424 not_at_start); 2425 } 2426 2427 2428 int ChoiceNode::EatsAtLeast(int still_to_find, 2429 int budget, 2430 bool not_at_start) { 2431 return EatsAtLeastHelper(still_to_find, 2432 budget, 2433 NULL, 2434 not_at_start); 2435 } 2436 2437 2438 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2439 static inline uint32_t SmearBitsRight(uint32_t v) { 2440 v |= v >> 1; 2441 v |= v >> 2; 2442 v |= v >> 4; 2443 v |= v >> 8; 2444 v |= v >> 16; 2445 return v; 2446 } 2447 2448 2449 bool QuickCheckDetails::Rationalize(bool asc) { 2450 bool found_useful_op = false; 2451 uint32_t char_mask; 2452 if (asc) { 2453 char_mask = String::kMaxOneByteCharCode; 2454 } else { 2455 char_mask = String::kMaxUtf16CodeUnit; 2456 } 2457 mask_ = 0; 2458 value_ = 0; 2459 int char_shift = 0; 2460 for (int i = 0; i < characters_; i++) { 2461 Position* pos = &positions_[i]; 2462 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2463 found_useful_op = true; 2464 } 2465 mask_ |= (pos->mask & char_mask) << char_shift; 2466 value_ |= (pos->value & char_mask) << char_shift; 2467 char_shift += asc ? 8 : 16; 2468 } 2469 return found_useful_op; 2470 } 2471 2472 2473 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2474 Trace* trace, 2475 bool preload_has_checked_bounds, 2476 Label* on_possible_success, 2477 QuickCheckDetails* details, 2478 bool fall_through_on_failure) { 2479 if (details->characters() == 0) return false; 2480 GetQuickCheckDetails( 2481 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2482 if (details->cannot_match()) return false; 2483 if (!details->Rationalize(compiler->ascii())) return false; 2484 ASSERT(details->characters() == 1 || 2485 compiler->macro_assembler()->CanReadUnaligned()); 2486 uint32_t mask = details->mask(); 2487 uint32_t value = details->value(); 2488 2489 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2490 2491 if (trace->characters_preloaded() != details->characters()) { 2492 assembler->LoadCurrentCharacter(trace->cp_offset(), 2493 trace->backtrack(), 2494 !preload_has_checked_bounds, 2495 details->characters()); 2496 } 2497 2498 2499 bool need_mask = true; 2500 2501 if (details->characters() == 1) { 2502 // If number of characters preloaded is 1 then we used a byte or 16 bit 2503 // load so the value is already masked down. 2504 uint32_t char_mask; 2505 if (compiler->ascii()) { 2506 char_mask = String::kMaxOneByteCharCode; 2507 } else { 2508 char_mask = String::kMaxUtf16CodeUnit; 2509 } 2510 if ((mask & char_mask) == char_mask) need_mask = false; 2511 mask &= char_mask; 2512 } else { 2513 // For 2-character preloads in ASCII mode or 1-character preloads in 2514 // TWO_BYTE mode we also use a 16 bit load with zero extend. 2515 if (details->characters() == 2 && compiler->ascii()) { 2516 if ((mask & 0xffff) == 0xffff) need_mask = false; 2517 } else if (details->characters() == 1 && !compiler->ascii()) { 2518 if ((mask & 0xffff) == 0xffff) need_mask = false; 2519 } else { 2520 if (mask == 0xffffffff) need_mask = false; 2521 } 2522 } 2523 2524 if (fall_through_on_failure) { 2525 if (need_mask) { 2526 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2527 } else { 2528 assembler->CheckCharacter(value, on_possible_success); 2529 } 2530 } else { 2531 if (need_mask) { 2532 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2533 } else { 2534 assembler->CheckNotCharacter(value, trace->backtrack()); 2535 } 2536 } 2537 return true; 2538 } 2539 2540 2541 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2542 // super-class in the .h file). 2543 // 2544 // We iterate along the text object, building up for each character a 2545 // mask and value that can be used to test for a quick failure to match. 2546 // The masks and values for the positions will be combined into a single 2547 // machine word for the current character width in order to be used in 2548 // generating a quick check. 2549 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2550 RegExpCompiler* compiler, 2551 int characters_filled_in, 2552 bool not_at_start) { 2553 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); 2554 ASSERT(characters_filled_in < details->characters()); 2555 int characters = details->characters(); 2556 int char_mask; 2557 if (compiler->ascii()) { 2558 char_mask = String::kMaxOneByteCharCode; 2559 } else { 2560 char_mask = String::kMaxUtf16CodeUnit; 2561 } 2562 for (int k = 0; k < elms_->length(); k++) { 2563 TextElement elm = elms_->at(k); 2564 if (elm.text_type() == TextElement::ATOM) { 2565 Vector<const uc16> quarks = elm.atom()->data(); 2566 for (int i = 0; i < characters && i < quarks.length(); i++) { 2567 QuickCheckDetails::Position* pos = 2568 details->positions(characters_filled_in); 2569 uc16 c = quarks[i]; 2570 if (c > char_mask) { 2571 // If we expect a non-ASCII character from an ASCII string, 2572 // there is no way we can match. Not even case independent 2573 // matching can turn an ASCII character into non-ASCII or 2574 // vice versa. 2575 details->set_cannot_match(); 2576 pos->determines_perfectly = false; 2577 return; 2578 } 2579 if (compiler->ignore_case()) { 2580 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2581 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 2582 chars); 2583 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 2584 if (length == 1) { 2585 // This letter has no case equivalents, so it's nice and simple 2586 // and the mask-compare will determine definitely whether we have 2587 // a match at this character position. 2588 pos->mask = char_mask; 2589 pos->value = c; 2590 pos->determines_perfectly = true; 2591 } else { 2592 uint32_t common_bits = char_mask; 2593 uint32_t bits = chars[0]; 2594 for (int j = 1; j < length; j++) { 2595 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2596 common_bits ^= differing_bits; 2597 bits &= common_bits; 2598 } 2599 // If length is 2 and common bits has only one zero in it then 2600 // our mask and compare instruction will determine definitely 2601 // whether we have a match at this character position. Otherwise 2602 // it can only be an approximate check. 2603 uint32_t one_zero = (common_bits | ~char_mask); 2604 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2605 pos->determines_perfectly = true; 2606 } 2607 pos->mask = common_bits; 2608 pos->value = bits; 2609 } 2610 } else { 2611 // Don't ignore case. Nice simple case where the mask-compare will 2612 // determine definitely whether we have a match at this character 2613 // position. 2614 pos->mask = char_mask; 2615 pos->value = c; 2616 pos->determines_perfectly = true; 2617 } 2618 characters_filled_in++; 2619 ASSERT(characters_filled_in <= details->characters()); 2620 if (characters_filled_in == details->characters()) { 2621 return; 2622 } 2623 } 2624 } else { 2625 QuickCheckDetails::Position* pos = 2626 details->positions(characters_filled_in); 2627 RegExpCharacterClass* tree = elm.char_class(); 2628 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2629 if (tree->is_negated()) { 2630 // A quick check uses multi-character mask and compare. There is no 2631 // useful way to incorporate a negative char class into this scheme 2632 // so we just conservatively create a mask and value that will always 2633 // succeed. 2634 pos->mask = 0; 2635 pos->value = 0; 2636 } else { 2637 int first_range = 0; 2638 while (ranges->at(first_range).from() > char_mask) { 2639 first_range++; 2640 if (first_range == ranges->length()) { 2641 details->set_cannot_match(); 2642 pos->determines_perfectly = false; 2643 return; 2644 } 2645 } 2646 CharacterRange range = ranges->at(first_range); 2647 uc16 from = range.from(); 2648 uc16 to = range.to(); 2649 if (to > char_mask) { 2650 to = char_mask; 2651 } 2652 uint32_t differing_bits = (from ^ to); 2653 // A mask and compare is only perfect if the differing bits form a 2654 // number like 00011111 with one single block of trailing 1s. 2655 if ((differing_bits & (differing_bits + 1)) == 0 && 2656 from + differing_bits == to) { 2657 pos->determines_perfectly = true; 2658 } 2659 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2660 uint32_t bits = (from & common_bits); 2661 for (int i = first_range + 1; i < ranges->length(); i++) { 2662 CharacterRange range = ranges->at(i); 2663 uc16 from = range.from(); 2664 uc16 to = range.to(); 2665 if (from > char_mask) continue; 2666 if (to > char_mask) to = char_mask; 2667 // Here we are combining more ranges into the mask and compare 2668 // value. With each new range the mask becomes more sparse and 2669 // so the chances of a false positive rise. A character class 2670 // with multiple ranges is assumed never to be equivalent to a 2671 // mask and compare operation. 2672 pos->determines_perfectly = false; 2673 uint32_t new_common_bits = (from ^ to); 2674 new_common_bits = ~SmearBitsRight(new_common_bits); 2675 common_bits &= new_common_bits; 2676 bits &= new_common_bits; 2677 uint32_t differing_bits = (from & common_bits) ^ bits; 2678 common_bits ^= differing_bits; 2679 bits &= common_bits; 2680 } 2681 pos->mask = common_bits; 2682 pos->value = bits; 2683 } 2684 characters_filled_in++; 2685 ASSERT(characters_filled_in <= details->characters()); 2686 if (characters_filled_in == details->characters()) { 2687 return; 2688 } 2689 } 2690 } 2691 ASSERT(characters_filled_in != details->characters()); 2692 if (!details->cannot_match()) { 2693 on_success()-> GetQuickCheckDetails(details, 2694 compiler, 2695 characters_filled_in, 2696 true); 2697 } 2698 } 2699 2700 2701 void QuickCheckDetails::Clear() { 2702 for (int i = 0; i < characters_; i++) { 2703 positions_[i].mask = 0; 2704 positions_[i].value = 0; 2705 positions_[i].determines_perfectly = false; 2706 } 2707 characters_ = 0; 2708 } 2709 2710 2711 void QuickCheckDetails::Advance(int by, bool ascii) { 2712 ASSERT(by >= 0); 2713 if (by >= characters_) { 2714 Clear(); 2715 return; 2716 } 2717 for (int i = 0; i < characters_ - by; i++) { 2718 positions_[i] = positions_[by + i]; 2719 } 2720 for (int i = characters_ - by; i < characters_; i++) { 2721 positions_[i].mask = 0; 2722 positions_[i].value = 0; 2723 positions_[i].determines_perfectly = false; 2724 } 2725 characters_ -= by; 2726 // We could change mask_ and value_ here but we would never advance unless 2727 // they had already been used in a check and they won't be used again because 2728 // it would gain us nothing. So there's no point. 2729 } 2730 2731 2732 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2733 ASSERT(characters_ == other->characters_); 2734 if (other->cannot_match_) { 2735 return; 2736 } 2737 if (cannot_match_) { 2738 *this = *other; 2739 return; 2740 } 2741 for (int i = from_index; i < characters_; i++) { 2742 QuickCheckDetails::Position* pos = positions(i); 2743 QuickCheckDetails::Position* other_pos = other->positions(i); 2744 if (pos->mask != other_pos->mask || 2745 pos->value != other_pos->value || 2746 !other_pos->determines_perfectly) { 2747 // Our mask-compare operation will be approximate unless we have the 2748 // exact same operation on both sides of the alternation. 2749 pos->determines_perfectly = false; 2750 } 2751 pos->mask &= other_pos->mask; 2752 pos->value &= pos->mask; 2753 other_pos->value &= pos->mask; 2754 uc16 differing_bits = (pos->value ^ other_pos->value); 2755 pos->mask &= ~differing_bits; 2756 pos->value &= pos->mask; 2757 } 2758 } 2759 2760 2761 class VisitMarker { 2762 public: 2763 explicit VisitMarker(NodeInfo* info) : info_(info) { 2764 ASSERT(!info->visited); 2765 info->visited = true; 2766 } 2767 ~VisitMarker() { 2768 info_->visited = false; 2769 } 2770 private: 2771 NodeInfo* info_; 2772 }; 2773 2774 2775 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { 2776 if (info()->replacement_calculated) return replacement(); 2777 if (depth < 0) return this; 2778 ASSERT(!info()->visited); 2779 VisitMarker marker(info()); 2780 return FilterSuccessor(depth - 1, ignore_case); 2781 } 2782 2783 2784 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2785 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); 2786 if (next == NULL) return set_replacement(NULL); 2787 on_success_ = next; 2788 return set_replacement(this); 2789 } 2790 2791 2792 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2793 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2794 // TODO(dcarney): this could be a lot more efficient. 2795 return range.Contains(0x39c) || 2796 range.Contains(0x3bc) || range.Contains(0x178); 2797 } 2798 2799 2800 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2801 for (int i = 0; i < ranges->length(); i++) { 2802 // TODO(dcarney): this could be a lot more efficient. 2803 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2804 } 2805 return false; 2806 } 2807 2808 2809 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { 2810 if (info()->replacement_calculated) return replacement(); 2811 if (depth < 0) return this; 2812 ASSERT(!info()->visited); 2813 VisitMarker marker(info()); 2814 int element_count = elms_->length(); 2815 for (int i = 0; i < element_count; i++) { 2816 TextElement elm = elms_->at(i); 2817 if (elm.text_type() == TextElement::ATOM) { 2818 Vector<const uc16> quarks = elm.atom()->data(); 2819 for (int j = 0; j < quarks.length(); j++) { 2820 uint16_t c = quarks[j]; 2821 if (c <= String::kMaxOneByteCharCode) continue; 2822 if (!ignore_case) return set_replacement(NULL); 2823 // Here, we need to check for characters whose upper and lower cases 2824 // are outside the Latin-1 range. 2825 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2826 // Character is outside Latin-1 completely 2827 if (converted == 0) return set_replacement(NULL); 2828 // Convert quark to Latin-1 in place. 2829 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2830 copy[j] = converted; 2831 } 2832 } else { 2833 ASSERT(elm.text_type() == TextElement::CHAR_CLASS); 2834 RegExpCharacterClass* cc = elm.char_class(); 2835 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2836 if (!CharacterRange::IsCanonical(ranges)) { 2837 CharacterRange::Canonicalize(ranges); 2838 } 2839 // Now they are in order so we only need to look at the first. 2840 int range_count = ranges->length(); 2841 if (cc->is_negated()) { 2842 if (range_count != 0 && 2843 ranges->at(0).from() == 0 && 2844 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2845 // This will be handled in a later filter. 2846 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2847 return set_replacement(NULL); 2848 } 2849 } else { 2850 if (range_count == 0 || 2851 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2852 // This will be handled in a later filter. 2853 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2854 return set_replacement(NULL); 2855 } 2856 } 2857 } 2858 } 2859 return FilterSuccessor(depth - 1, ignore_case); 2860 } 2861 2862 2863 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { 2864 if (info()->replacement_calculated) return replacement(); 2865 if (depth < 0) return this; 2866 if (info()->visited) return this; 2867 { 2868 VisitMarker marker(info()); 2869 2870 RegExpNode* continue_replacement = 2871 continue_node_->FilterASCII(depth - 1, ignore_case); 2872 // If we can't continue after the loop then there is no sense in doing the 2873 // loop. 2874 if (continue_replacement == NULL) return set_replacement(NULL); 2875 } 2876 2877 return ChoiceNode::FilterASCII(depth - 1, ignore_case); 2878 } 2879 2880 2881 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { 2882 if (info()->replacement_calculated) return replacement(); 2883 if (depth < 0) return this; 2884 if (info()->visited) return this; 2885 VisitMarker marker(info()); 2886 int choice_count = alternatives_->length(); 2887 2888 for (int i = 0; i < choice_count; i++) { 2889 GuardedAlternative alternative = alternatives_->at(i); 2890 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2891 set_replacement(this); 2892 return this; 2893 } 2894 } 2895 2896 int surviving = 0; 2897 RegExpNode* survivor = NULL; 2898 for (int i = 0; i < choice_count; i++) { 2899 GuardedAlternative alternative = alternatives_->at(i); 2900 RegExpNode* replacement = 2901 alternative.node()->FilterASCII(depth - 1, ignore_case); 2902 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. 2903 if (replacement != NULL) { 2904 alternatives_->at(i).set_node(replacement); 2905 surviving++; 2906 survivor = replacement; 2907 } 2908 } 2909 if (surviving < 2) return set_replacement(survivor); 2910 2911 set_replacement(this); 2912 if (surviving == choice_count) { 2913 return this; 2914 } 2915 // Only some of the nodes survived the filtering. We need to rebuild the 2916 // alternatives list. 2917 ZoneList<GuardedAlternative>* new_alternatives = 2918 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2919 for (int i = 0; i < choice_count; i++) { 2920 RegExpNode* replacement = 2921 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); 2922 if (replacement != NULL) { 2923 alternatives_->at(i).set_node(replacement); 2924 new_alternatives->Add(alternatives_->at(i), zone()); 2925 } 2926 } 2927 alternatives_ = new_alternatives; 2928 return this; 2929 } 2930 2931 2932 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, 2933 bool ignore_case) { 2934 if (info()->replacement_calculated) return replacement(); 2935 if (depth < 0) return this; 2936 if (info()->visited) return this; 2937 VisitMarker marker(info()); 2938 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2939 // afterwards. 2940 RegExpNode* node = alternatives_->at(1).node(); 2941 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); 2942 if (replacement == NULL) return set_replacement(NULL); 2943 alternatives_->at(1).set_node(replacement); 2944 2945 RegExpNode* neg_node = alternatives_->at(0).node(); 2946 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); 2947 // If the negative lookahead is always going to fail then 2948 // we don't need to check it. 2949 if (neg_replacement == NULL) return set_replacement(replacement); 2950 alternatives_->at(0).set_node(neg_replacement); 2951 return set_replacement(this); 2952 } 2953 2954 2955 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2956 RegExpCompiler* compiler, 2957 int characters_filled_in, 2958 bool not_at_start) { 2959 if (body_can_be_zero_length_ || info()->visited) return; 2960 VisitMarker marker(info()); 2961 return ChoiceNode::GetQuickCheckDetails(details, 2962 compiler, 2963 characters_filled_in, 2964 not_at_start); 2965 } 2966 2967 2968 void LoopChoiceNode::FillInBMInfo(int offset, 2969 int budget, 2970 BoyerMooreLookahead* bm, 2971 bool not_at_start) { 2972 if (body_can_be_zero_length_ || budget <= 0) { 2973 bm->SetRest(offset); 2974 SaveBMInfo(bm, not_at_start, offset); 2975 return; 2976 } 2977 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start); 2978 SaveBMInfo(bm, not_at_start, offset); 2979 } 2980 2981 2982 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2983 RegExpCompiler* compiler, 2984 int characters_filled_in, 2985 bool not_at_start) { 2986 not_at_start = (not_at_start || not_at_start_); 2987 int choice_count = alternatives_->length(); 2988 ASSERT(choice_count > 0); 2989 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2990 compiler, 2991 characters_filled_in, 2992 not_at_start); 2993 for (int i = 1; i < choice_count; i++) { 2994 QuickCheckDetails new_details(details->characters()); 2995 RegExpNode* node = alternatives_->at(i).node(); 2996 node->GetQuickCheckDetails(&new_details, compiler, 2997 characters_filled_in, 2998 not_at_start); 2999 // Here we merge the quick match details of the two branches. 3000 details->Merge(&new_details, characters_filled_in); 3001 } 3002 } 3003 3004 3005 // Check for [0-9A-Z_a-z]. 3006 static void EmitWordCheck(RegExpMacroAssembler* assembler, 3007 Label* word, 3008 Label* non_word, 3009 bool fall_through_on_word) { 3010 if (assembler->CheckSpecialCharacterClass( 3011 fall_through_on_word ? 'w' : 'W', 3012 fall_through_on_word ? non_word : word)) { 3013 // Optimized implementation available. 3014 return; 3015 } 3016 assembler->CheckCharacterGT('z', non_word); 3017 assembler->CheckCharacterLT('0', non_word); 3018 assembler->CheckCharacterGT('a' - 1, word); 3019 assembler->CheckCharacterLT('9' + 1, word); 3020 assembler->CheckCharacterLT('A', non_word); 3021 assembler->CheckCharacterLT('Z' + 1, word); 3022 if (fall_through_on_word) { 3023 assembler->CheckNotCharacter('_', non_word); 3024 } else { 3025 assembler->CheckCharacter('_', word); 3026 } 3027 } 3028 3029 3030 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 3031 // that matches newline or the start of input). 3032 static void EmitHat(RegExpCompiler* compiler, 3033 RegExpNode* on_success, 3034 Trace* trace) { 3035 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3036 // We will be loading the previous character into the current character 3037 // register. 3038 Trace new_trace(*trace); 3039 new_trace.InvalidateCurrentCharacter(); 3040 3041 Label ok; 3042 if (new_trace.cp_offset() == 0) { 3043 // The start of input counts as a newline in this context, so skip to 3044 // ok if we are at the start. 3045 assembler->CheckAtStart(&ok); 3046 } 3047 // We already checked that we are not at the start of input so it must be 3048 // OK to load the previous character. 3049 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3050 new_trace.backtrack(), 3051 false); 3052 if (!assembler->CheckSpecialCharacterClass('n', 3053 new_trace.backtrack())) { 3054 // Newline means \n, \r, 0x2028 or 0x2029. 3055 if (!compiler->ascii()) { 3056 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3057 } 3058 assembler->CheckCharacter('\n', &ok); 3059 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3060 } 3061 assembler->Bind(&ok); 3062 on_success->Emit(compiler, &new_trace); 3063 } 3064 3065 3066 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3067 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3068 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3069 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3070 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3071 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3072 if (lookahead == NULL) { 3073 int eats_at_least = 3074 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3075 kRecursionBudget, 3076 not_at_start)); 3077 if (eats_at_least >= 1) { 3078 BoyerMooreLookahead* bm = 3079 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3080 FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 3081 if (bm->at(0)->is_non_word()) 3082 next_is_word_character = Trace::FALSE_VALUE; 3083 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3084 } 3085 } else { 3086 if (lookahead->at(0)->is_non_word()) 3087 next_is_word_character = Trace::FALSE_VALUE; 3088 if (lookahead->at(0)->is_word()) 3089 next_is_word_character = Trace::TRUE_VALUE; 3090 } 3091 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3092 if (next_is_word_character == Trace::UNKNOWN) { 3093 Label before_non_word; 3094 Label before_word; 3095 if (trace->characters_preloaded() != 1) { 3096 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3097 } 3098 // Fall through on non-word. 3099 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3100 // Next character is not a word character. 3101 assembler->Bind(&before_non_word); 3102 Label ok; 3103 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3104 assembler->GoTo(&ok); 3105 3106 assembler->Bind(&before_word); 3107 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3108 assembler->Bind(&ok); 3109 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3110 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3111 } else { 3112 ASSERT(next_is_word_character == Trace::FALSE_VALUE); 3113 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3114 } 3115 } 3116 3117 3118 void AssertionNode::BacktrackIfPrevious( 3119 RegExpCompiler* compiler, 3120 Trace* trace, 3121 AssertionNode::IfPrevious backtrack_if_previous) { 3122 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3123 Trace new_trace(*trace); 3124 new_trace.InvalidateCurrentCharacter(); 3125 3126 Label fall_through, dummy; 3127 3128 Label* non_word = backtrack_if_previous == kIsNonWord ? 3129 new_trace.backtrack() : 3130 &fall_through; 3131 Label* word = backtrack_if_previous == kIsNonWord ? 3132 &fall_through : 3133 new_trace.backtrack(); 3134 3135 if (new_trace.cp_offset() == 0) { 3136 // The start of input counts as a non-word character, so the question is 3137 // decided if we are at the start. 3138 assembler->CheckAtStart(non_word); 3139 } 3140 // We already checked that we are not at the start of input so it must be 3141 // OK to load the previous character. 3142 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3143 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3144 3145 assembler->Bind(&fall_through); 3146 on_success()->Emit(compiler, &new_trace); 3147 } 3148 3149 3150 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3151 RegExpCompiler* compiler, 3152 int filled_in, 3153 bool not_at_start) { 3154 if (assertion_type_ == AT_START && not_at_start) { 3155 details->set_cannot_match(); 3156 return; 3157 } 3158 return on_success()->GetQuickCheckDetails(details, 3159 compiler, 3160 filled_in, 3161 not_at_start); 3162 } 3163 3164 3165 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3166 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3167 switch (assertion_type_) { 3168 case AT_END: { 3169 Label ok; 3170 assembler->CheckPosition(trace->cp_offset(), &ok); 3171 assembler->GoTo(trace->backtrack()); 3172 assembler->Bind(&ok); 3173 break; 3174 } 3175 case AT_START: { 3176 if (trace->at_start() == Trace::FALSE_VALUE) { 3177 assembler->GoTo(trace->backtrack()); 3178 return; 3179 } 3180 if (trace->at_start() == Trace::UNKNOWN) { 3181 assembler->CheckNotAtStart(trace->backtrack()); 3182 Trace at_start_trace = *trace; 3183 at_start_trace.set_at_start(true); 3184 on_success()->Emit(compiler, &at_start_trace); 3185 return; 3186 } 3187 } 3188 break; 3189 case AFTER_NEWLINE: 3190 EmitHat(compiler, on_success(), trace); 3191 return; 3192 case AT_BOUNDARY: 3193 case AT_NON_BOUNDARY: { 3194 EmitBoundaryCheck(compiler, trace); 3195 return; 3196 } 3197 } 3198 on_success()->Emit(compiler, trace); 3199 } 3200 3201 3202 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3203 if (quick_check == NULL) return false; 3204 if (offset >= quick_check->characters()) return false; 3205 return quick_check->positions(offset)->determines_perfectly; 3206 } 3207 3208 3209 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3210 if (index > *checked_up_to) { 3211 *checked_up_to = index; 3212 } 3213 } 3214 3215 3216 // We call this repeatedly to generate code for each pass over the text node. 3217 // The passes are in increasing order of difficulty because we hope one 3218 // of the first passes will fail in which case we are saved the work of the 3219 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3220 // we will check the '%' in the first pass, the case independent 'a' in the 3221 // second pass and the character class in the last pass. 3222 // 3223 // The passes are done from right to left, so for example to test for /bar/ 3224 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3225 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3226 // bounds check most of the time. In the example we only need to check for 3227 // end-of-input when loading the putative 'r'. 3228 // 3229 // A slight complication involves the fact that the first character may already 3230 // be fetched into a register by the previous node. In this case we want to 3231 // do the test for that character first. We do this in separate passes. The 3232 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3233 // pass has been performed then subsequent passes will have true in 3234 // first_element_checked to indicate that that character does not need to be 3235 // checked again. 3236 // 3237 // In addition to all this we are passed a Trace, which can 3238 // contain an AlternativeGeneration object. In this AlternativeGeneration 3239 // object we can see details of any quick check that was already passed in 3240 // order to get to the code we are now generating. The quick check can involve 3241 // loading characters, which means we do not need to recheck the bounds 3242 // up to the limit the quick check already checked. In addition the quick 3243 // check can have involved a mask and compare operation which may simplify 3244 // or obviate the need for further checks at some character positions. 3245 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3246 TextEmitPassType pass, 3247 bool preloaded, 3248 Trace* trace, 3249 bool first_element_checked, 3250 int* checked_up_to) { 3251 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3252 Isolate* isolate = assembler->zone()->isolate(); 3253 bool ascii = compiler->ascii(); 3254 Label* backtrack = trace->backtrack(); 3255 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3256 int element_count = elms_->length(); 3257 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3258 TextElement elm = elms_->at(i); 3259 int cp_offset = trace->cp_offset() + elm.cp_offset(); 3260 if (elm.text_type() == TextElement::ATOM) { 3261 Vector<const uc16> quarks = elm.atom()->data(); 3262 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3263 if (first_element_checked && i == 0 && j == 0) continue; 3264 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3265 EmitCharacterFunction* emit_function = NULL; 3266 switch (pass) { 3267 case NON_ASCII_MATCH: 3268 ASSERT(ascii); 3269 if (quarks[j] > String::kMaxOneByteCharCode) { 3270 assembler->GoTo(backtrack); 3271 return; 3272 } 3273 break; 3274 case NON_LETTER_CHARACTER_MATCH: 3275 emit_function = &EmitAtomNonLetter; 3276 break; 3277 case SIMPLE_CHARACTER_MATCH: 3278 emit_function = &EmitSimpleCharacter; 3279 break; 3280 case CASE_CHARACTER_MATCH: 3281 emit_function = &EmitAtomLetter; 3282 break; 3283 default: 3284 break; 3285 } 3286 if (emit_function != NULL) { 3287 bool bound_checked = emit_function(isolate, 3288 compiler, 3289 quarks[j], 3290 backtrack, 3291 cp_offset + j, 3292 *checked_up_to < cp_offset + j, 3293 preloaded); 3294 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3295 } 3296 } 3297 } else { 3298 ASSERT_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3299 if (pass == CHARACTER_CLASS_MATCH) { 3300 if (first_element_checked && i == 0) continue; 3301 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3302 RegExpCharacterClass* cc = elm.char_class(); 3303 EmitCharClass(assembler, 3304 cc, 3305 ascii, 3306 backtrack, 3307 cp_offset, 3308 *checked_up_to < cp_offset, 3309 preloaded, 3310 zone()); 3311 UpdateBoundsCheck(cp_offset, checked_up_to); 3312 } 3313 } 3314 } 3315 } 3316 3317 3318 int TextNode::Length() { 3319 TextElement elm = elms_->last(); 3320 ASSERT(elm.cp_offset() >= 0); 3321 return elm.cp_offset() + elm.length(); 3322 } 3323 3324 3325 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3326 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3327 if (ignore_case) { 3328 return pass == SIMPLE_CHARACTER_MATCH; 3329 } else { 3330 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3331 } 3332 } 3333 3334 3335 // This generates the code to match a text node. A text node can contain 3336 // straight character sequences (possibly to be matched in a case-independent 3337 // way) and character classes. For efficiency we do not do this in a single 3338 // pass from left to right. Instead we pass over the text node several times, 3339 // emitting code for some character positions every time. See the comment on 3340 // TextEmitPass for details. 3341 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3342 LimitResult limit_result = LimitVersions(compiler, trace); 3343 if (limit_result == DONE) return; 3344 ASSERT(limit_result == CONTINUE); 3345 3346 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3347 compiler->SetRegExpTooBig(); 3348 return; 3349 } 3350 3351 if (compiler->ascii()) { 3352 int dummy = 0; 3353 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 3354 } 3355 3356 bool first_elt_done = false; 3357 int bound_checked_to = trace->cp_offset() - 1; 3358 bound_checked_to += trace->bound_checked_up_to(); 3359 3360 // If a character is preloaded into the current character register then 3361 // check that now. 3362 if (trace->characters_preloaded() == 1) { 3363 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3364 if (!SkipPass(pass, compiler->ignore_case())) { 3365 TextEmitPass(compiler, 3366 static_cast<TextEmitPassType>(pass), 3367 true, 3368 trace, 3369 false, 3370 &bound_checked_to); 3371 } 3372 } 3373 first_elt_done = true; 3374 } 3375 3376 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3377 if (!SkipPass(pass, compiler->ignore_case())) { 3378 TextEmitPass(compiler, 3379 static_cast<TextEmitPassType>(pass), 3380 false, 3381 trace, 3382 first_elt_done, 3383 &bound_checked_to); 3384 } 3385 } 3386 3387 Trace successor_trace(*trace); 3388 successor_trace.set_at_start(false); 3389 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 3390 RecursionCheck rc(compiler); 3391 on_success()->Emit(compiler, &successor_trace); 3392 } 3393 3394 3395 void Trace::InvalidateCurrentCharacter() { 3396 characters_preloaded_ = 0; 3397 } 3398 3399 3400 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3401 ASSERT(by > 0); 3402 // We don't have an instruction for shifting the current character register 3403 // down or for using a shifted value for anything so lets just forget that 3404 // we preloaded any characters into it. 3405 characters_preloaded_ = 0; 3406 // Adjust the offsets of the quick check performed information. This 3407 // information is used to find out what we already determined about the 3408 // characters by means of mask and compare. 3409 quick_check_performed_.Advance(by, compiler->ascii()); 3410 cp_offset_ += by; 3411 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3412 compiler->SetRegExpTooBig(); 3413 cp_offset_ = 0; 3414 } 3415 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3416 } 3417 3418 3419 void TextNode::MakeCaseIndependent(bool is_ascii) { 3420 int element_count = elms_->length(); 3421 for (int i = 0; i < element_count; i++) { 3422 TextElement elm = elms_->at(i); 3423 if (elm.text_type() == TextElement::CHAR_CLASS) { 3424 RegExpCharacterClass* cc = elm.char_class(); 3425 // None of the standard character classes is different in the case 3426 // independent case and it slows us down if we don't know that. 3427 if (cc->is_standard(zone())) continue; 3428 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3429 int range_count = ranges->length(); 3430 for (int j = 0; j < range_count; j++) { 3431 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone()); 3432 } 3433 } 3434 } 3435 } 3436 3437 3438 int TextNode::GreedyLoopTextLength() { 3439 TextElement elm = elms_->at(elms_->length() - 1); 3440 return elm.cp_offset() + elm.length(); 3441 } 3442 3443 3444 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3445 RegExpCompiler* compiler) { 3446 if (elms_->length() != 1) return NULL; 3447 TextElement elm = elms_->at(0); 3448 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3449 RegExpCharacterClass* node = elm.char_class(); 3450 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3451 if (!CharacterRange::IsCanonical(ranges)) { 3452 CharacterRange::Canonicalize(ranges); 3453 } 3454 if (node->is_negated()) { 3455 return ranges->length() == 0 ? on_success() : NULL; 3456 } 3457 if (ranges->length() != 1) return NULL; 3458 uint32_t max_char; 3459 if (compiler->ascii()) { 3460 max_char = String::kMaxOneByteCharCode; 3461 } else { 3462 max_char = String::kMaxUtf16CodeUnit; 3463 } 3464 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3465 } 3466 3467 3468 // Finds the fixed match length of a sequence of nodes that goes from 3469 // this alternative and back to this choice node. If there are variable 3470 // length nodes or other complications in the way then return a sentinel 3471 // value indicating that a greedy loop cannot be constructed. 3472 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3473 GuardedAlternative* alternative) { 3474 int length = 0; 3475 RegExpNode* node = alternative->node(); 3476 // Later we will generate code for all these text nodes using recursion 3477 // so we have to limit the max number. 3478 int recursion_depth = 0; 3479 while (node != this) { 3480 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3481 return kNodeIsTooComplexForGreedyLoops; 3482 } 3483 int node_length = node->GreedyLoopTextLength(); 3484 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3485 return kNodeIsTooComplexForGreedyLoops; 3486 } 3487 length += node_length; 3488 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3489 node = seq_node->on_success(); 3490 } 3491 return length; 3492 } 3493 3494 3495 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3496 ASSERT_EQ(loop_node_, NULL); 3497 AddAlternative(alt); 3498 loop_node_ = alt.node(); 3499 } 3500 3501 3502 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3503 ASSERT_EQ(continue_node_, NULL); 3504 AddAlternative(alt); 3505 continue_node_ = alt.node(); 3506 } 3507 3508 3509 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3510 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3511 if (trace->stop_node() == this) { 3512 int text_length = 3513 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3514 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 3515 // Update the counter-based backtracking info on the stack. This is an 3516 // optimization for greedy loops (see below). 3517 ASSERT(trace->cp_offset() == text_length); 3518 macro_assembler->AdvanceCurrentPosition(text_length); 3519 macro_assembler->GoTo(trace->loop_label()); 3520 return; 3521 } 3522 ASSERT(trace->stop_node() == NULL); 3523 if (!trace->is_trivial()) { 3524 trace->Flush(compiler, this); 3525 return; 3526 } 3527 ChoiceNode::Emit(compiler, trace); 3528 } 3529 3530 3531 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3532 int eats_at_least) { 3533 int preload_characters = Min(4, eats_at_least); 3534 if (compiler->macro_assembler()->CanReadUnaligned()) { 3535 bool ascii = compiler->ascii(); 3536 if (ascii) { 3537 if (preload_characters > 4) preload_characters = 4; 3538 // We can't preload 3 characters because there is no machine instruction 3539 // to do that. We can't just load 4 because we could be reading 3540 // beyond the end of the string, which could cause a memory fault. 3541 if (preload_characters == 3) preload_characters = 2; 3542 } else { 3543 if (preload_characters > 2) preload_characters = 2; 3544 } 3545 } else { 3546 if (preload_characters > 1) preload_characters = 1; 3547 } 3548 return preload_characters; 3549 } 3550 3551 3552 // This class is used when generating the alternatives in a choice node. It 3553 // records the way the alternative is being code generated. 3554 class AlternativeGeneration: public Malloced { 3555 public: 3556 AlternativeGeneration() 3557 : possible_success(), 3558 expects_preload(false), 3559 after(), 3560 quick_check_details() { } 3561 Label possible_success; 3562 bool expects_preload; 3563 Label after; 3564 QuickCheckDetails quick_check_details; 3565 }; 3566 3567 3568 // Creates a list of AlternativeGenerations. If the list has a reasonable 3569 // size then it is on the stack, otherwise the excess is on the heap. 3570 class AlternativeGenerationList { 3571 public: 3572 AlternativeGenerationList(int count, Zone* zone) 3573 : alt_gens_(count, zone) { 3574 for (int i = 0; i < count && i < kAFew; i++) { 3575 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3576 } 3577 for (int i = kAFew; i < count; i++) { 3578 alt_gens_.Add(new AlternativeGeneration(), zone); 3579 } 3580 } 3581 ~AlternativeGenerationList() { 3582 for (int i = kAFew; i < alt_gens_.length(); i++) { 3583 delete alt_gens_[i]; 3584 alt_gens_[i] = NULL; 3585 } 3586 } 3587 3588 AlternativeGeneration* at(int i) { 3589 return alt_gens_[i]; 3590 } 3591 3592 private: 3593 static const int kAFew = 10; 3594 ZoneList<AlternativeGeneration*> alt_gens_; 3595 AlternativeGeneration a_few_alt_gens_[kAFew]; 3596 }; 3597 3598 3599 // The '2' variant is has inclusive from and exclusive to. 3600 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 3601 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 3602 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 }; 3603 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); 3604 3605 static const int kWordRanges[] = { 3606 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 3607 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); 3608 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 3609 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); 3610 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; 3611 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); 3612 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 3613 0x2028, 0x202A, 0x10000 }; 3614 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); 3615 3616 3617 void BoyerMoorePositionInfo::Set(int character) { 3618 SetInterval(Interval(character, character)); 3619 } 3620 3621 3622 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3623 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3624 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3625 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3626 surrogate_ = 3627 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3628 if (interval.to() - interval.from() >= kMapSize - 1) { 3629 if (map_count_ != kMapSize) { 3630 map_count_ = kMapSize; 3631 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3632 } 3633 return; 3634 } 3635 for (int i = interval.from(); i <= interval.to(); i++) { 3636 int mod_character = (i & kMask); 3637 if (!map_->at(mod_character)) { 3638 map_count_++; 3639 map_->at(mod_character) = true; 3640 } 3641 if (map_count_ == kMapSize) return; 3642 } 3643 } 3644 3645 3646 void BoyerMoorePositionInfo::SetAll() { 3647 s_ = w_ = d_ = kLatticeUnknown; 3648 if (map_count_ != kMapSize) { 3649 map_count_ = kMapSize; 3650 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3651 } 3652 } 3653 3654 3655 BoyerMooreLookahead::BoyerMooreLookahead( 3656 int length, RegExpCompiler* compiler, Zone* zone) 3657 : length_(length), 3658 compiler_(compiler) { 3659 if (compiler->ascii()) { 3660 max_char_ = String::kMaxOneByteCharCode; 3661 } else { 3662 max_char_ = String::kMaxUtf16CodeUnit; 3663 } 3664 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3665 for (int i = 0; i < length; i++) { 3666 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3667 } 3668 } 3669 3670 3671 // Find the longest range of lookahead that has the fewest number of different 3672 // characters that can occur at a given position. Since we are optimizing two 3673 // different parameters at once this is a tradeoff. 3674 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3675 int biggest_points = 0; 3676 // If more than 32 characters out of 128 can occur it is unlikely that we can 3677 // be lucky enough to step forwards much of the time. 3678 const int kMaxMax = 32; 3679 for (int max_number_of_chars = 4; 3680 max_number_of_chars < kMaxMax; 3681 max_number_of_chars *= 2) { 3682 biggest_points = 3683 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3684 } 3685 if (biggest_points == 0) return false; 3686 return true; 3687 } 3688 3689 3690 // Find the highest-points range between 0 and length_ where the character 3691 // information is not too vague. 'Too vague' means that there are more than 3692 // max_number_of_chars that can occur at this position. Calculates the number 3693 // of points as the product of width-of-the-range and 3694 // probability-of-finding-one-of-the-characters, where the probability is 3695 // calculated using the frequency distribution of the sample subject string. 3696 int BoyerMooreLookahead::FindBestInterval( 3697 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3698 int biggest_points = old_biggest_points; 3699 static const int kSize = RegExpMacroAssembler::kTableSize; 3700 for (int i = 0; i < length_; ) { 3701 while (i < length_ && Count(i) > max_number_of_chars) i++; 3702 if (i == length_) break; 3703 int remembered_from = i; 3704 bool union_map[kSize]; 3705 for (int j = 0; j < kSize; j++) union_map[j] = false; 3706 while (i < length_ && Count(i) <= max_number_of_chars) { 3707 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3708 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3709 i++; 3710 } 3711 int frequency = 0; 3712 for (int j = 0; j < kSize; j++) { 3713 if (union_map[j]) { 3714 // Add 1 to the frequency to give a small per-character boost for 3715 // the cases where our sampling is not good enough and many 3716 // characters have a frequency of zero. This means the frequency 3717 // can theoretically be up to 2*kSize though we treat it mostly as 3718 // a fraction of kSize. 3719 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3720 } 3721 } 3722 // We use the probability of skipping times the distance we are skipping to 3723 // judge the effectiveness of this. Actually we have a cut-off: By 3724 // dividing by 2 we switch off the skipping if the probability of skipping 3725 // is less than 50%. This is because the multibyte mask-and-compare 3726 // skipping in quickcheck is more likely to do well on this case. 3727 bool in_quickcheck_range = ((i - remembered_from < 4) || 3728 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); 3729 // Called 'probability' but it is only a rough estimate and can actually 3730 // be outside the 0-kSize range. 3731 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3732 int points = (i - remembered_from) * probability; 3733 if (points > biggest_points) { 3734 *from = remembered_from; 3735 *to = i - 1; 3736 biggest_points = points; 3737 } 3738 } 3739 return biggest_points; 3740 } 3741 3742 3743 // Take all the characters that will not prevent a successful match if they 3744 // occur in the subject string in the range between min_lookahead and 3745 // max_lookahead (inclusive) measured from the current position. If the 3746 // character at max_lookahead offset is not one of these characters, then we 3747 // can safely skip forwards by the number of characters in the range. 3748 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3749 int max_lookahead, 3750 Handle<ByteArray> boolean_skip_table) { 3751 const int kSize = RegExpMacroAssembler::kTableSize; 3752 3753 const int kSkipArrayEntry = 0; 3754 const int kDontSkipArrayEntry = 1; 3755 3756 for (int i = 0; i < kSize; i++) { 3757 boolean_skip_table->set(i, kSkipArrayEntry); 3758 } 3759 int skip = max_lookahead + 1 - min_lookahead; 3760 3761 for (int i = max_lookahead; i >= min_lookahead; i--) { 3762 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3763 for (int j = 0; j < kSize; j++) { 3764 if (map->at(j)) { 3765 boolean_skip_table->set(j, kDontSkipArrayEntry); 3766 } 3767 } 3768 } 3769 3770 return skip; 3771 } 3772 3773 3774 // See comment above on the implementation of GetSkipTable. 3775 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3776 const int kSize = RegExpMacroAssembler::kTableSize; 3777 3778 int min_lookahead = 0; 3779 int max_lookahead = 0; 3780 3781 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; 3782 3783 bool found_single_character = false; 3784 int single_character = 0; 3785 for (int i = max_lookahead; i >= min_lookahead; i--) { 3786 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3787 if (map->map_count() > 1 || 3788 (found_single_character && map->map_count() != 0)) { 3789 found_single_character = false; 3790 break; 3791 } 3792 for (int j = 0; j < kSize; j++) { 3793 if (map->at(j)) { 3794 found_single_character = true; 3795 single_character = j; 3796 break; 3797 } 3798 } 3799 } 3800 3801 int lookahead_width = max_lookahead + 1 - min_lookahead; 3802 3803 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3804 // The mask-compare can probably handle this better. 3805 return false; 3806 } 3807 3808 if (found_single_character) { 3809 Label cont, again; 3810 masm->Bind(&again); 3811 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3812 if (max_char_ > kSize) { 3813 masm->CheckCharacterAfterAnd(single_character, 3814 RegExpMacroAssembler::kTableMask, 3815 &cont); 3816 } else { 3817 masm->CheckCharacter(single_character, &cont); 3818 } 3819 masm->AdvanceCurrentPosition(lookahead_width); 3820 masm->GoTo(&again); 3821 masm->Bind(&cont); 3822 return true; 3823 } 3824 3825 Factory* factory = masm->zone()->isolate()->factory(); 3826 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3827 int skip_distance = GetSkipTable( 3828 min_lookahead, max_lookahead, boolean_skip_table); 3829 ASSERT(skip_distance != 0); 3830 3831 Label cont, again; 3832 masm->Bind(&again); 3833 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3834 masm->CheckBitInTable(boolean_skip_table, &cont); 3835 masm->AdvanceCurrentPosition(skip_distance); 3836 masm->GoTo(&again); 3837 masm->Bind(&cont); 3838 3839 return true; 3840 } 3841 3842 3843 /* Code generation for choice nodes. 3844 * 3845 * We generate quick checks that do a mask and compare to eliminate a 3846 * choice. If the quick check succeeds then it jumps to the continuation to 3847 * do slow checks and check subsequent nodes. If it fails (the common case) 3848 * it falls through to the next choice. 3849 * 3850 * Here is the desired flow graph. Nodes directly below each other imply 3851 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3852 * 3 doesn't have a quick check so we have to call the slow check. 3853 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3854 * regexp continuation is generated directly after the Sn node, up to the 3855 * next GoTo if we decide to reuse some already generated code. Some 3856 * nodes expect preload_characters to be preloaded into the current 3857 * character register. R nodes do this preloading. Vertices are marked 3858 * F for failures and S for success (possible success in the case of quick 3859 * nodes). L, V, < and > are used as arrow heads. 3860 * 3861 * ----------> R 3862 * | 3863 * V 3864 * Q1 -----> S1 3865 * | S / 3866 * F| / 3867 * | F/ 3868 * | / 3869 * | R 3870 * | / 3871 * V L 3872 * Q2 -----> S2 3873 * | S / 3874 * F| / 3875 * | F/ 3876 * | / 3877 * | R 3878 * | / 3879 * V L 3880 * S3 3881 * | 3882 * F| 3883 * | 3884 * R 3885 * | 3886 * backtrack V 3887 * <----------Q4 3888 * \ F | 3889 * \ |S 3890 * \ F V 3891 * \-----S4 3892 * 3893 * For greedy loops we reverse our expectation and expect to match rather 3894 * than fail. Therefore we want the loop code to look like this (U is the 3895 * unwind code that steps back in the greedy loop). The following alternatives 3896 * look the same as above. 3897 * _____ 3898 * / \ 3899 * V | 3900 * ----------> S1 | 3901 * /| | 3902 * / |S | 3903 * F/ \_____/ 3904 * / 3905 * |<----------- 3906 * | \ 3907 * V \ 3908 * Q2 ---> S2 \ 3909 * | S / | 3910 * F| / | 3911 * | F/ | 3912 * | / | 3913 * | R | 3914 * | / | 3915 * F VL | 3916 * <------U | 3917 * back |S | 3918 * \______________/ 3919 */ 3920 3921 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3922 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3923 int choice_count = alternatives_->length(); 3924 #ifdef DEBUG 3925 for (int i = 0; i < choice_count - 1; i++) { 3926 GuardedAlternative alternative = alternatives_->at(i); 3927 ZoneList<Guard*>* guards = alternative.guards(); 3928 int guard_count = (guards == NULL) ? 0 : guards->length(); 3929 for (int j = 0; j < guard_count; j++) { 3930 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 3931 } 3932 } 3933 #endif 3934 3935 LimitResult limit_result = LimitVersions(compiler, trace); 3936 if (limit_result == DONE) return; 3937 ASSERT(limit_result == CONTINUE); 3938 3939 int new_flush_budget = trace->flush_budget() / choice_count; 3940 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3941 trace->Flush(compiler, this); 3942 return; 3943 } 3944 3945 RecursionCheck rc(compiler); 3946 3947 Trace* current_trace = trace; 3948 3949 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3950 bool greedy_loop = false; 3951 Label greedy_loop_label; 3952 Trace counter_backtrack_trace; 3953 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 3954 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 3955 3956 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3957 // Here we have special handling for greedy loops containing only text nodes 3958 // and other simple nodes. These are handled by pushing the current 3959 // position on the stack and then incrementing the current position each 3960 // time around the switch. On backtrack we decrement the current position 3961 // and check it against the pushed value. This avoids pushing backtrack 3962 // information for each iteration of the loop, which could take up a lot of 3963 // space. 3964 greedy_loop = true; 3965 ASSERT(trace->stop_node() == NULL); 3966 macro_assembler->PushCurrentPosition(); 3967 current_trace = &counter_backtrack_trace; 3968 Label greedy_match_failed; 3969 Trace greedy_match_trace; 3970 if (not_at_start()) greedy_match_trace.set_at_start(false); 3971 greedy_match_trace.set_backtrack(&greedy_match_failed); 3972 Label loop_label; 3973 macro_assembler->Bind(&loop_label); 3974 greedy_match_trace.set_stop_node(this); 3975 greedy_match_trace.set_loop_label(&loop_label); 3976 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 3977 macro_assembler->Bind(&greedy_match_failed); 3978 } 3979 3980 Label second_choice; // For use in greedy matches. 3981 macro_assembler->Bind(&second_choice); 3982 3983 int first_normal_choice = greedy_loop ? 1 : 0; 3984 3985 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE; 3986 const int kEatsAtLeastNotYetInitialized = -1; 3987 int eats_at_least = kEatsAtLeastNotYetInitialized; 3988 3989 bool skip_was_emitted = false; 3990 3991 if (!greedy_loop && choice_count == 2) { 3992 GuardedAlternative alt1 = alternatives_->at(1); 3993 if (alt1.guards() == NULL || alt1.guards()->length() == 0) { 3994 RegExpNode* eats_anything_node = alt1.node(); 3995 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == 3996 this) { 3997 // At this point we know that we are at a non-greedy loop that will eat 3998 // any character one at a time. Any non-anchored regexp has such a 3999 // loop prepended to it in order to find where it starts. We look for 4000 // a pattern of the form ...abc... where we can look 6 characters ahead 4001 // and step forwards 3 if the character is not one of abc. Abc need 4002 // not be atoms, they can be any reasonably limited character class or 4003 // small alternation. 4004 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. 4005 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 4006 if (lookahead == NULL) { 4007 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4008 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4009 kRecursionBudget, 4010 not_at_start)); 4011 if (eats_at_least >= 1) { 4012 BoyerMooreLookahead* bm = 4013 new(zone()) BoyerMooreLookahead(eats_at_least, 4014 compiler, 4015 zone()); 4016 GuardedAlternative alt0 = alternatives_->at(0); 4017 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 4018 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); 4019 } 4020 } else { 4021 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); 4022 } 4023 } 4024 } 4025 } 4026 4027 if (eats_at_least == kEatsAtLeastNotYetInitialized) { 4028 // Save some time by looking at most one machine word ahead. 4029 eats_at_least = 4030 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start); 4031 } 4032 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); 4033 4034 bool preload_is_current = !skip_was_emitted && 4035 (current_trace->characters_preloaded() == preload_characters); 4036 bool preload_has_checked_bounds = preload_is_current; 4037 4038 AlternativeGenerationList alt_gens(choice_count, zone()); 4039 4040 // For now we just call all choices one after the other. The idea ultimately 4041 // is to use the Dispatch table to try only the relevant ones. 4042 for (int i = first_normal_choice; i < choice_count; i++) { 4043 GuardedAlternative alternative = alternatives_->at(i); 4044 AlternativeGeneration* alt_gen = alt_gens.at(i); 4045 alt_gen->quick_check_details.set_characters(preload_characters); 4046 ZoneList<Guard*>* guards = alternative.guards(); 4047 int guard_count = (guards == NULL) ? 0 : guards->length(); 4048 Trace new_trace(*current_trace); 4049 new_trace.set_characters_preloaded(preload_is_current ? 4050 preload_characters : 4051 0); 4052 if (preload_has_checked_bounds) { 4053 new_trace.set_bound_checked_up_to(preload_characters); 4054 } 4055 new_trace.quick_check_performed()->Clear(); 4056 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4057 alt_gen->expects_preload = preload_is_current; 4058 bool generate_full_check_inline = false; 4059 if (FLAG_regexp_optimization && 4060 try_to_emit_quick_check_for_alternative(i) && 4061 alternative.node()->EmitQuickCheck(compiler, 4062 &new_trace, 4063 preload_has_checked_bounds, 4064 &alt_gen->possible_success, 4065 &alt_gen->quick_check_details, 4066 i < choice_count - 1)) { 4067 // Quick check was generated for this choice. 4068 preload_is_current = true; 4069 preload_has_checked_bounds = true; 4070 // On the last choice in the ChoiceNode we generated the quick 4071 // check to fall through on possible success. So now we need to 4072 // generate the full check inline. 4073 if (i == choice_count - 1) { 4074 macro_assembler->Bind(&alt_gen->possible_success); 4075 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4076 new_trace.set_characters_preloaded(preload_characters); 4077 new_trace.set_bound_checked_up_to(preload_characters); 4078 generate_full_check_inline = true; 4079 } 4080 } else if (alt_gen->quick_check_details.cannot_match()) { 4081 if (i == choice_count - 1 && !greedy_loop) { 4082 macro_assembler->GoTo(trace->backtrack()); 4083 } 4084 continue; 4085 } else { 4086 // No quick check was generated. Put the full code here. 4087 // If this is not the first choice then there could be slow checks from 4088 // previous cases that go here when they fail. There's no reason to 4089 // insist that they preload characters since the slow check we are about 4090 // to generate probably can't use it. 4091 if (i != first_normal_choice) { 4092 alt_gen->expects_preload = false; 4093 new_trace.InvalidateCurrentCharacter(); 4094 } 4095 if (i < choice_count - 1) { 4096 new_trace.set_backtrack(&alt_gen->after); 4097 } 4098 generate_full_check_inline = true; 4099 } 4100 if (generate_full_check_inline) { 4101 if (new_trace.actions() != NULL) { 4102 new_trace.set_flush_budget(new_flush_budget); 4103 } 4104 for (int j = 0; j < guard_count; j++) { 4105 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4106 } 4107 alternative.node()->Emit(compiler, &new_trace); 4108 preload_is_current = false; 4109 } 4110 macro_assembler->Bind(&alt_gen->after); 4111 } 4112 if (greedy_loop) { 4113 macro_assembler->Bind(&greedy_loop_label); 4114 // If we have unwound to the bottom then backtrack. 4115 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4116 // Otherwise try the second priority at an earlier position. 4117 macro_assembler->AdvanceCurrentPosition(-text_length); 4118 macro_assembler->GoTo(&second_choice); 4119 } 4120 4121 // At this point we need to generate slow checks for the alternatives where 4122 // the quick check was inlined. We can recognize these because the associated 4123 // label was bound. 4124 for (int i = first_normal_choice; i < choice_count - 1; i++) { 4125 AlternativeGeneration* alt_gen = alt_gens.at(i); 4126 Trace new_trace(*current_trace); 4127 // If there are actions to be flushed we have to limit how many times 4128 // they are flushed. Take the budget of the parent trace and distribute 4129 // it fairly amongst the children. 4130 if (new_trace.actions() != NULL) { 4131 new_trace.set_flush_budget(new_flush_budget); 4132 } 4133 EmitOutOfLineContinuation(compiler, 4134 &new_trace, 4135 alternatives_->at(i), 4136 alt_gen, 4137 preload_characters, 4138 alt_gens.at(i + 1)->expects_preload); 4139 } 4140 } 4141 4142 4143 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4144 Trace* trace, 4145 GuardedAlternative alternative, 4146 AlternativeGeneration* alt_gen, 4147 int preload_characters, 4148 bool next_expects_preload) { 4149 if (!alt_gen->possible_success.is_linked()) return; 4150 4151 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4152 macro_assembler->Bind(&alt_gen->possible_success); 4153 Trace out_of_line_trace(*trace); 4154 out_of_line_trace.set_characters_preloaded(preload_characters); 4155 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4156 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4157 ZoneList<Guard*>* guards = alternative.guards(); 4158 int guard_count = (guards == NULL) ? 0 : guards->length(); 4159 if (next_expects_preload) { 4160 Label reload_current_char; 4161 out_of_line_trace.set_backtrack(&reload_current_char); 4162 for (int j = 0; j < guard_count; j++) { 4163 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4164 } 4165 alternative.node()->Emit(compiler, &out_of_line_trace); 4166 macro_assembler->Bind(&reload_current_char); 4167 // Reload the current character, since the next quick check expects that. 4168 // We don't need to check bounds here because we only get into this 4169 // code through a quick check which already did the checked load. 4170 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4171 NULL, 4172 false, 4173 preload_characters); 4174 macro_assembler->GoTo(&(alt_gen->after)); 4175 } else { 4176 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4177 for (int j = 0; j < guard_count; j++) { 4178 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4179 } 4180 alternative.node()->Emit(compiler, &out_of_line_trace); 4181 } 4182 } 4183 4184 4185 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4186 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4187 LimitResult limit_result = LimitVersions(compiler, trace); 4188 if (limit_result == DONE) return; 4189 ASSERT(limit_result == CONTINUE); 4190 4191 RecursionCheck rc(compiler); 4192 4193 switch (action_type_) { 4194 case STORE_POSITION: { 4195 Trace::DeferredCapture 4196 new_capture(data_.u_position_register.reg, 4197 data_.u_position_register.is_capture, 4198 trace); 4199 Trace new_trace = *trace; 4200 new_trace.add_action(&new_capture); 4201 on_success()->Emit(compiler, &new_trace); 4202 break; 4203 } 4204 case INCREMENT_REGISTER: { 4205 Trace::DeferredIncrementRegister 4206 new_increment(data_.u_increment_register.reg); 4207 Trace new_trace = *trace; 4208 new_trace.add_action(&new_increment); 4209 on_success()->Emit(compiler, &new_trace); 4210 break; 4211 } 4212 case SET_REGISTER: { 4213 Trace::DeferredSetRegister 4214 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4215 Trace new_trace = *trace; 4216 new_trace.add_action(&new_set); 4217 on_success()->Emit(compiler, &new_trace); 4218 break; 4219 } 4220 case CLEAR_CAPTURES: { 4221 Trace::DeferredClearCaptures 4222 new_capture(Interval(data_.u_clear_captures.range_from, 4223 data_.u_clear_captures.range_to)); 4224 Trace new_trace = *trace; 4225 new_trace.add_action(&new_capture); 4226 on_success()->Emit(compiler, &new_trace); 4227 break; 4228 } 4229 case BEGIN_SUBMATCH: 4230 if (!trace->is_trivial()) { 4231 trace->Flush(compiler, this); 4232 } else { 4233 assembler->WriteCurrentPositionToRegister( 4234 data_.u_submatch.current_position_register, 0); 4235 assembler->WriteStackPointerToRegister( 4236 data_.u_submatch.stack_pointer_register); 4237 on_success()->Emit(compiler, trace); 4238 } 4239 break; 4240 case EMPTY_MATCH_CHECK: { 4241 int start_pos_reg = data_.u_empty_match_check.start_register; 4242 int stored_pos = 0; 4243 int rep_reg = data_.u_empty_match_check.repetition_register; 4244 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4245 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4246 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4247 // If we know we haven't advanced and there is no minimum we 4248 // can just backtrack immediately. 4249 assembler->GoTo(trace->backtrack()); 4250 } else if (know_dist && stored_pos < trace->cp_offset()) { 4251 // If we know we've advanced we can generate the continuation 4252 // immediately. 4253 on_success()->Emit(compiler, trace); 4254 } else if (!trace->is_trivial()) { 4255 trace->Flush(compiler, this); 4256 } else { 4257 Label skip_empty_check; 4258 // If we have a minimum number of repetitions we check the current 4259 // number first and skip the empty check if it's not enough. 4260 if (has_minimum) { 4261 int limit = data_.u_empty_match_check.repetition_limit; 4262 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4263 } 4264 // If the match is empty we bail out, otherwise we fall through 4265 // to the on-success continuation. 4266 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4267 trace->backtrack()); 4268 assembler->Bind(&skip_empty_check); 4269 on_success()->Emit(compiler, trace); 4270 } 4271 break; 4272 } 4273 case POSITIVE_SUBMATCH_SUCCESS: { 4274 if (!trace->is_trivial()) { 4275 trace->Flush(compiler, this); 4276 return; 4277 } 4278 assembler->ReadCurrentPositionFromRegister( 4279 data_.u_submatch.current_position_register); 4280 assembler->ReadStackPointerFromRegister( 4281 data_.u_submatch.stack_pointer_register); 4282 int clear_register_count = data_.u_submatch.clear_register_count; 4283 if (clear_register_count == 0) { 4284 on_success()->Emit(compiler, trace); 4285 return; 4286 } 4287 int clear_registers_from = data_.u_submatch.clear_register_from; 4288 Label clear_registers_backtrack; 4289 Trace new_trace = *trace; 4290 new_trace.set_backtrack(&clear_registers_backtrack); 4291 on_success()->Emit(compiler, &new_trace); 4292 4293 assembler->Bind(&clear_registers_backtrack); 4294 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4295 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4296 4297 ASSERT(trace->backtrack() == NULL); 4298 assembler->Backtrack(); 4299 return; 4300 } 4301 default: 4302 UNREACHABLE(); 4303 } 4304 } 4305 4306 4307 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4308 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4309 if (!trace->is_trivial()) { 4310 trace->Flush(compiler, this); 4311 return; 4312 } 4313 4314 LimitResult limit_result = LimitVersions(compiler, trace); 4315 if (limit_result == DONE) return; 4316 ASSERT(limit_result == CONTINUE); 4317 4318 RecursionCheck rc(compiler); 4319 4320 ASSERT_EQ(start_reg_ + 1, end_reg_); 4321 if (compiler->ignore_case()) { 4322 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 4323 trace->backtrack()); 4324 } else { 4325 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 4326 } 4327 on_success()->Emit(compiler, trace); 4328 } 4329 4330 4331 // ------------------------------------------------------------------- 4332 // Dot/dotty output 4333 4334 4335 #ifdef DEBUG 4336 4337 4338 class DotPrinter: public NodeVisitor { 4339 public: 4340 explicit DotPrinter(bool ignore_case) 4341 : ignore_case_(ignore_case), 4342 stream_(&alloc_) { } 4343 void PrintNode(const char* label, RegExpNode* node); 4344 void Visit(RegExpNode* node); 4345 void PrintAttributes(RegExpNode* from); 4346 StringStream* stream() { return &stream_; } 4347 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4348 #define DECLARE_VISIT(Type) \ 4349 virtual void Visit##Type(Type##Node* that); 4350 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4351 #undef DECLARE_VISIT 4352 private: 4353 bool ignore_case_; 4354 HeapStringAllocator alloc_; 4355 StringStream stream_; 4356 }; 4357 4358 4359 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4360 stream()->Add("digraph G {\n graph [label=\""); 4361 for (int i = 0; label[i]; i++) { 4362 switch (label[i]) { 4363 case '\\': 4364 stream()->Add("\\\\"); 4365 break; 4366 case '"': 4367 stream()->Add("\""); 4368 break; 4369 default: 4370 stream()->Put(label[i]); 4371 break; 4372 } 4373 } 4374 stream()->Add("\"];\n"); 4375 Visit(node); 4376 stream()->Add("}\n"); 4377 printf("%s", *(stream()->ToCString())); 4378 } 4379 4380 4381 void DotPrinter::Visit(RegExpNode* node) { 4382 if (node->info()->visited) return; 4383 node->info()->visited = true; 4384 node->Accept(this); 4385 } 4386 4387 4388 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4389 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 4390 Visit(on_failure); 4391 } 4392 4393 4394 class TableEntryBodyPrinter { 4395 public: 4396 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 4397 : stream_(stream), choice_(choice) { } 4398 void Call(uc16 from, DispatchTable::Entry entry) { 4399 OutSet* out_set = entry.out_set(); 4400 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4401 if (out_set->Get(i)) { 4402 stream()->Add(" n%p:s%io%i -> n%p;\n", 4403 choice(), 4404 from, 4405 i, 4406 choice()->alternatives()->at(i).node()); 4407 } 4408 } 4409 } 4410 private: 4411 StringStream* stream() { return stream_; } 4412 ChoiceNode* choice() { return choice_; } 4413 StringStream* stream_; 4414 ChoiceNode* choice_; 4415 }; 4416 4417 4418 class TableEntryHeaderPrinter { 4419 public: 4420 explicit TableEntryHeaderPrinter(StringStream* stream) 4421 : first_(true), stream_(stream) { } 4422 void Call(uc16 from, DispatchTable::Entry entry) { 4423 if (first_) { 4424 first_ = false; 4425 } else { 4426 stream()->Add("|"); 4427 } 4428 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 4429 OutSet* out_set = entry.out_set(); 4430 int priority = 0; 4431 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4432 if (out_set->Get(i)) { 4433 if (priority > 0) stream()->Add("|"); 4434 stream()->Add("<s%io%i> %i", from, i, priority); 4435 priority++; 4436 } 4437 } 4438 stream()->Add("}}"); 4439 } 4440 4441 private: 4442 bool first_; 4443 StringStream* stream() { return stream_; } 4444 StringStream* stream_; 4445 }; 4446 4447 4448 class AttributePrinter { 4449 public: 4450 explicit AttributePrinter(DotPrinter* out) 4451 : out_(out), first_(true) { } 4452 void PrintSeparator() { 4453 if (first_) { 4454 first_ = false; 4455 } else { 4456 out_->stream()->Add("|"); 4457 } 4458 } 4459 void PrintBit(const char* name, bool value) { 4460 if (!value) return; 4461 PrintSeparator(); 4462 out_->stream()->Add("{%s}", name); 4463 } 4464 void PrintPositive(const char* name, int value) { 4465 if (value < 0) return; 4466 PrintSeparator(); 4467 out_->stream()->Add("{%s|%x}", name, value); 4468 } 4469 private: 4470 DotPrinter* out_; 4471 bool first_; 4472 }; 4473 4474 4475 void DotPrinter::PrintAttributes(RegExpNode* that) { 4476 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 4477 "margin=0.1, fontsize=10, label=\"{", 4478 that); 4479 AttributePrinter printer(this); 4480 NodeInfo* info = that->info(); 4481 printer.PrintBit("NI", info->follows_newline_interest); 4482 printer.PrintBit("WI", info->follows_word_interest); 4483 printer.PrintBit("SI", info->follows_start_interest); 4484 Label* label = that->label(); 4485 if (label->is_bound()) 4486 printer.PrintPositive("@", label->pos()); 4487 stream()->Add("}\"];\n"); 4488 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 4489 "arrowhead=none];\n", that, that); 4490 } 4491 4492 4493 static const bool kPrintDispatchTable = false; 4494 void DotPrinter::VisitChoice(ChoiceNode* that) { 4495 if (kPrintDispatchTable) { 4496 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 4497 TableEntryHeaderPrinter header_printer(stream()); 4498 that->GetTable(ignore_case_)->ForEach(&header_printer); 4499 stream()->Add("\"]\n", that); 4500 PrintAttributes(that); 4501 TableEntryBodyPrinter body_printer(stream(), that); 4502 that->GetTable(ignore_case_)->ForEach(&body_printer); 4503 } else { 4504 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 4505 for (int i = 0; i < that->alternatives()->length(); i++) { 4506 GuardedAlternative alt = that->alternatives()->at(i); 4507 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 4508 } 4509 } 4510 for (int i = 0; i < that->alternatives()->length(); i++) { 4511 GuardedAlternative alt = that->alternatives()->at(i); 4512 alt.node()->Accept(this); 4513 } 4514 } 4515 4516 4517 void DotPrinter::VisitText(TextNode* that) { 4518 Zone* zone = that->zone(); 4519 stream()->Add(" n%p [label=\"", that); 4520 for (int i = 0; i < that->elements()->length(); i++) { 4521 if (i > 0) stream()->Add(" "); 4522 TextElement elm = that->elements()->at(i); 4523 switch (elm.text_type()) { 4524 case TextElement::ATOM: { 4525 stream()->Add("'%w'", elm.atom()->data()); 4526 break; 4527 } 4528 case TextElement::CHAR_CLASS: { 4529 RegExpCharacterClass* node = elm.char_class(); 4530 stream()->Add("["); 4531 if (node->is_negated()) 4532 stream()->Add("^"); 4533 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4534 CharacterRange range = node->ranges(zone)->at(j); 4535 stream()->Add("%k-%k", range.from(), range.to()); 4536 } 4537 stream()->Add("]"); 4538 break; 4539 } 4540 default: 4541 UNREACHABLE(); 4542 } 4543 } 4544 stream()->Add("\", shape=box, peripheries=2];\n"); 4545 PrintAttributes(that); 4546 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 4547 Visit(that->on_success()); 4548 } 4549 4550 4551 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4552 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 4553 that, 4554 that->start_register(), 4555 that->end_register()); 4556 PrintAttributes(that); 4557 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 4558 Visit(that->on_success()); 4559 } 4560 4561 4562 void DotPrinter::VisitEnd(EndNode* that) { 4563 stream()->Add(" n%p [style=bold, shape=point];\n", that); 4564 PrintAttributes(that); 4565 } 4566 4567 4568 void DotPrinter::VisitAssertion(AssertionNode* that) { 4569 stream()->Add(" n%p [", that); 4570 switch (that->assertion_type()) { 4571 case AssertionNode::AT_END: 4572 stream()->Add("label=\"$\", shape=septagon"); 4573 break; 4574 case AssertionNode::AT_START: 4575 stream()->Add("label=\"^\", shape=septagon"); 4576 break; 4577 case AssertionNode::AT_BOUNDARY: 4578 stream()->Add("label=\"\\b\", shape=septagon"); 4579 break; 4580 case AssertionNode::AT_NON_BOUNDARY: 4581 stream()->Add("label=\"\\B\", shape=septagon"); 4582 break; 4583 case AssertionNode::AFTER_NEWLINE: 4584 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 4585 break; 4586 } 4587 stream()->Add("];\n"); 4588 PrintAttributes(that); 4589 RegExpNode* successor = that->on_success(); 4590 stream()->Add(" n%p -> n%p;\n", that, successor); 4591 Visit(successor); 4592 } 4593 4594 4595 void DotPrinter::VisitAction(ActionNode* that) { 4596 stream()->Add(" n%p [", that); 4597 switch (that->action_type_) { 4598 case ActionNode::SET_REGISTER: 4599 stream()->Add("label=\"$%i:=%i\", shape=octagon", 4600 that->data_.u_store_register.reg, 4601 that->data_.u_store_register.value); 4602 break; 4603 case ActionNode::INCREMENT_REGISTER: 4604 stream()->Add("label=\"$%i++\", shape=octagon", 4605 that->data_.u_increment_register.reg); 4606 break; 4607 case ActionNode::STORE_POSITION: 4608 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 4609 that->data_.u_position_register.reg); 4610 break; 4611 case ActionNode::BEGIN_SUBMATCH: 4612 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 4613 that->data_.u_submatch.current_position_register); 4614 break; 4615 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4616 stream()->Add("label=\"escape\", shape=septagon"); 4617 break; 4618 case ActionNode::EMPTY_MATCH_CHECK: 4619 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 4620 that->data_.u_empty_match_check.start_register, 4621 that->data_.u_empty_match_check.repetition_register, 4622 that->data_.u_empty_match_check.repetition_limit); 4623 break; 4624 case ActionNode::CLEAR_CAPTURES: { 4625 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 4626 that->data_.u_clear_captures.range_from, 4627 that->data_.u_clear_captures.range_to); 4628 break; 4629 } 4630 } 4631 stream()->Add("];\n"); 4632 PrintAttributes(that); 4633 RegExpNode* successor = that->on_success(); 4634 stream()->Add(" n%p -> n%p;\n", that, successor); 4635 Visit(successor); 4636 } 4637 4638 4639 class DispatchTableDumper { 4640 public: 4641 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 4642 void Call(uc16 key, DispatchTable::Entry entry); 4643 StringStream* stream() { return stream_; } 4644 private: 4645 StringStream* stream_; 4646 }; 4647 4648 4649 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4650 stream()->Add("[%k-%k]: {", key, entry.to()); 4651 OutSet* set = entry.out_set(); 4652 bool first = true; 4653 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4654 if (set->Get(i)) { 4655 if (first) { 4656 first = false; 4657 } else { 4658 stream()->Add(", "); 4659 } 4660 stream()->Add("%i", i); 4661 } 4662 } 4663 stream()->Add("}\n"); 4664 } 4665 4666 4667 void DispatchTable::Dump() { 4668 HeapStringAllocator alloc; 4669 StringStream stream(&alloc); 4670 DispatchTableDumper dumper(&stream); 4671 tree()->ForEach(&dumper); 4672 OS::PrintError("%s", *stream.ToCString()); 4673 } 4674 4675 4676 void RegExpEngine::DotPrint(const char* label, 4677 RegExpNode* node, 4678 bool ignore_case) { 4679 DotPrinter printer(ignore_case); 4680 printer.PrintNode(label, node); 4681 } 4682 4683 4684 #endif // DEBUG 4685 4686 4687 // ------------------------------------------------------------------- 4688 // Tree to graph conversion 4689 4690 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4691 RegExpNode* on_success) { 4692 ZoneList<TextElement>* elms = 4693 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4694 elms->Add(TextElement::Atom(this), compiler->zone()); 4695 return new(compiler->zone()) TextNode(elms, on_success); 4696 } 4697 4698 4699 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4700 RegExpNode* on_success) { 4701 return new(compiler->zone()) TextNode(elements(), on_success); 4702 } 4703 4704 4705 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4706 const int* special_class, 4707 int length) { 4708 length--; // Remove final 0x10000. 4709 ASSERT(special_class[length] == 0x10000); 4710 ASSERT(ranges->length() != 0); 4711 ASSERT(length != 0); 4712 ASSERT(special_class[0] != 0); 4713 if (ranges->length() != (length >> 1) + 1) { 4714 return false; 4715 } 4716 CharacterRange range = ranges->at(0); 4717 if (range.from() != 0) { 4718 return false; 4719 } 4720 for (int i = 0; i < length; i += 2) { 4721 if (special_class[i] != (range.to() + 1)) { 4722 return false; 4723 } 4724 range = ranges->at((i >> 1) + 1); 4725 if (special_class[i+1] != range.from()) { 4726 return false; 4727 } 4728 } 4729 if (range.to() != 0xffff) { 4730 return false; 4731 } 4732 return true; 4733 } 4734 4735 4736 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4737 const int* special_class, 4738 int length) { 4739 length--; // Remove final 0x10000. 4740 ASSERT(special_class[length] == 0x10000); 4741 if (ranges->length() * 2 != length) { 4742 return false; 4743 } 4744 for (int i = 0; i < length; i += 2) { 4745 CharacterRange range = ranges->at(i >> 1); 4746 if (range.from() != special_class[i] || 4747 range.to() != special_class[i + 1] - 1) { 4748 return false; 4749 } 4750 } 4751 return true; 4752 } 4753 4754 4755 bool RegExpCharacterClass::is_standard(Zone* zone) { 4756 // TODO(lrn): Remove need for this function, by not throwing away information 4757 // along the way. 4758 if (is_negated_) { 4759 return false; 4760 } 4761 if (set_.is_standard()) { 4762 return true; 4763 } 4764 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4765 set_.set_standard_set_type('s'); 4766 return true; 4767 } 4768 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4769 set_.set_standard_set_type('S'); 4770 return true; 4771 } 4772 if (CompareInverseRanges(set_.ranges(zone), 4773 kLineTerminatorRanges, 4774 kLineTerminatorRangeCount)) { 4775 set_.set_standard_set_type('.'); 4776 return true; 4777 } 4778 if (CompareRanges(set_.ranges(zone), 4779 kLineTerminatorRanges, 4780 kLineTerminatorRangeCount)) { 4781 set_.set_standard_set_type('n'); 4782 return true; 4783 } 4784 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4785 set_.set_standard_set_type('w'); 4786 return true; 4787 } 4788 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4789 set_.set_standard_set_type('W'); 4790 return true; 4791 } 4792 return false; 4793 } 4794 4795 4796 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4797 RegExpNode* on_success) { 4798 return new(compiler->zone()) TextNode(this, on_success); 4799 } 4800 4801 4802 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4803 RegExpNode* on_success) { 4804 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4805 int length = alternatives->length(); 4806 ChoiceNode* result = 4807 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 4808 for (int i = 0; i < length; i++) { 4809 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 4810 on_success)); 4811 result->AddAlternative(alternative); 4812 } 4813 return result; 4814 } 4815 4816 4817 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 4818 RegExpNode* on_success) { 4819 return ToNode(min(), 4820 max(), 4821 is_greedy(), 4822 body(), 4823 compiler, 4824 on_success); 4825 } 4826 4827 4828 // Scoped object to keep track of how much we unroll quantifier loops in the 4829 // regexp graph generator. 4830 class RegExpExpansionLimiter { 4831 public: 4832 static const int kMaxExpansionFactor = 6; 4833 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 4834 : compiler_(compiler), 4835 saved_expansion_factor_(compiler->current_expansion_factor()), 4836 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 4837 ASSERT(factor > 0); 4838 if (ok_to_expand_) { 4839 if (factor > kMaxExpansionFactor) { 4840 // Avoid integer overflow of the current expansion factor. 4841 ok_to_expand_ = false; 4842 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 4843 } else { 4844 int new_factor = saved_expansion_factor_ * factor; 4845 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 4846 compiler->set_current_expansion_factor(new_factor); 4847 } 4848 } 4849 } 4850 4851 ~RegExpExpansionLimiter() { 4852 compiler_->set_current_expansion_factor(saved_expansion_factor_); 4853 } 4854 4855 bool ok_to_expand() { return ok_to_expand_; } 4856 4857 private: 4858 RegExpCompiler* compiler_; 4859 int saved_expansion_factor_; 4860 bool ok_to_expand_; 4861 4862 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 4863 }; 4864 4865 4866 RegExpNode* RegExpQuantifier::ToNode(int min, 4867 int max, 4868 bool is_greedy, 4869 RegExpTree* body, 4870 RegExpCompiler* compiler, 4871 RegExpNode* on_success, 4872 bool not_at_start) { 4873 // x{f, t} becomes this: 4874 // 4875 // (r++)<-. 4876 // | ` 4877 // | (x) 4878 // v ^ 4879 // (r=0)-->(?)---/ [if r < t] 4880 // | 4881 // [if r >= f] \----> ... 4882 // 4883 4884 // 15.10.2.5 RepeatMatcher algorithm. 4885 // The parser has already eliminated the case where max is 0. In the case 4886 // where max_match is zero the parser has removed the quantifier if min was 4887 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 4888 4889 // If we know that we cannot match zero length then things are a little 4890 // simpler since we don't need to make the special zero length match check 4891 // from step 2.1. If the min and max are small we can unroll a little in 4892 // this case. 4893 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 4894 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 4895 if (max == 0) return on_success; // This can happen due to recursion. 4896 bool body_can_be_empty = (body->min_match() == 0); 4897 int body_start_reg = RegExpCompiler::kNoRegister; 4898 Interval capture_registers = body->CaptureRegisters(); 4899 bool needs_capture_clearing = !capture_registers.is_empty(); 4900 Zone* zone = compiler->zone(); 4901 4902 if (body_can_be_empty) { 4903 body_start_reg = compiler->AllocateRegister(); 4904 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 4905 // Only unroll if there are no captures and the body can't be 4906 // empty. 4907 { 4908 RegExpExpansionLimiter limiter( 4909 compiler, min + ((max != min) ? 1 : 0)); 4910 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 4911 int new_max = (max == kInfinity) ? max : max - min; 4912 // Recurse once to get the loop or optional matches after the fixed 4913 // ones. 4914 RegExpNode* answer = ToNode( 4915 0, new_max, is_greedy, body, compiler, on_success, true); 4916 // Unroll the forced matches from 0 to min. This can cause chains of 4917 // TextNodes (which the parser does not generate). These should be 4918 // combined if it turns out they hinder good code generation. 4919 for (int i = 0; i < min; i++) { 4920 answer = body->ToNode(compiler, answer); 4921 } 4922 return answer; 4923 } 4924 } 4925 if (max <= kMaxUnrolledMaxMatches && min == 0) { 4926 ASSERT(max > 0); // Due to the 'if' above. 4927 RegExpExpansionLimiter limiter(compiler, max); 4928 if (limiter.ok_to_expand()) { 4929 // Unroll the optional matches up to max. 4930 RegExpNode* answer = on_success; 4931 for (int i = 0; i < max; i++) { 4932 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 4933 if (is_greedy) { 4934 alternation->AddAlternative( 4935 GuardedAlternative(body->ToNode(compiler, answer))); 4936 alternation->AddAlternative(GuardedAlternative(on_success)); 4937 } else { 4938 alternation->AddAlternative(GuardedAlternative(on_success)); 4939 alternation->AddAlternative( 4940 GuardedAlternative(body->ToNode(compiler, answer))); 4941 } 4942 answer = alternation; 4943 if (not_at_start) alternation->set_not_at_start(); 4944 } 4945 return answer; 4946 } 4947 } 4948 } 4949 bool has_min = min > 0; 4950 bool has_max = max < RegExpTree::kInfinity; 4951 bool needs_counter = has_min || has_max; 4952 int reg_ctr = needs_counter 4953 ? compiler->AllocateRegister() 4954 : RegExpCompiler::kNoRegister; 4955 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0, 4956 zone); 4957 if (not_at_start) center->set_not_at_start(); 4958 RegExpNode* loop_return = needs_counter 4959 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 4960 : static_cast<RegExpNode*>(center); 4961 if (body_can_be_empty) { 4962 // If the body can be empty we need to check if it was and then 4963 // backtrack. 4964 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 4965 reg_ctr, 4966 min, 4967 loop_return); 4968 } 4969 RegExpNode* body_node = body->ToNode(compiler, loop_return); 4970 if (body_can_be_empty) { 4971 // If the body can be empty we need to store the start position 4972 // so we can bail out if it was empty. 4973 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 4974 } 4975 if (needs_capture_clearing) { 4976 // Before entering the body of this loop we need to clear captures. 4977 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 4978 } 4979 GuardedAlternative body_alt(body_node); 4980 if (has_max) { 4981 Guard* body_guard = 4982 new(zone) Guard(reg_ctr, Guard::LT, max); 4983 body_alt.AddGuard(body_guard, zone); 4984 } 4985 GuardedAlternative rest_alt(on_success); 4986 if (has_min) { 4987 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 4988 rest_alt.AddGuard(rest_guard, zone); 4989 } 4990 if (is_greedy) { 4991 center->AddLoopAlternative(body_alt); 4992 center->AddContinueAlternative(rest_alt); 4993 } else { 4994 center->AddContinueAlternative(rest_alt); 4995 center->AddLoopAlternative(body_alt); 4996 } 4997 if (needs_counter) { 4998 return ActionNode::SetRegister(reg_ctr, 0, center); 4999 } else { 5000 return center; 5001 } 5002 } 5003 5004 5005 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5006 RegExpNode* on_success) { 5007 NodeInfo info; 5008 Zone* zone = compiler->zone(); 5009 5010 switch (assertion_type()) { 5011 case START_OF_LINE: 5012 return AssertionNode::AfterNewline(on_success); 5013 case START_OF_INPUT: 5014 return AssertionNode::AtStart(on_success); 5015 case BOUNDARY: 5016 return AssertionNode::AtBoundary(on_success); 5017 case NON_BOUNDARY: 5018 return AssertionNode::AtNonBoundary(on_success); 5019 case END_OF_INPUT: 5020 return AssertionNode::AtEnd(on_success); 5021 case END_OF_LINE: { 5022 // Compile $ in multiline regexps as an alternation with a positive 5023 // lookahead in one side and an end-of-input on the other side. 5024 // We need two registers for the lookahead. 5025 int stack_pointer_register = compiler->AllocateRegister(); 5026 int position_register = compiler->AllocateRegister(); 5027 // The ChoiceNode to distinguish between a newline and end-of-input. 5028 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5029 // Create a newline atom. 5030 ZoneList<CharacterRange>* newline_ranges = 5031 new(zone) ZoneList<CharacterRange>(3, zone); 5032 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5033 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n'); 5034 TextNode* newline_matcher = new(zone) TextNode( 5035 newline_atom, 5036 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5037 position_register, 5038 0, // No captures inside. 5039 -1, // Ignored if no captures. 5040 on_success)); 5041 // Create an end-of-input matcher. 5042 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5043 stack_pointer_register, 5044 position_register, 5045 newline_matcher); 5046 // Add the two alternatives to the ChoiceNode. 5047 GuardedAlternative eol_alternative(end_of_line); 5048 result->AddAlternative(eol_alternative); 5049 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5050 result->AddAlternative(end_alternative); 5051 return result; 5052 } 5053 default: 5054 UNREACHABLE(); 5055 } 5056 return on_success; 5057 } 5058 5059 5060 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5061 RegExpNode* on_success) { 5062 return new(compiler->zone()) 5063 BackReferenceNode(RegExpCapture::StartRegister(index()), 5064 RegExpCapture::EndRegister(index()), 5065 on_success); 5066 } 5067 5068 5069 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5070 RegExpNode* on_success) { 5071 return on_success; 5072 } 5073 5074 5075 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 5076 RegExpNode* on_success) { 5077 int stack_pointer_register = compiler->AllocateRegister(); 5078 int position_register = compiler->AllocateRegister(); 5079 5080 const int registers_per_capture = 2; 5081 const int register_of_first_capture = 2; 5082 int register_count = capture_count_ * registers_per_capture; 5083 int register_start = 5084 register_of_first_capture + capture_from_ * registers_per_capture; 5085 5086 RegExpNode* success; 5087 if (is_positive()) { 5088 RegExpNode* node = ActionNode::BeginSubmatch( 5089 stack_pointer_register, 5090 position_register, 5091 body()->ToNode( 5092 compiler, 5093 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5094 position_register, 5095 register_count, 5096 register_start, 5097 on_success))); 5098 return node; 5099 } else { 5100 // We use a ChoiceNode for a negative lookahead because it has most of 5101 // the characteristics we need. It has the body of the lookahead as its 5102 // first alternative and the expression after the lookahead of the second 5103 // alternative. If the first alternative succeeds then the 5104 // NegativeSubmatchSuccess will unwind the stack including everything the 5105 // choice node set up and backtrack. If the first alternative fails then 5106 // the second alternative is tried, which is exactly the desired result 5107 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 5108 // ChoiceNode that knows to ignore the first exit when calculating quick 5109 // checks. 5110 Zone* zone = compiler->zone(); 5111 5112 GuardedAlternative body_alt( 5113 body()->ToNode( 5114 compiler, 5115 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register, 5116 position_register, 5117 register_count, 5118 register_start, 5119 zone))); 5120 ChoiceNode* choice_node = 5121 new(zone) NegativeLookaheadChoiceNode(body_alt, 5122 GuardedAlternative(on_success), 5123 zone); 5124 return ActionNode::BeginSubmatch(stack_pointer_register, 5125 position_register, 5126 choice_node); 5127 } 5128 } 5129 5130 5131 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5132 RegExpNode* on_success) { 5133 return ToNode(body(), index(), compiler, on_success); 5134 } 5135 5136 5137 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5138 int index, 5139 RegExpCompiler* compiler, 5140 RegExpNode* on_success) { 5141 int start_reg = RegExpCapture::StartRegister(index); 5142 int end_reg = RegExpCapture::EndRegister(index); 5143 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5144 RegExpNode* body_node = body->ToNode(compiler, store_end); 5145 return ActionNode::StorePosition(start_reg, true, body_node); 5146 } 5147 5148 5149 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5150 RegExpNode* on_success) { 5151 ZoneList<RegExpTree*>* children = nodes(); 5152 RegExpNode* current = on_success; 5153 for (int i = children->length() - 1; i >= 0; i--) { 5154 current = children->at(i)->ToNode(compiler, current); 5155 } 5156 return current; 5157 } 5158 5159 5160 static void AddClass(const int* elmv, 5161 int elmc, 5162 ZoneList<CharacterRange>* ranges, 5163 Zone* zone) { 5164 elmc--; 5165 ASSERT(elmv[elmc] == 0x10000); 5166 for (int i = 0; i < elmc; i += 2) { 5167 ASSERT(elmv[i] < elmv[i + 1]); 5168 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone); 5169 } 5170 } 5171 5172 5173 static void AddClassNegated(const int *elmv, 5174 int elmc, 5175 ZoneList<CharacterRange>* ranges, 5176 Zone* zone) { 5177 elmc--; 5178 ASSERT(elmv[elmc] == 0x10000); 5179 ASSERT(elmv[0] != 0x0000); 5180 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 5181 uc16 last = 0x0000; 5182 for (int i = 0; i < elmc; i += 2) { 5183 ASSERT(last <= elmv[i] - 1); 5184 ASSERT(elmv[i] < elmv[i + 1]); 5185 ranges->Add(CharacterRange(last, elmv[i] - 1), zone); 5186 last = elmv[i + 1]; 5187 } 5188 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone); 5189 } 5190 5191 5192 void CharacterRange::AddClassEscape(uc16 type, 5193 ZoneList<CharacterRange>* ranges, 5194 Zone* zone) { 5195 switch (type) { 5196 case 's': 5197 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5198 break; 5199 case 'S': 5200 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5201 break; 5202 case 'w': 5203 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5204 break; 5205 case 'W': 5206 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5207 break; 5208 case 'd': 5209 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5210 break; 5211 case 'D': 5212 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5213 break; 5214 case '.': 5215 AddClassNegated(kLineTerminatorRanges, 5216 kLineTerminatorRangeCount, 5217 ranges, 5218 zone); 5219 break; 5220 // This is not a character range as defined by the spec but a 5221 // convenient shorthand for a character class that matches any 5222 // character. 5223 case '*': 5224 ranges->Add(CharacterRange::Everything(), zone); 5225 break; 5226 // This is the set of characters matched by the $ and ^ symbols 5227 // in multiline mode. 5228 case 'n': 5229 AddClass(kLineTerminatorRanges, 5230 kLineTerminatorRangeCount, 5231 ranges, 5232 zone); 5233 break; 5234 default: 5235 UNREACHABLE(); 5236 } 5237 } 5238 5239 5240 Vector<const int> CharacterRange::GetWordBounds() { 5241 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5242 } 5243 5244 5245 class CharacterRangeSplitter { 5246 public: 5247 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 5248 ZoneList<CharacterRange>** excluded, 5249 Zone* zone) 5250 : included_(included), 5251 excluded_(excluded), 5252 zone_(zone) { } 5253 void Call(uc16 from, DispatchTable::Entry entry); 5254 5255 static const int kInBase = 0; 5256 static const int kInOverlay = 1; 5257 5258 private: 5259 ZoneList<CharacterRange>** included_; 5260 ZoneList<CharacterRange>** excluded_; 5261 Zone* zone_; 5262 }; 5263 5264 5265 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 5266 if (!entry.out_set()->Get(kInBase)) return; 5267 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 5268 ? included_ 5269 : excluded_; 5270 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_); 5271 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_); 5272 } 5273 5274 5275 void CharacterRange::Split(ZoneList<CharacterRange>* base, 5276 Vector<const int> overlay, 5277 ZoneList<CharacterRange>** included, 5278 ZoneList<CharacterRange>** excluded, 5279 Zone* zone) { 5280 ASSERT_EQ(NULL, *included); 5281 ASSERT_EQ(NULL, *excluded); 5282 DispatchTable table(zone); 5283 for (int i = 0; i < base->length(); i++) 5284 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone); 5285 for (int i = 0; i < overlay.length(); i += 2) { 5286 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), 5287 CharacterRangeSplitter::kInOverlay, zone); 5288 } 5289 CharacterRangeSplitter callback(included, excluded, zone); 5290 table.ForEach(&callback); 5291 } 5292 5293 5294 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 5295 bool is_ascii, 5296 Zone* zone) { 5297 Isolate* isolate = zone->isolate(); 5298 uc16 bottom = from(); 5299 uc16 top = to(); 5300 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { 5301 if (bottom > String::kMaxOneByteCharCode) return; 5302 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5303 } 5304 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5305 if (top == bottom) { 5306 // If this is a singleton we just expand the one character. 5307 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5308 for (int i = 0; i < length; i++) { 5309 uc32 chr = chars[i]; 5310 if (chr != bottom) { 5311 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5312 } 5313 } 5314 } else { 5315 // If this is a range we expand the characters block by block, 5316 // expanding contiguous subranges (blocks) one at a time. 5317 // The approach is as follows. For a given start character we 5318 // look up the remainder of the block that contains it (represented 5319 // by the end point), for instance we find 'z' if the character 5320 // is 'c'. A block is characterized by the property 5321 // that all characters uncanonicalize in the same way, except that 5322 // each entry in the result is incremented by the distance from the first 5323 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 5324 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 5325 // Once we've found the end point we look up its uncanonicalization 5326 // and produce a range for each element. For instance for [c-f] 5327 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 5328 // add a range if it is not already contained in the input, so [c-f] 5329 // will be skipped but [C-F] will be added. If this range is not 5330 // completely contained in a block we do this for all the blocks 5331 // covered by the range (handling characters that is not in a block 5332 // as a "singleton block"). 5333 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5334 int pos = bottom; 5335 while (pos <= top) { 5336 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 5337 uc16 block_end; 5338 if (length == 0) { 5339 block_end = pos; 5340 } else { 5341 ASSERT_EQ(1, length); 5342 block_end = range[0]; 5343 } 5344 int end = (block_end > top) ? top : block_end; 5345 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 5346 for (int i = 0; i < length; i++) { 5347 uc32 c = range[i]; 5348 uc16 range_from = c - (block_end - pos); 5349 uc16 range_to = c - (block_end - end); 5350 if (!(bottom <= range_from && range_to <= top)) { 5351 ranges->Add(CharacterRange(range_from, range_to), zone); 5352 } 5353 } 5354 pos = end + 1; 5355 } 5356 } 5357 } 5358 5359 5360 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5361 ASSERT_NOT_NULL(ranges); 5362 int n = ranges->length(); 5363 if (n <= 1) return true; 5364 int max = ranges->at(0).to(); 5365 for (int i = 1; i < n; i++) { 5366 CharacterRange next_range = ranges->at(i); 5367 if (next_range.from() <= max + 1) return false; 5368 max = next_range.to(); 5369 } 5370 return true; 5371 } 5372 5373 5374 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5375 if (ranges_ == NULL) { 5376 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5377 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5378 } 5379 return ranges_; 5380 } 5381 5382 5383 // Move a number of elements in a zonelist to another position 5384 // in the same list. Handles overlapping source and target areas. 5385 static void MoveRanges(ZoneList<CharacterRange>* list, 5386 int from, 5387 int to, 5388 int count) { 5389 // Ranges are potentially overlapping. 5390 if (from < to) { 5391 for (int i = count - 1; i >= 0; i--) { 5392 list->at(to + i) = list->at(from + i); 5393 } 5394 } else { 5395 for (int i = 0; i < count; i++) { 5396 list->at(to + i) = list->at(from + i); 5397 } 5398 } 5399 } 5400 5401 5402 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5403 int count, 5404 CharacterRange insert) { 5405 // Inserts a range into list[0..count[, which must be sorted 5406 // by from value and non-overlapping and non-adjacent, using at most 5407 // list[0..count] for the result. Returns the number of resulting 5408 // canonicalized ranges. Inserting a range may collapse existing ranges into 5409 // fewer ranges, so the return value can be anything in the range 1..count+1. 5410 uc16 from = insert.from(); 5411 uc16 to = insert.to(); 5412 int start_pos = 0; 5413 int end_pos = count; 5414 for (int i = count - 1; i >= 0; i--) { 5415 CharacterRange current = list->at(i); 5416 if (current.from() > to + 1) { 5417 end_pos = i; 5418 } else if (current.to() + 1 < from) { 5419 start_pos = i + 1; 5420 break; 5421 } 5422 } 5423 5424 // Inserted range overlaps, or is adjacent to, ranges at positions 5425 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 5426 // not affected by the insertion. 5427 // If start_pos == end_pos, the range must be inserted before start_pos. 5428 // if start_pos < end_pos, the entire range from start_pos to end_pos 5429 // must be merged with the insert range. 5430 5431 if (start_pos == end_pos) { 5432 // Insert between existing ranges at position start_pos. 5433 if (start_pos < count) { 5434 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 5435 } 5436 list->at(start_pos) = insert; 5437 return count + 1; 5438 } 5439 if (start_pos + 1 == end_pos) { 5440 // Replace single existing range at position start_pos. 5441 CharacterRange to_replace = list->at(start_pos); 5442 int new_from = Min(to_replace.from(), from); 5443 int new_to = Max(to_replace.to(), to); 5444 list->at(start_pos) = CharacterRange(new_from, new_to); 5445 return count; 5446 } 5447 // Replace a number of existing ranges from start_pos to end_pos - 1. 5448 // Move the remaining ranges down. 5449 5450 int new_from = Min(list->at(start_pos).from(), from); 5451 int new_to = Max(list->at(end_pos - 1).to(), to); 5452 if (end_pos < count) { 5453 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 5454 } 5455 list->at(start_pos) = CharacterRange(new_from, new_to); 5456 return count - (end_pos - start_pos) + 1; 5457 } 5458 5459 5460 void CharacterSet::Canonicalize() { 5461 // Special/default classes are always considered canonical. The result 5462 // of calling ranges() will be sorted. 5463 if (ranges_ == NULL) return; 5464 CharacterRange::Canonicalize(ranges_); 5465 } 5466 5467 5468 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 5469 if (character_ranges->length() <= 1) return; 5470 // Check whether ranges are already canonical (increasing, non-overlapping, 5471 // non-adjacent). 5472 int n = character_ranges->length(); 5473 int max = character_ranges->at(0).to(); 5474 int i = 1; 5475 while (i < n) { 5476 CharacterRange current = character_ranges->at(i); 5477 if (current.from() <= max + 1) { 5478 break; 5479 } 5480 max = current.to(); 5481 i++; 5482 } 5483 // Canonical until the i'th range. If that's all of them, we are done. 5484 if (i == n) return; 5485 5486 // The ranges at index i and forward are not canonicalized. Make them so by 5487 // doing the equivalent of insertion sort (inserting each into the previous 5488 // list, in order). 5489 // Notice that inserting a range can reduce the number of ranges in the 5490 // result due to combining of adjacent and overlapping ranges. 5491 int read = i; // Range to insert. 5492 int num_canonical = i; // Length of canonicalized part of list. 5493 do { 5494 num_canonical = InsertRangeInCanonicalList(character_ranges, 5495 num_canonical, 5496 character_ranges->at(read)); 5497 read++; 5498 } while (read < n); 5499 character_ranges->Rewind(num_canonical); 5500 5501 ASSERT(CharacterRange::IsCanonical(character_ranges)); 5502 } 5503 5504 5505 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5506 ZoneList<CharacterRange>* negated_ranges, 5507 Zone* zone) { 5508 ASSERT(CharacterRange::IsCanonical(ranges)); 5509 ASSERT_EQ(0, negated_ranges->length()); 5510 int range_count = ranges->length(); 5511 uc16 from = 0; 5512 int i = 0; 5513 if (range_count > 0 && ranges->at(0).from() == 0) { 5514 from = ranges->at(0).to(); 5515 i = 1; 5516 } 5517 while (i < range_count) { 5518 CharacterRange range = ranges->at(i); 5519 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone); 5520 from = range.to(); 5521 i++; 5522 } 5523 if (from < String::kMaxUtf16CodeUnit) { 5524 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit), 5525 zone); 5526 } 5527 } 5528 5529 5530 // ------------------------------------------------------------------- 5531 // Splay tree 5532 5533 5534 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 5535 if (Get(value)) 5536 return this; 5537 if (successors(zone) != NULL) { 5538 for (int i = 0; i < successors(zone)->length(); i++) { 5539 OutSet* successor = successors(zone)->at(i); 5540 if (successor->Get(value)) 5541 return successor; 5542 } 5543 } else { 5544 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 5545 } 5546 OutSet* result = new(zone) OutSet(first_, remaining_); 5547 result->Set(value, zone); 5548 successors(zone)->Add(result, zone); 5549 return result; 5550 } 5551 5552 5553 void OutSet::Set(unsigned value, Zone *zone) { 5554 if (value < kFirstLimit) { 5555 first_ |= (1 << value); 5556 } else { 5557 if (remaining_ == NULL) 5558 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 5559 if (remaining_->is_empty() || !remaining_->Contains(value)) 5560 remaining_->Add(value, zone); 5561 } 5562 } 5563 5564 5565 bool OutSet::Get(unsigned value) { 5566 if (value < kFirstLimit) { 5567 return (first_ & (1 << value)) != 0; 5568 } else if (remaining_ == NULL) { 5569 return false; 5570 } else { 5571 return remaining_->Contains(value); 5572 } 5573 } 5574 5575 5576 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 5577 5578 5579 void DispatchTable::AddRange(CharacterRange full_range, int value, 5580 Zone* zone) { 5581 CharacterRange current = full_range; 5582 if (tree()->is_empty()) { 5583 // If this is the first range we just insert into the table. 5584 ZoneSplayTree<Config>::Locator loc; 5585 ASSERT_RESULT(tree()->Insert(current.from(), &loc)); 5586 loc.set_value(Entry(current.from(), current.to(), 5587 empty()->Extend(value, zone))); 5588 return; 5589 } 5590 // First see if there is a range to the left of this one that 5591 // overlaps. 5592 ZoneSplayTree<Config>::Locator loc; 5593 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 5594 Entry* entry = &loc.value(); 5595 // If we've found a range that overlaps with this one, and it 5596 // starts strictly to the left of this one, we have to fix it 5597 // because the following code only handles ranges that start on 5598 // or after the start point of the range we're adding. 5599 if (entry->from() < current.from() && entry->to() >= current.from()) { 5600 // Snap the overlapping range in half around the start point of 5601 // the range we're adding. 5602 CharacterRange left(entry->from(), current.from() - 1); 5603 CharacterRange right(current.from(), entry->to()); 5604 // The left part of the overlapping range doesn't overlap. 5605 // Truncate the whole entry to be just the left part. 5606 entry->set_to(left.to()); 5607 // The right part is the one that overlaps. We add this part 5608 // to the map and let the next step deal with merging it with 5609 // the range we're adding. 5610 ZoneSplayTree<Config>::Locator loc; 5611 ASSERT_RESULT(tree()->Insert(right.from(), &loc)); 5612 loc.set_value(Entry(right.from(), 5613 right.to(), 5614 entry->out_set())); 5615 } 5616 } 5617 while (current.is_valid()) { 5618 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 5619 (loc.value().from() <= current.to()) && 5620 (loc.value().to() >= current.from())) { 5621 Entry* entry = &loc.value(); 5622 // We have overlap. If there is space between the start point of 5623 // the range we're adding and where the overlapping range starts 5624 // then we have to add a range covering just that space. 5625 if (current.from() < entry->from()) { 5626 ZoneSplayTree<Config>::Locator ins; 5627 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 5628 ins.set_value(Entry(current.from(), 5629 entry->from() - 1, 5630 empty()->Extend(value, zone))); 5631 current.set_from(entry->from()); 5632 } 5633 ASSERT_EQ(current.from(), entry->from()); 5634 // If the overlapping range extends beyond the one we want to add 5635 // we have to snap the right part off and add it separately. 5636 if (entry->to() > current.to()) { 5637 ZoneSplayTree<Config>::Locator ins; 5638 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); 5639 ins.set_value(Entry(current.to() + 1, 5640 entry->to(), 5641 entry->out_set())); 5642 entry->set_to(current.to()); 5643 } 5644 ASSERT(entry->to() <= current.to()); 5645 // The overlapping range is now completely contained by the range 5646 // we're adding so we can just update it and move the start point 5647 // of the range we're adding just past it. 5648 entry->AddValue(value, zone); 5649 // Bail out if the last interval ended at 0xFFFF since otherwise 5650 // adding 1 will wrap around to 0. 5651 if (entry->to() == String::kMaxUtf16CodeUnit) 5652 break; 5653 ASSERT(entry->to() + 1 > current.from()); 5654 current.set_from(entry->to() + 1); 5655 } else { 5656 // There is no overlap so we can just add the range 5657 ZoneSplayTree<Config>::Locator ins; 5658 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 5659 ins.set_value(Entry(current.from(), 5660 current.to(), 5661 empty()->Extend(value, zone))); 5662 break; 5663 } 5664 } 5665 } 5666 5667 5668 OutSet* DispatchTable::Get(uc16 value) { 5669 ZoneSplayTree<Config>::Locator loc; 5670 if (!tree()->FindGreatestLessThan(value, &loc)) 5671 return empty(); 5672 Entry* entry = &loc.value(); 5673 if (value <= entry->to()) 5674 return entry->out_set(); 5675 else 5676 return empty(); 5677 } 5678 5679 5680 // ------------------------------------------------------------------- 5681 // Analysis 5682 5683 5684 void Analysis::EnsureAnalyzed(RegExpNode* that) { 5685 StackLimitCheck check(that->zone()->isolate()); 5686 if (check.HasOverflowed()) { 5687 fail("Stack overflow"); 5688 return; 5689 } 5690 if (that->info()->been_analyzed || that->info()->being_analyzed) 5691 return; 5692 that->info()->being_analyzed = true; 5693 that->Accept(this); 5694 that->info()->being_analyzed = false; 5695 that->info()->been_analyzed = true; 5696 } 5697 5698 5699 void Analysis::VisitEnd(EndNode* that) { 5700 // nothing to do 5701 } 5702 5703 5704 void TextNode::CalculateOffsets() { 5705 int element_count = elements()->length(); 5706 // Set up the offsets of the elements relative to the start. This is a fixed 5707 // quantity since a TextNode can only contain fixed-width things. 5708 int cp_offset = 0; 5709 for (int i = 0; i < element_count; i++) { 5710 TextElement& elm = elements()->at(i); 5711 elm.set_cp_offset(cp_offset); 5712 cp_offset += elm.length(); 5713 } 5714 } 5715 5716 5717 void Analysis::VisitText(TextNode* that) { 5718 if (ignore_case_) { 5719 that->MakeCaseIndependent(is_ascii_); 5720 } 5721 EnsureAnalyzed(that->on_success()); 5722 if (!has_failed()) { 5723 that->CalculateOffsets(); 5724 } 5725 } 5726 5727 5728 void Analysis::VisitAction(ActionNode* that) { 5729 RegExpNode* target = that->on_success(); 5730 EnsureAnalyzed(target); 5731 if (!has_failed()) { 5732 // If the next node is interested in what it follows then this node 5733 // has to be interested too so it can pass the information on. 5734 that->info()->AddFromFollowing(target->info()); 5735 } 5736 } 5737 5738 5739 void Analysis::VisitChoice(ChoiceNode* that) { 5740 NodeInfo* info = that->info(); 5741 for (int i = 0; i < that->alternatives()->length(); i++) { 5742 RegExpNode* node = that->alternatives()->at(i).node(); 5743 EnsureAnalyzed(node); 5744 if (has_failed()) return; 5745 // Anything the following nodes need to know has to be known by 5746 // this node also, so it can pass it on. 5747 info->AddFromFollowing(node->info()); 5748 } 5749 } 5750 5751 5752 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 5753 NodeInfo* info = that->info(); 5754 for (int i = 0; i < that->alternatives()->length(); i++) { 5755 RegExpNode* node = that->alternatives()->at(i).node(); 5756 if (node != that->loop_node()) { 5757 EnsureAnalyzed(node); 5758 if (has_failed()) return; 5759 info->AddFromFollowing(node->info()); 5760 } 5761 } 5762 // Check the loop last since it may need the value of this node 5763 // to get a correct result. 5764 EnsureAnalyzed(that->loop_node()); 5765 if (!has_failed()) { 5766 info->AddFromFollowing(that->loop_node()->info()); 5767 } 5768 } 5769 5770 5771 void Analysis::VisitBackReference(BackReferenceNode* that) { 5772 EnsureAnalyzed(that->on_success()); 5773 } 5774 5775 5776 void Analysis::VisitAssertion(AssertionNode* that) { 5777 EnsureAnalyzed(that->on_success()); 5778 } 5779 5780 5781 void BackReferenceNode::FillInBMInfo(int offset, 5782 int budget, 5783 BoyerMooreLookahead* bm, 5784 bool not_at_start) { 5785 // Working out the set of characters that a backreference can match is too 5786 // hard, so we just say that any character can match. 5787 bm->SetRest(offset); 5788 SaveBMInfo(bm, not_at_start, offset); 5789 } 5790 5791 5792 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 5793 RegExpMacroAssembler::kTableSize); 5794 5795 5796 void ChoiceNode::FillInBMInfo(int offset, 5797 int budget, 5798 BoyerMooreLookahead* bm, 5799 bool not_at_start) { 5800 ZoneList<GuardedAlternative>* alts = alternatives(); 5801 budget = (budget - 1) / alts->length(); 5802 for (int i = 0; i < alts->length(); i++) { 5803 GuardedAlternative& alt = alts->at(i); 5804 if (alt.guards() != NULL && alt.guards()->length() != 0) { 5805 bm->SetRest(offset); // Give up trying to fill in info. 5806 SaveBMInfo(bm, not_at_start, offset); 5807 return; 5808 } 5809 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start); 5810 } 5811 SaveBMInfo(bm, not_at_start, offset); 5812 } 5813 5814 5815 void TextNode::FillInBMInfo(int initial_offset, 5816 int budget, 5817 BoyerMooreLookahead* bm, 5818 bool not_at_start) { 5819 if (initial_offset >= bm->length()) return; 5820 int offset = initial_offset; 5821 int max_char = bm->max_char(); 5822 for (int i = 0; i < elements()->length(); i++) { 5823 if (offset >= bm->length()) { 5824 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5825 return; 5826 } 5827 TextElement text = elements()->at(i); 5828 if (text.text_type() == TextElement::ATOM) { 5829 RegExpAtom* atom = text.atom(); 5830 for (int j = 0; j < atom->length(); j++, offset++) { 5831 if (offset >= bm->length()) { 5832 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5833 return; 5834 } 5835 uc16 character = atom->data()[j]; 5836 if (bm->compiler()->ignore_case()) { 5837 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5838 int length = GetCaseIndependentLetters( 5839 Isolate::Current(), 5840 character, 5841 bm->max_char() == String::kMaxOneByteCharCode, 5842 chars); 5843 for (int j = 0; j < length; j++) { 5844 bm->Set(offset, chars[j]); 5845 } 5846 } else { 5847 if (character <= max_char) bm->Set(offset, character); 5848 } 5849 } 5850 } else { 5851 ASSERT_EQ(TextElement::CHAR_CLASS, text.text_type()); 5852 RegExpCharacterClass* char_class = text.char_class(); 5853 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 5854 if (char_class->is_negated()) { 5855 bm->SetAll(offset); 5856 } else { 5857 for (int k = 0; k < ranges->length(); k++) { 5858 CharacterRange& range = ranges->at(k); 5859 if (range.from() > max_char) continue; 5860 int to = Min(max_char, static_cast<int>(range.to())); 5861 bm->SetInterval(offset, Interval(range.from(), to)); 5862 } 5863 } 5864 offset++; 5865 } 5866 } 5867 if (offset >= bm->length()) { 5868 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5869 return; 5870 } 5871 on_success()->FillInBMInfo(offset, 5872 budget - 1, 5873 bm, 5874 true); // Not at start after a text node. 5875 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5876 } 5877 5878 5879 // ------------------------------------------------------------------- 5880 // Dispatch table construction 5881 5882 5883 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5884 AddRange(CharacterRange::Everything()); 5885 } 5886 5887 5888 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5889 node->set_being_calculated(true); 5890 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5891 for (int i = 0; i < alternatives->length(); i++) { 5892 set_choice_index(i); 5893 alternatives->at(i).node()->Accept(this); 5894 } 5895 node->set_being_calculated(false); 5896 } 5897 5898 5899 class AddDispatchRange { 5900 public: 5901 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5902 : constructor_(constructor) { } 5903 void Call(uc32 from, DispatchTable::Entry entry); 5904 private: 5905 DispatchTableConstructor* constructor_; 5906 }; 5907 5908 5909 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5910 CharacterRange range(from, entry.to()); 5911 constructor_->AddRange(range); 5912 } 5913 5914 5915 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5916 if (node->being_calculated()) 5917 return; 5918 DispatchTable* table = node->GetTable(ignore_case_); 5919 AddDispatchRange adder(this); 5920 table->ForEach(&adder); 5921 } 5922 5923 5924 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5925 // TODO(160): Find the node that we refer back to and propagate its start 5926 // set back to here. For now we just accept anything. 5927 AddRange(CharacterRange::Everything()); 5928 } 5929 5930 5931 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5932 RegExpNode* target = that->on_success(); 5933 target->Accept(this); 5934 } 5935 5936 5937 static int CompareRangeByFrom(const CharacterRange* a, 5938 const CharacterRange* b) { 5939 return Compare<uc16>(a->from(), b->from()); 5940 } 5941 5942 5943 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5944 ranges->Sort(CompareRangeByFrom); 5945 uc16 last = 0; 5946 for (int i = 0; i < ranges->length(); i++) { 5947 CharacterRange range = ranges->at(i); 5948 if (last < range.from()) 5949 AddRange(CharacterRange(last, range.from() - 1)); 5950 if (range.to() >= last) { 5951 if (range.to() == String::kMaxUtf16CodeUnit) { 5952 return; 5953 } else { 5954 last = range.to() + 1; 5955 } 5956 } 5957 } 5958 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit)); 5959 } 5960 5961 5962 void DispatchTableConstructor::VisitText(TextNode* that) { 5963 TextElement elm = that->elements()->at(0); 5964 switch (elm.text_type()) { 5965 case TextElement::ATOM: { 5966 uc16 c = elm.atom()->data()[0]; 5967 AddRange(CharacterRange(c, c)); 5968 break; 5969 } 5970 case TextElement::CHAR_CLASS: { 5971 RegExpCharacterClass* tree = elm.char_class(); 5972 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 5973 if (tree->is_negated()) { 5974 AddInverse(ranges); 5975 } else { 5976 for (int i = 0; i < ranges->length(); i++) 5977 AddRange(ranges->at(i)); 5978 } 5979 break; 5980 } 5981 default: { 5982 UNIMPLEMENTED(); 5983 } 5984 } 5985 } 5986 5987 5988 void DispatchTableConstructor::VisitAction(ActionNode* that) { 5989 RegExpNode* target = that->on_success(); 5990 target->Accept(this); 5991 } 5992 5993 5994 RegExpEngine::CompilationResult RegExpEngine::Compile( 5995 RegExpCompileData* data, 5996 bool ignore_case, 5997 bool is_global, 5998 bool is_multiline, 5999 Handle<String> pattern, 6000 Handle<String> sample_subject, 6001 bool is_ascii, 6002 Zone* zone) { 6003 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6004 return IrregexpRegExpTooBig(zone->isolate()); 6005 } 6006 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone); 6007 6008 // Sample some characters from the middle of the string. 6009 static const int kSampleSize = 128; 6010 6011 FlattenString(sample_subject); 6012 int chars_sampled = 0; 6013 int half_way = (sample_subject->length() - kSampleSize) / 2; 6014 for (int i = Max(0, half_way); 6015 i < sample_subject->length() && chars_sampled < kSampleSize; 6016 i++, chars_sampled++) { 6017 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6018 } 6019 6020 // Wrap the body of the regexp in capture #0. 6021 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6022 0, 6023 &compiler, 6024 compiler.accept()); 6025 RegExpNode* node = captured_body; 6026 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6027 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6028 int max_length = data->tree->max_match(); 6029 if (!is_start_anchored) { 6030 // Add a .*? at the beginning, outside the body capture, unless 6031 // this expression is anchored at the beginning. 6032 RegExpNode* loop_node = 6033 RegExpQuantifier::ToNode(0, 6034 RegExpTree::kInfinity, 6035 false, 6036 new(zone) RegExpCharacterClass('*'), 6037 &compiler, 6038 captured_body, 6039 data->contains_anchor); 6040 6041 if (data->contains_anchor) { 6042 // Unroll loop once, to take care of the case that might start 6043 // at the start of input. 6044 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6045 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6046 first_step_node->AddAlternative(GuardedAlternative( 6047 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); 6048 node = first_step_node; 6049 } else { 6050 node = loop_node; 6051 } 6052 } 6053 if (is_ascii) { 6054 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); 6055 // Do it again to propagate the new nodes to places where they were not 6056 // put because they had not been calculated yet. 6057 if (node != NULL) { 6058 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); 6059 } 6060 } 6061 6062 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6063 data->node = node; 6064 Analysis analysis(ignore_case, is_ascii); 6065 analysis.EnsureAnalyzed(node); 6066 if (analysis.has_failed()) { 6067 const char* error_message = analysis.error_message(); 6068 return CompilationResult(zone->isolate(), error_message); 6069 } 6070 6071 // Create the correct assembler for the architecture. 6072 #ifndef V8_INTERPRETED_REGEXP 6073 // Native regexp implementation. 6074 6075 NativeRegExpMacroAssembler::Mode mode = 6076 is_ascii ? NativeRegExpMacroAssembler::ASCII 6077 : NativeRegExpMacroAssembler::UC16; 6078 6079 #if V8_TARGET_ARCH_IA32 6080 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, 6081 zone); 6082 #elif V8_TARGET_ARCH_X64 6083 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, 6084 zone); 6085 #elif V8_TARGET_ARCH_ARM 6086 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, 6087 zone); 6088 #elif V8_TARGET_ARCH_MIPS 6089 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2, 6090 zone); 6091 #endif 6092 6093 #else // V8_INTERPRETED_REGEXP 6094 // Interpreted regexp implementation. 6095 EmbeddedVector<byte, 1024> codes; 6096 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone); 6097 #endif // V8_INTERPRETED_REGEXP 6098 6099 // Inserted here, instead of in Assembler, because it depends on information 6100 // in the AST that isn't replicated in the Node structure. 6101 static const int kMaxBacksearchLimit = 1024; 6102 if (is_end_anchored && 6103 !is_start_anchored && 6104 max_length < kMaxBacksearchLimit) { 6105 macro_assembler.SetCurrentPositionFromEnd(max_length); 6106 } 6107 6108 if (is_global) { 6109 macro_assembler.set_global_mode( 6110 (data->tree->min_match() > 0) 6111 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK 6112 : RegExpMacroAssembler::GLOBAL); 6113 } 6114 6115 return compiler.Assemble(¯o_assembler, 6116 node, 6117 data->capture_count, 6118 pattern); 6119 } 6120 6121 6122 }} // namespace v8::internal 6123