1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #include "v8.h" 29 30 #include "ast.h" 31 #include "compiler.h" 32 #include "execution.h" 33 #include "factory.h" 34 #include "jsregexp.h" 35 #include "jsregexp-inl.h" 36 #include "platform.h" 37 #include "string-search.h" 38 #include "runtime.h" 39 #include "compilation-cache.h" 40 #include "string-stream.h" 41 #include "parser.h" 42 #include "regexp-macro-assembler.h" 43 #include "regexp-macro-assembler-tracer.h" 44 #include "regexp-macro-assembler-irregexp.h" 45 #include "regexp-stack.h" 46 47 #ifndef V8_INTERPRETED_REGEXP 48 #if V8_TARGET_ARCH_IA32 49 #include "ia32/regexp-macro-assembler-ia32.h" 50 #elif V8_TARGET_ARCH_X64 51 #include "x64/regexp-macro-assembler-x64.h" 52 #elif V8_TARGET_ARCH_ARM 53 #include "arm/regexp-macro-assembler-arm.h" 54 #elif V8_TARGET_ARCH_MIPS 55 #include "mips/regexp-macro-assembler-mips.h" 56 #else 57 #error Unsupported target architecture. 58 #endif 59 #endif 60 61 #include "interpreter-irregexp.h" 62 63 64 namespace v8 { 65 namespace internal { 66 67 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor, 68 Handle<String> pattern, 69 Handle<String> flags, 70 bool* has_pending_exception) { 71 // Call the construct code with 2 arguments. 72 Handle<Object> argv[] = { pattern, flags }; 73 return Execution::New(constructor, ARRAY_SIZE(argv), argv, 74 has_pending_exception); 75 } 76 77 78 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 79 int flags = JSRegExp::NONE; 80 for (int i = 0; i < str->length(); i++) { 81 switch (str->Get(i)) { 82 case 'i': 83 flags |= JSRegExp::IGNORE_CASE; 84 break; 85 case 'g': 86 flags |= JSRegExp::GLOBAL; 87 break; 88 case 'm': 89 flags |= JSRegExp::MULTILINE; 90 break; 91 } 92 } 93 return JSRegExp::Flags(flags); 94 } 95 96 97 static inline void ThrowRegExpException(Handle<JSRegExp> re, 98 Handle<String> pattern, 99 Handle<String> error_text, 100 const char* message) { 101 Isolate* isolate = re->GetIsolate(); 102 Factory* factory = isolate->factory(); 103 Handle<FixedArray> elements = factory->NewFixedArray(2); 104 elements->set(0, *pattern); 105 elements->set(1, *error_text); 106 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 107 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 108 isolate->Throw(*regexp_err); 109 } 110 111 112 ContainedInLattice AddRange(ContainedInLattice containment, 113 const int* ranges, 114 int ranges_length, 115 Interval new_range) { 116 ASSERT((ranges_length & 1) == 1); 117 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); 118 if (containment == kLatticeUnknown) return containment; 119 bool inside = false; 120 int last = 0; 121 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 122 // Consider the range from last to ranges[i]. 123 // We haven't got to the new range yet. 124 if (ranges[i] <= new_range.from()) continue; 125 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 126 // inclusive, but the values in ranges are not. 127 if (last <= new_range.from() && new_range.to() < ranges[i]) { 128 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 129 } 130 return kLatticeUnknown; 131 } 132 return containment; 133 } 134 135 136 // More makes code generation slower, less makes V8 benchmark score lower. 137 const int kMaxLookaheadForBoyerMoore = 8; 138 // In a 3-character pattern you can maximally step forwards 3 characters 139 // at a time, which is not always enough to pay for the extra logic. 140 const int kPatternTooShortForBoyerMoore = 2; 141 142 143 // Identifies the sort of regexps where the regexp engine is faster 144 // than the code used for atom matches. 145 static bool HasFewDifferentCharacters(Handle<String> pattern) { 146 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 147 if (length <= kPatternTooShortForBoyerMoore) return false; 148 const int kMod = 128; 149 bool character_found[kMod]; 150 int different = 0; 151 memset(&character_found[0], 0, sizeof(character_found)); 152 for (int i = 0; i < length; i++) { 153 int ch = (pattern->Get(i) & (kMod - 1)); 154 if (!character_found[ch]) { 155 character_found[ch] = true; 156 different++; 157 // We declare a regexp low-alphabet if it has at least 3 times as many 158 // characters as it has different characters. 159 if (different * 3 > length) return false; 160 } 161 } 162 return true; 163 } 164 165 166 // Generic RegExp methods. Dispatches to implementation specific methods. 167 168 169 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 170 Handle<String> pattern, 171 Handle<String> flag_str) { 172 Isolate* isolate = re->GetIsolate(); 173 Zone zone(isolate); 174 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 175 CompilationCache* compilation_cache = isolate->compilation_cache(); 176 Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags); 177 bool in_cache = !cached.is_null(); 178 LOG(isolate, RegExpCompileEvent(re, in_cache)); 179 180 Handle<Object> result; 181 if (in_cache) { 182 re->set_data(*cached); 183 return re; 184 } 185 pattern = FlattenGetString(pattern); 186 PostponeInterruptsScope postpone(isolate); 187 RegExpCompileData parse_result; 188 FlatStringReader reader(isolate, pattern); 189 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 190 &parse_result, &zone)) { 191 // Throw an exception if we fail to parse the pattern. 192 ThrowRegExpException(re, 193 pattern, 194 parse_result.error, 195 "malformed_regexp"); 196 return Handle<Object>::null(); 197 } 198 199 bool has_been_compiled = false; 200 201 if (parse_result.simple && 202 !flags.is_ignore_case() && 203 !HasFewDifferentCharacters(pattern)) { 204 // Parse-tree is a single atom that is equal to the pattern. 205 AtomCompile(re, pattern, flags, pattern); 206 has_been_compiled = true; 207 } else if (parse_result.tree->IsAtom() && 208 !flags.is_ignore_case() && 209 parse_result.capture_count == 0) { 210 RegExpAtom* atom = parse_result.tree->AsAtom(); 211 Vector<const uc16> atom_pattern = atom->data(); 212 Handle<String> atom_string = 213 isolate->factory()->NewStringFromTwoByte(atom_pattern); 214 if (!HasFewDifferentCharacters(atom_string)) { 215 AtomCompile(re, pattern, flags, atom_string); 216 has_been_compiled = true; 217 } 218 } 219 if (!has_been_compiled) { 220 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 221 } 222 ASSERT(re->data()->IsFixedArray()); 223 // Compilation succeeded so the data is set on the regexp 224 // and we can store it in the cache. 225 Handle<FixedArray> data(FixedArray::cast(re->data())); 226 compilation_cache->PutRegExp(pattern, flags, data); 227 228 return re; 229 } 230 231 232 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 233 Handle<String> subject, 234 int index, 235 Handle<JSArray> last_match_info) { 236 switch (regexp->TypeTag()) { 237 case JSRegExp::ATOM: 238 return AtomExec(regexp, subject, index, last_match_info); 239 case JSRegExp::IRREGEXP: { 240 Handle<Object> result = 241 IrregexpExec(regexp, subject, index, last_match_info); 242 ASSERT(!result.is_null() || 243 regexp->GetIsolate()->has_pending_exception()); 244 return result; 245 } 246 default: 247 UNREACHABLE(); 248 return Handle<Object>::null(); 249 } 250 } 251 252 253 // RegExp Atom implementation: Simple string search using indexOf. 254 255 256 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 257 Handle<String> pattern, 258 JSRegExp::Flags flags, 259 Handle<String> match_pattern) { 260 re->GetIsolate()->factory()->SetRegExpAtomData(re, 261 JSRegExp::ATOM, 262 pattern, 263 flags, 264 match_pattern); 265 } 266 267 268 static void SetAtomLastCapture(FixedArray* array, 269 String* subject, 270 int from, 271 int to) { 272 SealHandleScope shs(array->GetIsolate()); 273 RegExpImpl::SetLastCaptureCount(array, 2); 274 RegExpImpl::SetLastSubject(array, subject); 275 RegExpImpl::SetLastInput(array, subject); 276 RegExpImpl::SetCapture(array, 0, from); 277 RegExpImpl::SetCapture(array, 1, to); 278 } 279 280 281 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 282 Handle<String> subject, 283 int index, 284 int32_t* output, 285 int output_size) { 286 Isolate* isolate = regexp->GetIsolate(); 287 288 ASSERT(0 <= index); 289 ASSERT(index <= subject->length()); 290 291 if (!subject->IsFlat()) FlattenString(subject); 292 DisallowHeapAllocation no_gc; // ensure vectors stay valid 293 294 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 295 int needle_len = needle->length(); 296 ASSERT(needle->IsFlat()); 297 ASSERT_LT(0, needle_len); 298 299 if (index + needle_len > subject->length()) { 300 return RegExpImpl::RE_FAILURE; 301 } 302 303 for (int i = 0; i < output_size; i += 2) { 304 String::FlatContent needle_content = needle->GetFlatContent(); 305 String::FlatContent subject_content = subject->GetFlatContent(); 306 ASSERT(needle_content.IsFlat()); 307 ASSERT(subject_content.IsFlat()); 308 // dispatch on type of strings 309 index = (needle_content.IsAscii() 310 ? (subject_content.IsAscii() 311 ? SearchString(isolate, 312 subject_content.ToOneByteVector(), 313 needle_content.ToOneByteVector(), 314 index) 315 : SearchString(isolate, 316 subject_content.ToUC16Vector(), 317 needle_content.ToOneByteVector(), 318 index)) 319 : (subject_content.IsAscii() 320 ? SearchString(isolate, 321 subject_content.ToOneByteVector(), 322 needle_content.ToUC16Vector(), 323 index) 324 : SearchString(isolate, 325 subject_content.ToUC16Vector(), 326 needle_content.ToUC16Vector(), 327 index))); 328 if (index == -1) { 329 return i / 2; // Return number of matches. 330 } else { 331 output[i] = index; 332 output[i+1] = index + needle_len; 333 index += needle_len; 334 } 335 } 336 return output_size / 2; 337 } 338 339 340 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 341 Handle<String> subject, 342 int index, 343 Handle<JSArray> last_match_info) { 344 Isolate* isolate = re->GetIsolate(); 345 346 static const int kNumRegisters = 2; 347 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 348 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 349 350 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 351 352 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 353 354 ASSERT_EQ(res, RegExpImpl::RE_SUCCESS); 355 SealHandleScope shs(isolate); 356 FixedArray* array = FixedArray::cast(last_match_info->elements()); 357 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); 358 return last_match_info; 359 } 360 361 362 // Irregexp implementation. 363 364 // Ensures that the regexp object contains a compiled version of the 365 // source for either ASCII or non-ASCII strings. 366 // If the compiled version doesn't already exist, it is compiled 367 // from the source pattern. 368 // If compilation fails, an exception is thrown and this function 369 // returns false. 370 bool RegExpImpl::EnsureCompiledIrregexp( 371 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { 372 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 373 #ifdef V8_INTERPRETED_REGEXP 374 if (compiled_code->IsByteArray()) return true; 375 #else // V8_INTERPRETED_REGEXP (RegExp native code) 376 if (compiled_code->IsCode()) return true; 377 #endif 378 // We could potentially have marked this as flushable, but have kept 379 // a saved version if we did not flush it yet. 380 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 381 if (saved_code->IsCode()) { 382 // Reinstate the code in the original place. 383 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); 384 ASSERT(compiled_code->IsSmi()); 385 return true; 386 } 387 return CompileIrregexp(re, sample_subject, is_ascii); 388 } 389 390 391 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 392 bool is_ascii, 393 Handle<String> error_message, 394 Isolate* isolate) { 395 Factory* factory = isolate->factory(); 396 Handle<FixedArray> elements = factory->NewFixedArray(2); 397 elements->set(0, re->Pattern()); 398 elements->set(1, *error_message); 399 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 400 Handle<Object> regexp_err = 401 factory->NewSyntaxError("malformed_regexp", array); 402 isolate->Throw(*regexp_err); 403 return false; 404 } 405 406 407 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 408 Handle<String> sample_subject, 409 bool is_ascii) { 410 // Compile the RegExp. 411 Isolate* isolate = re->GetIsolate(); 412 Zone zone(isolate); 413 PostponeInterruptsScope postpone(isolate); 414 // If we had a compilation error the last time this is saved at the 415 // saved code index. 416 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 417 // When arriving here entry can only be a smi, either representing an 418 // uncompiled regexp, a previous compilation error, or code that has 419 // been flushed. 420 ASSERT(entry->IsSmi()); 421 int entry_value = Smi::cast(entry)->value(); 422 ASSERT(entry_value == JSRegExp::kUninitializedValue || 423 entry_value == JSRegExp::kCompilationErrorValue || 424 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 425 426 if (entry_value == JSRegExp::kCompilationErrorValue) { 427 // A previous compilation failed and threw an error which we store in 428 // the saved code index (we store the error message, not the actual 429 // error). Recreate the error object and throw it. 430 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 431 ASSERT(error_string->IsString()); 432 Handle<String> error_message(String::cast(error_string)); 433 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 434 return false; 435 } 436 437 JSRegExp::Flags flags = re->GetFlags(); 438 439 Handle<String> pattern(re->Pattern()); 440 if (!pattern->IsFlat()) FlattenString(pattern); 441 RegExpCompileData compile_data; 442 FlatStringReader reader(isolate, pattern); 443 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 444 &compile_data, 445 &zone)) { 446 // Throw an exception if we fail to parse the pattern. 447 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 448 ThrowRegExpException(re, 449 pattern, 450 compile_data.error, 451 "malformed_regexp"); 452 return false; 453 } 454 RegExpEngine::CompilationResult result = 455 RegExpEngine::Compile(&compile_data, 456 flags.is_ignore_case(), 457 flags.is_global(), 458 flags.is_multiline(), 459 pattern, 460 sample_subject, 461 is_ascii, 462 &zone); 463 if (result.error_message != NULL) { 464 // Unable to compile regexp. 465 Handle<String> error_message = 466 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); 467 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 468 return false; 469 } 470 471 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 472 data->set(JSRegExp::code_index(is_ascii), result.code); 473 int register_max = IrregexpMaxRegisterCount(*data); 474 if (result.num_registers > register_max) { 475 SetIrregexpMaxRegisterCount(*data, result.num_registers); 476 } 477 478 return true; 479 } 480 481 482 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 483 return Smi::cast( 484 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 485 } 486 487 488 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 489 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 490 } 491 492 493 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 494 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 495 } 496 497 498 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 499 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 500 } 501 502 503 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 504 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 505 } 506 507 508 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 509 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 510 } 511 512 513 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 514 Handle<String> pattern, 515 JSRegExp::Flags flags, 516 int capture_count) { 517 // Initialize compiled code entries to null. 518 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 519 JSRegExp::IRREGEXP, 520 pattern, 521 flags, 522 capture_count); 523 } 524 525 526 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 527 Handle<String> subject) { 528 if (!subject->IsFlat()) FlattenString(subject); 529 530 // Check the asciiness of the underlying storage. 531 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); 532 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; 533 534 #ifdef V8_INTERPRETED_REGEXP 535 // Byte-code regexp needs space allocated for all its registers. 536 // The result captures are copied to the start of the registers array 537 // if the match succeeds. This way those registers are not clobbered 538 // when we set the last match info from last successful match. 539 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 540 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 541 #else // V8_INTERPRETED_REGEXP 542 // Native regexp only needs room to output captures. Registers are handled 543 // internally. 544 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 545 #endif // V8_INTERPRETED_REGEXP 546 } 547 548 549 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 550 Handle<String> subject, 551 int index, 552 int32_t* output, 553 int output_size) { 554 Isolate* isolate = regexp->GetIsolate(); 555 556 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 557 558 ASSERT(index >= 0); 559 ASSERT(index <= subject->length()); 560 ASSERT(subject->IsFlat()); 561 562 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); 563 564 #ifndef V8_INTERPRETED_REGEXP 565 ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 566 do { 567 EnsureCompiledIrregexp(regexp, subject, is_ascii); 568 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 569 // The stack is used to allocate registers for the compiled regexp code. 570 // This means that in case of failure, the output registers array is left 571 // untouched and contains the capture results from the previous successful 572 // match. We can use that to set the last match info lazily. 573 NativeRegExpMacroAssembler::Result res = 574 NativeRegExpMacroAssembler::Match(code, 575 subject, 576 output, 577 output_size, 578 index, 579 isolate); 580 if (res != NativeRegExpMacroAssembler::RETRY) { 581 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 582 isolate->has_pending_exception()); 583 STATIC_ASSERT( 584 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 585 STATIC_ASSERT( 586 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 587 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 588 == RE_EXCEPTION); 589 return static_cast<IrregexpResult>(res); 590 } 591 // If result is RETRY, the string has changed representation, and we 592 // must restart from scratch. 593 // In this case, it means we must make sure we are prepared to handle 594 // the, potentially, different subject (the string can switch between 595 // being internal and external, and even between being ASCII and UC16, 596 // but the characters are always the same). 597 IrregexpPrepare(regexp, subject); 598 is_ascii = subject->IsOneByteRepresentationUnderneath(); 599 } while (true); 600 UNREACHABLE(); 601 return RE_EXCEPTION; 602 #else // V8_INTERPRETED_REGEXP 603 604 ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp)); 605 // We must have done EnsureCompiledIrregexp, so we can get the number of 606 // registers. 607 int number_of_capture_registers = 608 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 609 int32_t* raw_output = &output[number_of_capture_registers]; 610 // We do not touch the actual capture result registers until we know there 611 // has been a match so that we can use those capture results to set the 612 // last match info. 613 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 614 raw_output[i] = -1; 615 } 616 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); 617 618 IrregexpResult result = IrregexpInterpreter::Match(isolate, 619 byte_codes, 620 subject, 621 raw_output, 622 index); 623 if (result == RE_SUCCESS) { 624 // Copy capture results to the start of the registers array. 625 OS::MemCopy( 626 output, raw_output, number_of_capture_registers * sizeof(int32_t)); 627 } 628 if (result == RE_EXCEPTION) { 629 ASSERT(!isolate->has_pending_exception()); 630 isolate->StackOverflow(); 631 } 632 return result; 633 #endif // V8_INTERPRETED_REGEXP 634 } 635 636 637 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, 638 Handle<String> subject, 639 int previous_index, 640 Handle<JSArray> last_match_info) { 641 Isolate* isolate = regexp->GetIsolate(); 642 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 643 644 // Prepare space for the return values. 645 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 646 if (FLAG_trace_regexp_bytecodes) { 647 String* pattern = regexp->Pattern(); 648 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); 649 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString())); 650 } 651 #endif 652 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 653 if (required_registers < 0) { 654 // Compiling failed with an exception. 655 ASSERT(isolate->has_pending_exception()); 656 return Handle<Object>::null(); 657 } 658 659 int32_t* output_registers = NULL; 660 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 661 output_registers = NewArray<int32_t>(required_registers); 662 } 663 SmartArrayPointer<int32_t> auto_release(output_registers); 664 if (output_registers == NULL) { 665 output_registers = isolate->jsregexp_static_offsets_vector(); 666 } 667 668 int res = RegExpImpl::IrregexpExecRaw( 669 regexp, subject, previous_index, output_registers, required_registers); 670 if (res == RE_SUCCESS) { 671 int capture_count = 672 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 673 return SetLastMatchInfo( 674 last_match_info, subject, capture_count, output_registers); 675 } 676 if (res == RE_EXCEPTION) { 677 ASSERT(isolate->has_pending_exception()); 678 return Handle<Object>::null(); 679 } 680 ASSERT(res == RE_FAILURE); 681 return isolate->factory()->null_value(); 682 } 683 684 685 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, 686 Handle<String> subject, 687 int capture_count, 688 int32_t* match) { 689 ASSERT(last_match_info->HasFastObjectElements()); 690 int capture_register_count = (capture_count + 1) * 2; 691 last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead); 692 DisallowHeapAllocation no_allocation; 693 FixedArray* array = FixedArray::cast(last_match_info->elements()); 694 if (match != NULL) { 695 for (int i = 0; i < capture_register_count; i += 2) { 696 SetCapture(array, i, match[i]); 697 SetCapture(array, i + 1, match[i + 1]); 698 } 699 } 700 SetLastCaptureCount(array, capture_register_count); 701 SetLastSubject(array, *subject); 702 SetLastInput(array, *subject); 703 return last_match_info; 704 } 705 706 707 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 708 Handle<String> subject, 709 bool is_global, 710 Isolate* isolate) 711 : register_array_(NULL), 712 register_array_size_(0), 713 regexp_(regexp), 714 subject_(subject) { 715 #ifdef V8_INTERPRETED_REGEXP 716 bool interpreted = true; 717 #else 718 bool interpreted = false; 719 #endif // V8_INTERPRETED_REGEXP 720 721 if (regexp_->TypeTag() == JSRegExp::ATOM) { 722 static const int kAtomRegistersPerMatch = 2; 723 registers_per_match_ = kAtomRegistersPerMatch; 724 // There is no distinction between interpreted and native for atom regexps. 725 interpreted = false; 726 } else { 727 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 728 if (registers_per_match_ < 0) { 729 num_matches_ = -1; // Signal exception. 730 return; 731 } 732 } 733 734 if (is_global && !interpreted) { 735 register_array_size_ = 736 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 737 max_matches_ = register_array_size_ / registers_per_match_; 738 } else { 739 // Global loop in interpreted regexp is not implemented. We choose 740 // the size of the offsets vector so that it can only store one match. 741 register_array_size_ = registers_per_match_; 742 max_matches_ = 1; 743 } 744 745 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 746 register_array_ = NewArray<int32_t>(register_array_size_); 747 } else { 748 register_array_ = isolate->jsregexp_static_offsets_vector(); 749 } 750 751 // Set state so that fetching the results the first time triggers a call 752 // to the compiled regexp. 753 current_match_index_ = max_matches_ - 1; 754 num_matches_ = max_matches_; 755 ASSERT(registers_per_match_ >= 2); // Each match has at least one capture. 756 ASSERT_GE(register_array_size_, registers_per_match_); 757 int32_t* last_match = 758 ®ister_array_[current_match_index_ * registers_per_match_]; 759 last_match[0] = -1; 760 last_match[1] = 0; 761 } 762 763 764 // ------------------------------------------------------------------- 765 // Implementation of the Irregexp regular expression engine. 766 // 767 // The Irregexp regular expression engine is intended to be a complete 768 // implementation of ECMAScript regular expressions. It generates either 769 // bytecodes or native code. 770 771 // The Irregexp regexp engine is structured in three steps. 772 // 1) The parser generates an abstract syntax tree. See ast.cc. 773 // 2) From the AST a node network is created. The nodes are all 774 // subclasses of RegExpNode. The nodes represent states when 775 // executing a regular expression. Several optimizations are 776 // performed on the node network. 777 // 3) From the nodes we generate either byte codes or native code 778 // that can actually execute the regular expression (perform 779 // the search). The code generation step is described in more 780 // detail below. 781 782 // Code generation. 783 // 784 // The nodes are divided into four main categories. 785 // * Choice nodes 786 // These represent places where the regular expression can 787 // match in more than one way. For example on entry to an 788 // alternation (foo|bar) or a repetition (*, +, ? or {}). 789 // * Action nodes 790 // These represent places where some action should be 791 // performed. Examples include recording the current position 792 // in the input string to a register (in order to implement 793 // captures) or other actions on register for example in order 794 // to implement the counters needed for {} repetitions. 795 // * Matching nodes 796 // These attempt to match some element part of the input string. 797 // Examples of elements include character classes, plain strings 798 // or back references. 799 // * End nodes 800 // These are used to implement the actions required on finding 801 // a successful match or failing to find a match. 802 // 803 // The code generated (whether as byte codes or native code) maintains 804 // some state as it runs. This consists of the following elements: 805 // 806 // * The capture registers. Used for string captures. 807 // * Other registers. Used for counters etc. 808 // * The current position. 809 // * The stack of backtracking information. Used when a matching node 810 // fails to find a match and needs to try an alternative. 811 // 812 // Conceptual regular expression execution model: 813 // 814 // There is a simple conceptual model of regular expression execution 815 // which will be presented first. The actual code generated is a more 816 // efficient simulation of the simple conceptual model: 817 // 818 // * Choice nodes are implemented as follows: 819 // For each choice except the last { 820 // push current position 821 // push backtrack code location 822 // <generate code to test for choice> 823 // backtrack code location: 824 // pop current position 825 // } 826 // <generate code to test for last choice> 827 // 828 // * Actions nodes are generated as follows 829 // <push affected registers on backtrack stack> 830 // <generate code to perform action> 831 // push backtrack code location 832 // <generate code to test for following nodes> 833 // backtrack code location: 834 // <pop affected registers to restore their state> 835 // <pop backtrack location from stack and go to it> 836 // 837 // * Matching nodes are generated as follows: 838 // if input string matches at current position 839 // update current position 840 // <generate code to test for following nodes> 841 // else 842 // <pop backtrack location from stack and go to it> 843 // 844 // Thus it can be seen that the current position is saved and restored 845 // by the choice nodes, whereas the registers are saved and restored by 846 // by the action nodes that manipulate them. 847 // 848 // The other interesting aspect of this model is that nodes are generated 849 // at the point where they are needed by a recursive call to Emit(). If 850 // the node has already been code generated then the Emit() call will 851 // generate a jump to the previously generated code instead. In order to 852 // limit recursion it is possible for the Emit() function to put the node 853 // on a work list for later generation and instead generate a jump. The 854 // destination of the jump is resolved later when the code is generated. 855 // 856 // Actual regular expression code generation. 857 // 858 // Code generation is actually more complicated than the above. In order 859 // to improve the efficiency of the generated code some optimizations are 860 // performed 861 // 862 // * Choice nodes have 1-character lookahead. 863 // A choice node looks at the following character and eliminates some of 864 // the choices immediately based on that character. This is not yet 865 // implemented. 866 // * Simple greedy loops store reduced backtracking information. 867 // A quantifier like /.*foo/m will greedily match the whole input. It will 868 // then need to backtrack to a point where it can match "foo". The naive 869 // implementation of this would push each character position onto the 870 // backtracking stack, then pop them off one by one. This would use space 871 // proportional to the length of the input string. However since the "." 872 // can only match in one way and always has a constant length (in this case 873 // of 1) it suffices to store the current position on the top of the stack 874 // once. Matching now becomes merely incrementing the current position and 875 // backtracking becomes decrementing the current position and checking the 876 // result against the stored current position. This is faster and saves 877 // space. 878 // * The current state is virtualized. 879 // This is used to defer expensive operations until it is clear that they 880 // are needed and to generate code for a node more than once, allowing 881 // specialized an efficient versions of the code to be created. This is 882 // explained in the section below. 883 // 884 // Execution state virtualization. 885 // 886 // Instead of emitting code, nodes that manipulate the state can record their 887 // manipulation in an object called the Trace. The Trace object can record a 888 // current position offset, an optional backtrack code location on the top of 889 // the virtualized backtrack stack and some register changes. When a node is 890 // to be emitted it can flush the Trace or update it. Flushing the Trace 891 // will emit code to bring the actual state into line with the virtual state. 892 // Avoiding flushing the state can postpone some work (e.g. updates of capture 893 // registers). Postponing work can save time when executing the regular 894 // expression since it may be found that the work never has to be done as a 895 // failure to match can occur. In addition it is much faster to jump to a 896 // known backtrack code location than it is to pop an unknown backtrack 897 // location from the stack and jump there. 898 // 899 // The virtual state found in the Trace affects code generation. For example 900 // the virtual state contains the difference between the actual current 901 // position and the virtual current position, and matching code needs to use 902 // this offset to attempt a match in the correct location of the input 903 // string. Therefore code generated for a non-trivial trace is specialized 904 // to that trace. The code generator therefore has the ability to generate 905 // code for each node several times. In order to limit the size of the 906 // generated code there is an arbitrary limit on how many specialized sets of 907 // code may be generated for a given node. If the limit is reached, the 908 // trace is flushed and a generic version of the code for a node is emitted. 909 // This is subsequently used for that node. The code emitted for non-generic 910 // trace is not recorded in the node and so it cannot currently be reused in 911 // the event that code generation is requested for an identical trace. 912 913 914 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 915 UNREACHABLE(); 916 } 917 918 919 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 920 text->AddElement(TextElement::Atom(this), zone); 921 } 922 923 924 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 925 text->AddElement(TextElement::CharClass(this), zone); 926 } 927 928 929 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 930 for (int i = 0; i < elements()->length(); i++) 931 text->AddElement(elements()->at(i), zone); 932 } 933 934 935 TextElement TextElement::Atom(RegExpAtom* atom) { 936 TextElement result = TextElement(ATOM); 937 result.data.u_atom = atom; 938 return result; 939 } 940 941 942 TextElement TextElement::CharClass( 943 RegExpCharacterClass* char_class) { 944 TextElement result = TextElement(CHAR_CLASS); 945 result.data.u_char_class = char_class; 946 return result; 947 } 948 949 950 int TextElement::length() { 951 if (text_type == ATOM) { 952 return data.u_atom->length(); 953 } else { 954 ASSERT(text_type == CHAR_CLASS); 955 return 1; 956 } 957 } 958 959 960 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 961 if (table_ == NULL) { 962 table_ = new(zone()) DispatchTable(zone()); 963 DispatchTableConstructor cons(table_, ignore_case, zone()); 964 cons.BuildTable(this); 965 } 966 return table_; 967 } 968 969 970 class FrequencyCollator { 971 public: 972 FrequencyCollator() : total_samples_(0) { 973 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 974 frequencies_[i] = CharacterFrequency(i); 975 } 976 } 977 978 void CountCharacter(int character) { 979 int index = (character & RegExpMacroAssembler::kTableMask); 980 frequencies_[index].Increment(); 981 total_samples_++; 982 } 983 984 // Does not measure in percent, but rather per-128 (the table size from the 985 // regexp macro assembler). 986 int Frequency(int in_character) { 987 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); 988 if (total_samples_ < 1) return 1; // Division by zero. 989 int freq_in_per128 = 990 (frequencies_[in_character].counter() * 128) / total_samples_; 991 return freq_in_per128; 992 } 993 994 private: 995 class CharacterFrequency { 996 public: 997 CharacterFrequency() : counter_(0), character_(-1) { } 998 explicit CharacterFrequency(int character) 999 : counter_(0), character_(character) { } 1000 1001 void Increment() { counter_++; } 1002 int counter() { return counter_; } 1003 int character() { return character_; } 1004 1005 private: 1006 int counter_; 1007 int character_; 1008 }; 1009 1010 1011 private: 1012 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 1013 int total_samples_; 1014 }; 1015 1016 1017 class RegExpCompiler { 1018 public: 1019 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, 1020 Zone* zone); 1021 1022 int AllocateRegister() { 1023 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 1024 reg_exp_too_big_ = true; 1025 return next_register_; 1026 } 1027 return next_register_++; 1028 } 1029 1030 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 1031 RegExpNode* start, 1032 int capture_count, 1033 Handle<String> pattern); 1034 1035 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 1036 1037 static const int kImplementationOffset = 0; 1038 static const int kNumberOfRegistersOffset = 0; 1039 static const int kCodeOffset = 1; 1040 1041 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1042 EndNode* accept() { return accept_; } 1043 1044 static const int kMaxRecursion = 100; 1045 inline int recursion_depth() { return recursion_depth_; } 1046 inline void IncrementRecursionDepth() { recursion_depth_++; } 1047 inline void DecrementRecursionDepth() { recursion_depth_--; } 1048 1049 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1050 1051 inline bool ignore_case() { return ignore_case_; } 1052 inline bool ascii() { return ascii_; } 1053 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1054 1055 int current_expansion_factor() { return current_expansion_factor_; } 1056 void set_current_expansion_factor(int value) { 1057 current_expansion_factor_ = value; 1058 } 1059 1060 Zone* zone() const { return zone_; } 1061 1062 static const int kNoRegister = -1; 1063 1064 private: 1065 EndNode* accept_; 1066 int next_register_; 1067 List<RegExpNode*>* work_list_; 1068 int recursion_depth_; 1069 RegExpMacroAssembler* macro_assembler_; 1070 bool ignore_case_; 1071 bool ascii_; 1072 bool reg_exp_too_big_; 1073 int current_expansion_factor_; 1074 FrequencyCollator frequency_collator_; 1075 Zone* zone_; 1076 }; 1077 1078 1079 class RecursionCheck { 1080 public: 1081 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1082 compiler->IncrementRecursionDepth(); 1083 } 1084 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1085 private: 1086 RegExpCompiler* compiler_; 1087 }; 1088 1089 1090 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 1091 return RegExpEngine::CompilationResult("RegExp too big"); 1092 } 1093 1094 1095 // Attempts to compile the regexp using an Irregexp code generator. Returns 1096 // a fixed array or a null handle depending on whether it succeeded. 1097 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii, 1098 Zone* zone) 1099 : next_register_(2 * (capture_count + 1)), 1100 work_list_(NULL), 1101 recursion_depth_(0), 1102 ignore_case_(ignore_case), 1103 ascii_(ascii), 1104 reg_exp_too_big_(false), 1105 current_expansion_factor_(1), 1106 frequency_collator_(), 1107 zone_(zone) { 1108 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1109 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1110 } 1111 1112 1113 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1114 RegExpMacroAssembler* macro_assembler, 1115 RegExpNode* start, 1116 int capture_count, 1117 Handle<String> pattern) { 1118 Heap* heap = pattern->GetHeap(); 1119 1120 bool use_slow_safe_regexp_compiler = false; 1121 if (heap->total_regexp_code_generated() > 1122 RegExpImpl::kRegWxpCompiledLimit && 1123 heap->isolate()->memory_allocator()->SizeExecutable() > 1124 RegExpImpl::kRegExpExecutableMemoryLimit) { 1125 use_slow_safe_regexp_compiler = true; 1126 } 1127 1128 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 1129 1130 #ifdef DEBUG 1131 if (FLAG_trace_regexp_assembler) 1132 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 1133 else 1134 #endif 1135 macro_assembler_ = macro_assembler; 1136 1137 List <RegExpNode*> work_list(0); 1138 work_list_ = &work_list; 1139 Label fail; 1140 macro_assembler_->PushBacktrack(&fail); 1141 Trace new_trace; 1142 start->Emit(this, &new_trace); 1143 macro_assembler_->Bind(&fail); 1144 macro_assembler_->Fail(); 1145 while (!work_list.is_empty()) { 1146 work_list.RemoveLast()->Emit(this, &new_trace); 1147 } 1148 if (reg_exp_too_big_) return IrregexpRegExpTooBig(); 1149 1150 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1151 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1152 work_list_ = NULL; 1153 #ifdef DEBUG 1154 if (FLAG_print_code) { 1155 Handle<Code>::cast(code)->Disassemble(*pattern->ToCString()); 1156 } 1157 if (FLAG_trace_regexp_assembler) { 1158 delete macro_assembler_; 1159 } 1160 #endif 1161 return RegExpEngine::CompilationResult(*code, next_register_); 1162 } 1163 1164 1165 bool Trace::DeferredAction::Mentions(int that) { 1166 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1167 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1168 return range.Contains(that); 1169 } else { 1170 return reg() == that; 1171 } 1172 } 1173 1174 1175 bool Trace::mentions_reg(int reg) { 1176 for (DeferredAction* action = actions_; 1177 action != NULL; 1178 action = action->next()) { 1179 if (action->Mentions(reg)) 1180 return true; 1181 } 1182 return false; 1183 } 1184 1185 1186 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1187 ASSERT_EQ(0, *cp_offset); 1188 for (DeferredAction* action = actions_; 1189 action != NULL; 1190 action = action->next()) { 1191 if (action->Mentions(reg)) { 1192 if (action->action_type() == ActionNode::STORE_POSITION) { 1193 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1194 return true; 1195 } else { 1196 return false; 1197 } 1198 } 1199 } 1200 return false; 1201 } 1202 1203 1204 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1205 Zone* zone) { 1206 int max_register = RegExpCompiler::kNoRegister; 1207 for (DeferredAction* action = actions_; 1208 action != NULL; 1209 action = action->next()) { 1210 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1211 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1212 for (int i = range.from(); i <= range.to(); i++) 1213 affected_registers->Set(i, zone); 1214 if (range.to() > max_register) max_register = range.to(); 1215 } else { 1216 affected_registers->Set(action->reg(), zone); 1217 if (action->reg() > max_register) max_register = action->reg(); 1218 } 1219 } 1220 return max_register; 1221 } 1222 1223 1224 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1225 int max_register, 1226 OutSet& registers_to_pop, 1227 OutSet& registers_to_clear) { 1228 for (int reg = max_register; reg >= 0; reg--) { 1229 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg); 1230 else if (registers_to_clear.Get(reg)) { 1231 int clear_to = reg; 1232 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1233 reg--; 1234 } 1235 assembler->ClearRegisters(reg, clear_to); 1236 } 1237 } 1238 } 1239 1240 1241 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1242 int max_register, 1243 OutSet& affected_registers, 1244 OutSet* registers_to_pop, 1245 OutSet* registers_to_clear, 1246 Zone* zone) { 1247 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1248 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1249 1250 // Count pushes performed to force a stack limit check occasionally. 1251 int pushes = 0; 1252 1253 for (int reg = 0; reg <= max_register; reg++) { 1254 if (!affected_registers.Get(reg)) { 1255 continue; 1256 } 1257 1258 // The chronologically first deferred action in the trace 1259 // is used to infer the action needed to restore a register 1260 // to its previous state (or not, if it's safe to ignore it). 1261 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1262 DeferredActionUndoType undo_action = IGNORE; 1263 1264 int value = 0; 1265 bool absolute = false; 1266 bool clear = false; 1267 int store_position = -1; 1268 // This is a little tricky because we are scanning the actions in reverse 1269 // historical order (newest first). 1270 for (DeferredAction* action = actions_; 1271 action != NULL; 1272 action = action->next()) { 1273 if (action->Mentions(reg)) { 1274 switch (action->action_type()) { 1275 case ActionNode::SET_REGISTER: { 1276 Trace::DeferredSetRegister* psr = 1277 static_cast<Trace::DeferredSetRegister*>(action); 1278 if (!absolute) { 1279 value += psr->value(); 1280 absolute = true; 1281 } 1282 // SET_REGISTER is currently only used for newly introduced loop 1283 // counters. They can have a significant previous value if they 1284 // occour in a loop. TODO(lrn): Propagate this information, so 1285 // we can set undo_action to IGNORE if we know there is no value to 1286 // restore. 1287 undo_action = RESTORE; 1288 ASSERT_EQ(store_position, -1); 1289 ASSERT(!clear); 1290 break; 1291 } 1292 case ActionNode::INCREMENT_REGISTER: 1293 if (!absolute) { 1294 value++; 1295 } 1296 ASSERT_EQ(store_position, -1); 1297 ASSERT(!clear); 1298 undo_action = RESTORE; 1299 break; 1300 case ActionNode::STORE_POSITION: { 1301 Trace::DeferredCapture* pc = 1302 static_cast<Trace::DeferredCapture*>(action); 1303 if (!clear && store_position == -1) { 1304 store_position = pc->cp_offset(); 1305 } 1306 1307 // For captures we know that stores and clears alternate. 1308 // Other register, are never cleared, and if the occur 1309 // inside a loop, they might be assigned more than once. 1310 if (reg <= 1) { 1311 // Registers zero and one, aka "capture zero", is 1312 // always set correctly if we succeed. There is no 1313 // need to undo a setting on backtrack, because we 1314 // will set it again or fail. 1315 undo_action = IGNORE; 1316 } else { 1317 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1318 } 1319 ASSERT(!absolute); 1320 ASSERT_EQ(value, 0); 1321 break; 1322 } 1323 case ActionNode::CLEAR_CAPTURES: { 1324 // Since we're scanning in reverse order, if we've already 1325 // set the position we have to ignore historically earlier 1326 // clearing operations. 1327 if (store_position == -1) { 1328 clear = true; 1329 } 1330 undo_action = RESTORE; 1331 ASSERT(!absolute); 1332 ASSERT_EQ(value, 0); 1333 break; 1334 } 1335 default: 1336 UNREACHABLE(); 1337 break; 1338 } 1339 } 1340 } 1341 // Prepare for the undo-action (e.g., push if it's going to be popped). 1342 if (undo_action == RESTORE) { 1343 pushes++; 1344 RegExpMacroAssembler::StackCheckFlag stack_check = 1345 RegExpMacroAssembler::kNoStackLimitCheck; 1346 if (pushes == push_limit) { 1347 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1348 pushes = 0; 1349 } 1350 1351 assembler->PushRegister(reg, stack_check); 1352 registers_to_pop->Set(reg, zone); 1353 } else if (undo_action == CLEAR) { 1354 registers_to_clear->Set(reg, zone); 1355 } 1356 // Perform the chronologically last action (or accumulated increment) 1357 // for the register. 1358 if (store_position != -1) { 1359 assembler->WriteCurrentPositionToRegister(reg, store_position); 1360 } else if (clear) { 1361 assembler->ClearRegisters(reg, reg); 1362 } else if (absolute) { 1363 assembler->SetRegister(reg, value); 1364 } else if (value != 0) { 1365 assembler->AdvanceRegister(reg, value); 1366 } 1367 } 1368 } 1369 1370 1371 // This is called as we come into a loop choice node and some other tricky 1372 // nodes. It normalizes the state of the code generator to ensure we can 1373 // generate generic code. 1374 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1375 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1376 1377 ASSERT(!is_trivial()); 1378 1379 if (actions_ == NULL && backtrack() == NULL) { 1380 // Here we just have some deferred cp advances to fix and we are back to 1381 // a normal situation. We may also have to forget some information gained 1382 // through a quick check that was already performed. 1383 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1384 // Create a new trivial state and generate the node with that. 1385 Trace new_state; 1386 successor->Emit(compiler, &new_state); 1387 return; 1388 } 1389 1390 // Generate deferred actions here along with code to undo them again. 1391 OutSet affected_registers; 1392 1393 if (backtrack() != NULL) { 1394 // Here we have a concrete backtrack location. These are set up by choice 1395 // nodes and so they indicate that we have a deferred save of the current 1396 // position which we may need to emit here. 1397 assembler->PushCurrentPosition(); 1398 } 1399 1400 int max_register = FindAffectedRegisters(&affected_registers, 1401 compiler->zone()); 1402 OutSet registers_to_pop; 1403 OutSet registers_to_clear; 1404 PerformDeferredActions(assembler, 1405 max_register, 1406 affected_registers, 1407 ®isters_to_pop, 1408 ®isters_to_clear, 1409 compiler->zone()); 1410 if (cp_offset_ != 0) { 1411 assembler->AdvanceCurrentPosition(cp_offset_); 1412 } 1413 1414 // Create a new trivial state and generate the node with that. 1415 Label undo; 1416 assembler->PushBacktrack(&undo); 1417 Trace new_state; 1418 successor->Emit(compiler, &new_state); 1419 1420 // On backtrack we need to restore state. 1421 assembler->Bind(&undo); 1422 RestoreAffectedRegisters(assembler, 1423 max_register, 1424 registers_to_pop, 1425 registers_to_clear); 1426 if (backtrack() == NULL) { 1427 assembler->Backtrack(); 1428 } else { 1429 assembler->PopCurrentPosition(); 1430 assembler->GoTo(backtrack()); 1431 } 1432 } 1433 1434 1435 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1436 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1437 1438 // Omit flushing the trace. We discard the entire stack frame anyway. 1439 1440 if (!label()->is_bound()) { 1441 // We are completely independent of the trace, since we ignore it, 1442 // so this code can be used as the generic version. 1443 assembler->Bind(label()); 1444 } 1445 1446 // Throw away everything on the backtrack stack since the start 1447 // of the negative submatch and restore the character position. 1448 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1449 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1450 if (clear_capture_count_ > 0) { 1451 // Clear any captures that might have been performed during the success 1452 // of the body of the negative look-ahead. 1453 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1454 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1455 } 1456 // Now that we have unwound the stack we find at the top of the stack the 1457 // backtrack that the BeginSubmatch node got. 1458 assembler->Backtrack(); 1459 } 1460 1461 1462 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1463 if (!trace->is_trivial()) { 1464 trace->Flush(compiler, this); 1465 return; 1466 } 1467 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1468 if (!label()->is_bound()) { 1469 assembler->Bind(label()); 1470 } 1471 switch (action_) { 1472 case ACCEPT: 1473 assembler->Succeed(); 1474 return; 1475 case BACKTRACK: 1476 assembler->GoTo(trace->backtrack()); 1477 return; 1478 case NEGATIVE_SUBMATCH_SUCCESS: 1479 // This case is handled in a different virtual method. 1480 UNREACHABLE(); 1481 } 1482 UNIMPLEMENTED(); 1483 } 1484 1485 1486 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1487 if (guards_ == NULL) 1488 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1489 guards_->Add(guard, zone); 1490 } 1491 1492 1493 ActionNode* ActionNode::SetRegister(int reg, 1494 int val, 1495 RegExpNode* on_success) { 1496 ActionNode* result = 1497 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1498 result->data_.u_store_register.reg = reg; 1499 result->data_.u_store_register.value = val; 1500 return result; 1501 } 1502 1503 1504 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1505 ActionNode* result = 1506 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1507 result->data_.u_increment_register.reg = reg; 1508 return result; 1509 } 1510 1511 1512 ActionNode* ActionNode::StorePosition(int reg, 1513 bool is_capture, 1514 RegExpNode* on_success) { 1515 ActionNode* result = 1516 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1517 result->data_.u_position_register.reg = reg; 1518 result->data_.u_position_register.is_capture = is_capture; 1519 return result; 1520 } 1521 1522 1523 ActionNode* ActionNode::ClearCaptures(Interval range, 1524 RegExpNode* on_success) { 1525 ActionNode* result = 1526 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1527 result->data_.u_clear_captures.range_from = range.from(); 1528 result->data_.u_clear_captures.range_to = range.to(); 1529 return result; 1530 } 1531 1532 1533 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1534 int position_reg, 1535 RegExpNode* on_success) { 1536 ActionNode* result = 1537 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1538 result->data_.u_submatch.stack_pointer_register = stack_reg; 1539 result->data_.u_submatch.current_position_register = position_reg; 1540 return result; 1541 } 1542 1543 1544 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1545 int position_reg, 1546 int clear_register_count, 1547 int clear_register_from, 1548 RegExpNode* on_success) { 1549 ActionNode* result = 1550 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1551 result->data_.u_submatch.stack_pointer_register = stack_reg; 1552 result->data_.u_submatch.current_position_register = position_reg; 1553 result->data_.u_submatch.clear_register_count = clear_register_count; 1554 result->data_.u_submatch.clear_register_from = clear_register_from; 1555 return result; 1556 } 1557 1558 1559 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1560 int repetition_register, 1561 int repetition_limit, 1562 RegExpNode* on_success) { 1563 ActionNode* result = 1564 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1565 result->data_.u_empty_match_check.start_register = start_register; 1566 result->data_.u_empty_match_check.repetition_register = repetition_register; 1567 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1568 return result; 1569 } 1570 1571 1572 #define DEFINE_ACCEPT(Type) \ 1573 void Type##Node::Accept(NodeVisitor* visitor) { \ 1574 visitor->Visit##Type(this); \ 1575 } 1576 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1577 #undef DEFINE_ACCEPT 1578 1579 1580 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1581 visitor->VisitLoopChoice(this); 1582 } 1583 1584 1585 // ------------------------------------------------------------------- 1586 // Emit code. 1587 1588 1589 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1590 Guard* guard, 1591 Trace* trace) { 1592 switch (guard->op()) { 1593 case Guard::LT: 1594 ASSERT(!trace->mentions_reg(guard->reg())); 1595 macro_assembler->IfRegisterGE(guard->reg(), 1596 guard->value(), 1597 trace->backtrack()); 1598 break; 1599 case Guard::GEQ: 1600 ASSERT(!trace->mentions_reg(guard->reg())); 1601 macro_assembler->IfRegisterLT(guard->reg(), 1602 guard->value(), 1603 trace->backtrack()); 1604 break; 1605 } 1606 } 1607 1608 1609 // Returns the number of characters in the equivalence class, omitting those 1610 // that cannot occur in the source string because it is ASCII. 1611 static int GetCaseIndependentLetters(Isolate* isolate, 1612 uc16 character, 1613 bool ascii_subject, 1614 unibrow::uchar* letters) { 1615 int length = 1616 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1617 // Unibrow returns 0 or 1 for characters where case independence is 1618 // trivial. 1619 if (length == 0) { 1620 letters[0] = character; 1621 length = 1; 1622 } 1623 if (!ascii_subject || character <= String::kMaxOneByteCharCode) { 1624 return length; 1625 } 1626 // The standard requires that non-ASCII characters cannot have ASCII 1627 // character codes in their equivalence class. 1628 return 0; 1629 } 1630 1631 1632 static inline bool EmitSimpleCharacter(Isolate* isolate, 1633 RegExpCompiler* compiler, 1634 uc16 c, 1635 Label* on_failure, 1636 int cp_offset, 1637 bool check, 1638 bool preloaded) { 1639 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1640 bool bound_checked = false; 1641 if (!preloaded) { 1642 assembler->LoadCurrentCharacter( 1643 cp_offset, 1644 on_failure, 1645 check); 1646 bound_checked = true; 1647 } 1648 assembler->CheckNotCharacter(c, on_failure); 1649 return bound_checked; 1650 } 1651 1652 1653 // Only emits non-letters (things that don't have case). Only used for case 1654 // independent matches. 1655 static inline bool EmitAtomNonLetter(Isolate* isolate, 1656 RegExpCompiler* compiler, 1657 uc16 c, 1658 Label* on_failure, 1659 int cp_offset, 1660 bool check, 1661 bool preloaded) { 1662 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1663 bool ascii = compiler->ascii(); 1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1665 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1666 if (length < 1) { 1667 // This can't match. Must be an ASCII subject and a non-ASCII character. 1668 // We do not need to do anything since the ASCII pass already handled this. 1669 return false; // Bounds not checked. 1670 } 1671 bool checked = false; 1672 // We handle the length > 1 case in a later pass. 1673 if (length == 1) { 1674 if (ascii && c > String::kMaxOneByteCharCodeU) { 1675 // Can't match - see above. 1676 return false; // Bounds not checked. 1677 } 1678 if (!preloaded) { 1679 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1680 checked = check; 1681 } 1682 macro_assembler->CheckNotCharacter(c, on_failure); 1683 } 1684 return checked; 1685 } 1686 1687 1688 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1689 bool ascii, 1690 uc16 c1, 1691 uc16 c2, 1692 Label* on_failure) { 1693 uc16 char_mask; 1694 if (ascii) { 1695 char_mask = String::kMaxOneByteCharCode; 1696 } else { 1697 char_mask = String::kMaxUtf16CodeUnit; 1698 } 1699 uc16 exor = c1 ^ c2; 1700 // Check whether exor has only one bit set. 1701 if (((exor - 1) & exor) == 0) { 1702 // If c1 and c2 differ only by one bit. 1703 // Ecma262UnCanonicalize always gives the highest number last. 1704 ASSERT(c2 > c1); 1705 uc16 mask = char_mask ^ exor; 1706 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1707 return true; 1708 } 1709 ASSERT(c2 > c1); 1710 uc16 diff = c2 - c1; 1711 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1712 // If the characters differ by 2^n but don't differ by one bit then 1713 // subtract the difference from the found character, then do the or 1714 // trick. We avoid the theoretical case where negative numbers are 1715 // involved in order to simplify code generation. 1716 uc16 mask = char_mask ^ diff; 1717 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1718 diff, 1719 mask, 1720 on_failure); 1721 return true; 1722 } 1723 return false; 1724 } 1725 1726 1727 typedef bool EmitCharacterFunction(Isolate* isolate, 1728 RegExpCompiler* compiler, 1729 uc16 c, 1730 Label* on_failure, 1731 int cp_offset, 1732 bool check, 1733 bool preloaded); 1734 1735 // Only emits letters (things that have case). Only used for case independent 1736 // matches. 1737 static inline bool EmitAtomLetter(Isolate* isolate, 1738 RegExpCompiler* compiler, 1739 uc16 c, 1740 Label* on_failure, 1741 int cp_offset, 1742 bool check, 1743 bool preloaded) { 1744 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1745 bool ascii = compiler->ascii(); 1746 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1747 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1748 if (length <= 1) return false; 1749 // We may not need to check against the end of the input string 1750 // if this character lies before a character that matched. 1751 if (!preloaded) { 1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1753 } 1754 Label ok; 1755 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1756 switch (length) { 1757 case 2: { 1758 if (ShortCutEmitCharacterPair(macro_assembler, 1759 ascii, 1760 chars[0], 1761 chars[1], 1762 on_failure)) { 1763 } else { 1764 macro_assembler->CheckCharacter(chars[0], &ok); 1765 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1766 macro_assembler->Bind(&ok); 1767 } 1768 break; 1769 } 1770 case 4: 1771 macro_assembler->CheckCharacter(chars[3], &ok); 1772 // Fall through! 1773 case 3: 1774 macro_assembler->CheckCharacter(chars[0], &ok); 1775 macro_assembler->CheckCharacter(chars[1], &ok); 1776 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1777 macro_assembler->Bind(&ok); 1778 break; 1779 default: 1780 UNREACHABLE(); 1781 break; 1782 } 1783 return true; 1784 } 1785 1786 1787 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1788 int border, 1789 Label* fall_through, 1790 Label* above_or_equal, 1791 Label* below) { 1792 if (below != fall_through) { 1793 masm->CheckCharacterLT(border, below); 1794 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1795 } else { 1796 masm->CheckCharacterGT(border - 1, above_or_equal); 1797 } 1798 } 1799 1800 1801 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1802 int first, 1803 int last, 1804 Label* fall_through, 1805 Label* in_range, 1806 Label* out_of_range) { 1807 if (in_range == fall_through) { 1808 if (first == last) { 1809 masm->CheckNotCharacter(first, out_of_range); 1810 } else { 1811 masm->CheckCharacterNotInRange(first, last, out_of_range); 1812 } 1813 } else { 1814 if (first == last) { 1815 masm->CheckCharacter(first, in_range); 1816 } else { 1817 masm->CheckCharacterInRange(first, last, in_range); 1818 } 1819 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1820 } 1821 } 1822 1823 1824 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1825 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1826 static void EmitUseLookupTable( 1827 RegExpMacroAssembler* masm, 1828 ZoneList<int>* ranges, 1829 int start_index, 1830 int end_index, 1831 int min_char, 1832 Label* fall_through, 1833 Label* even_label, 1834 Label* odd_label) { 1835 static const int kSize = RegExpMacroAssembler::kTableSize; 1836 static const int kMask = RegExpMacroAssembler::kTableMask; 1837 1838 int base = (min_char & ~kMask); 1839 USE(base); 1840 1841 // Assert that everything is on one kTableSize page. 1842 for (int i = start_index; i <= end_index; i++) { 1843 ASSERT_EQ(ranges->at(i) & ~kMask, base); 1844 } 1845 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1846 1847 char templ[kSize]; 1848 Label* on_bit_set; 1849 Label* on_bit_clear; 1850 int bit; 1851 if (even_label == fall_through) { 1852 on_bit_set = odd_label; 1853 on_bit_clear = even_label; 1854 bit = 1; 1855 } else { 1856 on_bit_set = even_label; 1857 on_bit_clear = odd_label; 1858 bit = 0; 1859 } 1860 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1861 templ[i] = bit; 1862 } 1863 int j = 0; 1864 bit ^= 1; 1865 for (int i = start_index; i < end_index; i++) { 1866 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1867 templ[j] = bit; 1868 } 1869 bit ^= 1; 1870 } 1871 for (int i = j; i < kSize; i++) { 1872 templ[i] = bit; 1873 } 1874 Factory* factory = Isolate::Current()->factory(); 1875 // TODO(erikcorry): Cache these. 1876 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1877 for (int i = 0; i < kSize; i++) { 1878 ba->set(i, templ[i]); 1879 } 1880 masm->CheckBitInTable(ba, on_bit_set); 1881 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1882 } 1883 1884 1885 static void CutOutRange(RegExpMacroAssembler* masm, 1886 ZoneList<int>* ranges, 1887 int start_index, 1888 int end_index, 1889 int cut_index, 1890 Label* even_label, 1891 Label* odd_label) { 1892 bool odd = (((cut_index - start_index) & 1) == 1); 1893 Label* in_range_label = odd ? odd_label : even_label; 1894 Label dummy; 1895 EmitDoubleBoundaryTest(masm, 1896 ranges->at(cut_index), 1897 ranges->at(cut_index + 1) - 1, 1898 &dummy, 1899 in_range_label, 1900 &dummy); 1901 ASSERT(!dummy.is_linked()); 1902 // Cut out the single range by rewriting the array. This creates a new 1903 // range that is a merger of the two ranges on either side of the one we 1904 // are cutting out. The oddity of the labels is preserved. 1905 for (int j = cut_index; j > start_index; j--) { 1906 ranges->at(j) = ranges->at(j - 1); 1907 } 1908 for (int j = cut_index + 1; j < end_index; j++) { 1909 ranges->at(j) = ranges->at(j + 1); 1910 } 1911 } 1912 1913 1914 // Unicode case. Split the search space into kSize spaces that are handled 1915 // with recursion. 1916 static void SplitSearchSpace(ZoneList<int>* ranges, 1917 int start_index, 1918 int end_index, 1919 int* new_start_index, 1920 int* new_end_index, 1921 int* border) { 1922 static const int kSize = RegExpMacroAssembler::kTableSize; 1923 static const int kMask = RegExpMacroAssembler::kTableMask; 1924 1925 int first = ranges->at(start_index); 1926 int last = ranges->at(end_index) - 1; 1927 1928 *new_start_index = start_index; 1929 *border = (ranges->at(start_index) & ~kMask) + kSize; 1930 while (*new_start_index < end_index) { 1931 if (ranges->at(*new_start_index) > *border) break; 1932 (*new_start_index)++; 1933 } 1934 // new_start_index is the index of the first edge that is beyond the 1935 // current kSize space. 1936 1937 // For very large search spaces we do a binary chop search of the non-ASCII 1938 // space instead of just going to the end of the current kSize space. The 1939 // heuristics are complicated a little by the fact that any 128-character 1940 // encoding space can be quickly tested with a table lookup, so we don't 1941 // wish to do binary chop search at a smaller granularity than that. A 1942 // 128-character space can take up a lot of space in the ranges array if, 1943 // for example, we only want to match every second character (eg. the lower 1944 // case characters on some Unicode pages). 1945 int binary_chop_index = (end_index + start_index) / 2; 1946 // The first test ensures that we get to the code that handles the ASCII 1947 // range with a single not-taken branch, speeding up this important 1948 // character range (even non-ASCII charset-based text has spaces and 1949 // punctuation). 1950 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. 1951 end_index - start_index > (*new_start_index - start_index) * 2 && 1952 last - first > kSize * 2 && 1953 binary_chop_index > *new_start_index && 1954 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1955 int scan_forward_for_section_border = binary_chop_index;; 1956 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1957 1958 while (scan_forward_for_section_border < end_index) { 1959 if (ranges->at(scan_forward_for_section_border) > new_border) { 1960 *new_start_index = scan_forward_for_section_border; 1961 *border = new_border; 1962 break; 1963 } 1964 scan_forward_for_section_border++; 1965 } 1966 } 1967 1968 ASSERT(*new_start_index > start_index); 1969 *new_end_index = *new_start_index - 1; 1970 if (ranges->at(*new_end_index) == *border) { 1971 (*new_end_index)--; 1972 } 1973 if (*border >= ranges->at(end_index)) { 1974 *border = ranges->at(end_index); 1975 *new_start_index = end_index; // Won't be used. 1976 *new_end_index = end_index - 1; 1977 } 1978 } 1979 1980 1981 // Gets a series of segment boundaries representing a character class. If the 1982 // character is in the range between an even and an odd boundary (counting from 1983 // start_index) then go to even_label, otherwise go to odd_label. We already 1984 // know that the character is in the range of min_char to max_char inclusive. 1985 // Either label can be NULL indicating backtracking. Either label can also be 1986 // equal to the fall_through label. 1987 static void GenerateBranches(RegExpMacroAssembler* masm, 1988 ZoneList<int>* ranges, 1989 int start_index, 1990 int end_index, 1991 uc16 min_char, 1992 uc16 max_char, 1993 Label* fall_through, 1994 Label* even_label, 1995 Label* odd_label) { 1996 int first = ranges->at(start_index); 1997 int last = ranges->at(end_index) - 1; 1998 1999 ASSERT_LT(min_char, first); 2000 2001 // Just need to test if the character is before or on-or-after 2002 // a particular character. 2003 if (start_index == end_index) { 2004 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 2005 return; 2006 } 2007 2008 // Another almost trivial case: There is one interval in the middle that is 2009 // different from the end intervals. 2010 if (start_index + 1 == end_index) { 2011 EmitDoubleBoundaryTest( 2012 masm, first, last, fall_through, even_label, odd_label); 2013 return; 2014 } 2015 2016 // It's not worth using table lookup if there are very few intervals in the 2017 // character class. 2018 if (end_index - start_index <= 6) { 2019 // It is faster to test for individual characters, so we look for those 2020 // first, then try arbitrary ranges in the second round. 2021 static int kNoCutIndex = -1; 2022 int cut = kNoCutIndex; 2023 for (int i = start_index; i < end_index; i++) { 2024 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2025 cut = i; 2026 break; 2027 } 2028 } 2029 if (cut == kNoCutIndex) cut = start_index; 2030 CutOutRange( 2031 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2032 ASSERT_GE(end_index - start_index, 2); 2033 GenerateBranches(masm, 2034 ranges, 2035 start_index + 1, 2036 end_index - 1, 2037 min_char, 2038 max_char, 2039 fall_through, 2040 even_label, 2041 odd_label); 2042 return; 2043 } 2044 2045 // If there are a lot of intervals in the regexp, then we will use tables to 2046 // determine whether the character is inside or outside the character class. 2047 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2048 2049 if ((max_char >> kBits) == (min_char >> kBits)) { 2050 EmitUseLookupTable(masm, 2051 ranges, 2052 start_index, 2053 end_index, 2054 min_char, 2055 fall_through, 2056 even_label, 2057 odd_label); 2058 return; 2059 } 2060 2061 if ((min_char >> kBits) != (first >> kBits)) { 2062 masm->CheckCharacterLT(first, odd_label); 2063 GenerateBranches(masm, 2064 ranges, 2065 start_index + 1, 2066 end_index, 2067 first, 2068 max_char, 2069 fall_through, 2070 odd_label, 2071 even_label); 2072 return; 2073 } 2074 2075 int new_start_index = 0; 2076 int new_end_index = 0; 2077 int border = 0; 2078 2079 SplitSearchSpace(ranges, 2080 start_index, 2081 end_index, 2082 &new_start_index, 2083 &new_end_index, 2084 &border); 2085 2086 Label handle_rest; 2087 Label* above = &handle_rest; 2088 if (border == last + 1) { 2089 // We didn't find any section that started after the limit, so everything 2090 // above the border is one of the terminal labels. 2091 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2092 ASSERT(new_end_index == end_index - 1); 2093 } 2094 2095 ASSERT_LE(start_index, new_end_index); 2096 ASSERT_LE(new_start_index, end_index); 2097 ASSERT_LT(start_index, new_start_index); 2098 ASSERT_LT(new_end_index, end_index); 2099 ASSERT(new_end_index + 1 == new_start_index || 2100 (new_end_index + 2 == new_start_index && 2101 border == ranges->at(new_end_index + 1))); 2102 ASSERT_LT(min_char, border - 1); 2103 ASSERT_LT(border, max_char); 2104 ASSERT_LT(ranges->at(new_end_index), border); 2105 ASSERT(border < ranges->at(new_start_index) || 2106 (border == ranges->at(new_start_index) && 2107 new_start_index == end_index && 2108 new_end_index == end_index - 1 && 2109 border == last + 1)); 2110 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2111 2112 masm->CheckCharacterGT(border - 1, above); 2113 Label dummy; 2114 GenerateBranches(masm, 2115 ranges, 2116 start_index, 2117 new_end_index, 2118 min_char, 2119 border - 1, 2120 &dummy, 2121 even_label, 2122 odd_label); 2123 if (handle_rest.is_linked()) { 2124 masm->Bind(&handle_rest); 2125 bool flip = (new_start_index & 1) != (start_index & 1); 2126 GenerateBranches(masm, 2127 ranges, 2128 new_start_index, 2129 end_index, 2130 border, 2131 max_char, 2132 &dummy, 2133 flip ? odd_label : even_label, 2134 flip ? even_label : odd_label); 2135 } 2136 } 2137 2138 2139 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2140 RegExpCharacterClass* cc, 2141 bool ascii, 2142 Label* on_failure, 2143 int cp_offset, 2144 bool check_offset, 2145 bool preloaded, 2146 Zone* zone) { 2147 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2148 if (!CharacterRange::IsCanonical(ranges)) { 2149 CharacterRange::Canonicalize(ranges); 2150 } 2151 2152 int max_char; 2153 if (ascii) { 2154 max_char = String::kMaxOneByteCharCode; 2155 } else { 2156 max_char = String::kMaxUtf16CodeUnit; 2157 } 2158 2159 int range_count = ranges->length(); 2160 2161 int last_valid_range = range_count - 1; 2162 while (last_valid_range >= 0) { 2163 CharacterRange& range = ranges->at(last_valid_range); 2164 if (range.from() <= max_char) { 2165 break; 2166 } 2167 last_valid_range--; 2168 } 2169 2170 if (last_valid_range < 0) { 2171 if (!cc->is_negated()) { 2172 macro_assembler->GoTo(on_failure); 2173 } 2174 if (check_offset) { 2175 macro_assembler->CheckPosition(cp_offset, on_failure); 2176 } 2177 return; 2178 } 2179 2180 if (last_valid_range == 0 && 2181 ranges->at(0).IsEverything(max_char)) { 2182 if (cc->is_negated()) { 2183 macro_assembler->GoTo(on_failure); 2184 } else { 2185 // This is a common case hit by non-anchored expressions. 2186 if (check_offset) { 2187 macro_assembler->CheckPosition(cp_offset, on_failure); 2188 } 2189 } 2190 return; 2191 } 2192 if (last_valid_range == 0 && 2193 !cc->is_negated() && 2194 ranges->at(0).IsEverything(max_char)) { 2195 // This is a common case hit by non-anchored expressions. 2196 if (check_offset) { 2197 macro_assembler->CheckPosition(cp_offset, on_failure); 2198 } 2199 return; 2200 } 2201 2202 if (!preloaded) { 2203 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2204 } 2205 2206 if (cc->is_standard(zone) && 2207 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2208 on_failure)) { 2209 return; 2210 } 2211 2212 2213 // A new list with ascending entries. Each entry is a code unit 2214 // where there is a boundary between code units that are part of 2215 // the class and code units that are not. Normally we insert an 2216 // entry at zero which goes to the failure label, but if there 2217 // was already one there we fall through for success on that entry. 2218 // Subsequent entries have alternating meaning (success/failure). 2219 ZoneList<int>* range_boundaries = 2220 new(zone) ZoneList<int>(last_valid_range, zone); 2221 2222 bool zeroth_entry_is_failure = !cc->is_negated(); 2223 2224 for (int i = 0; i <= last_valid_range; i++) { 2225 CharacterRange& range = ranges->at(i); 2226 if (range.from() == 0) { 2227 ASSERT_EQ(i, 0); 2228 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2229 } else { 2230 range_boundaries->Add(range.from(), zone); 2231 } 2232 range_boundaries->Add(range.to() + 1, zone); 2233 } 2234 int end_index = range_boundaries->length() - 1; 2235 if (range_boundaries->at(end_index) > max_char) { 2236 end_index--; 2237 } 2238 2239 Label fall_through; 2240 GenerateBranches(macro_assembler, 2241 range_boundaries, 2242 0, // start_index. 2243 end_index, 2244 0, // min_char. 2245 max_char, 2246 &fall_through, 2247 zeroth_entry_is_failure ? &fall_through : on_failure, 2248 zeroth_entry_is_failure ? on_failure : &fall_through); 2249 macro_assembler->Bind(&fall_through); 2250 } 2251 2252 2253 RegExpNode::~RegExpNode() { 2254 } 2255 2256 2257 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2258 Trace* trace) { 2259 // If we are generating a greedy loop then don't stop and don't reuse code. 2260 if (trace->stop_node() != NULL) { 2261 return CONTINUE; 2262 } 2263 2264 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2265 if (trace->is_trivial()) { 2266 if (label_.is_bound()) { 2267 // We are being asked to generate a generic version, but that's already 2268 // been done so just go to it. 2269 macro_assembler->GoTo(&label_); 2270 return DONE; 2271 } 2272 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 2273 // To avoid too deep recursion we push the node to the work queue and just 2274 // generate a goto here. 2275 compiler->AddWork(this); 2276 macro_assembler->GoTo(&label_); 2277 return DONE; 2278 } 2279 // Generate generic version of the node and bind the label for later use. 2280 macro_assembler->Bind(&label_); 2281 return CONTINUE; 2282 } 2283 2284 // We are being asked to make a non-generic version. Keep track of how many 2285 // non-generic versions we generate so as not to overdo it. 2286 trace_count_++; 2287 if (FLAG_regexp_optimization && 2288 trace_count_ < kMaxCopiesCodeGenerated && 2289 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 2290 return CONTINUE; 2291 } 2292 2293 // If we get here code has been generated for this node too many times or 2294 // recursion is too deep. Time to switch to a generic version. The code for 2295 // generic versions above can handle deep recursion properly. 2296 trace->Flush(compiler, this); 2297 return DONE; 2298 } 2299 2300 2301 int ActionNode::EatsAtLeast(int still_to_find, 2302 int budget, 2303 bool not_at_start) { 2304 if (budget <= 0) return 0; 2305 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2306 return on_success()->EatsAtLeast(still_to_find, 2307 budget - 1, 2308 not_at_start); 2309 } 2310 2311 2312 void ActionNode::FillInBMInfo(int offset, 2313 int budget, 2314 BoyerMooreLookahead* bm, 2315 bool not_at_start) { 2316 if (action_type_ == BEGIN_SUBMATCH) { 2317 bm->SetRest(offset); 2318 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2319 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2320 } 2321 SaveBMInfo(bm, not_at_start, offset); 2322 } 2323 2324 2325 int AssertionNode::EatsAtLeast(int still_to_find, 2326 int budget, 2327 bool not_at_start) { 2328 if (budget <= 0) return 0; 2329 // If we know we are not at the start and we are asked "how many characters 2330 // will you match if you succeed?" then we can answer anything since false 2331 // implies false. So lets just return the max answer (still_to_find) since 2332 // that won't prevent us from preloading a lot of characters for the other 2333 // branches in the node graph. 2334 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2335 return on_success()->EatsAtLeast(still_to_find, 2336 budget - 1, 2337 not_at_start); 2338 } 2339 2340 2341 void AssertionNode::FillInBMInfo(int offset, 2342 int budget, 2343 BoyerMooreLookahead* bm, 2344 bool not_at_start) { 2345 // Match the behaviour of EatsAtLeast on this node. 2346 if (assertion_type() == AT_START && not_at_start) return; 2347 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2348 SaveBMInfo(bm, not_at_start, offset); 2349 } 2350 2351 2352 int BackReferenceNode::EatsAtLeast(int still_to_find, 2353 int budget, 2354 bool not_at_start) { 2355 if (budget <= 0) return 0; 2356 return on_success()->EatsAtLeast(still_to_find, 2357 budget - 1, 2358 not_at_start); 2359 } 2360 2361 2362 int TextNode::EatsAtLeast(int still_to_find, 2363 int budget, 2364 bool not_at_start) { 2365 int answer = Length(); 2366 if (answer >= still_to_find) return answer; 2367 if (budget <= 0) return answer; 2368 // We are not at start after this node so we set the last argument to 'true'. 2369 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2370 budget - 1, 2371 true); 2372 } 2373 2374 2375 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 2376 int budget, 2377 bool not_at_start) { 2378 if (budget <= 0) return 0; 2379 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2380 // afterwards. 2381 RegExpNode* node = alternatives_->at(1).node(); 2382 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2383 } 2384 2385 2386 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 2387 QuickCheckDetails* details, 2388 RegExpCompiler* compiler, 2389 int filled_in, 2390 bool not_at_start) { 2391 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2392 // afterwards. 2393 RegExpNode* node = alternatives_->at(1).node(); 2394 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2395 } 2396 2397 2398 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2399 int budget, 2400 RegExpNode* ignore_this_node, 2401 bool not_at_start) { 2402 if (budget <= 0) return 0; 2403 int min = 100; 2404 int choice_count = alternatives_->length(); 2405 budget = (budget - 1) / choice_count; 2406 for (int i = 0; i < choice_count; i++) { 2407 RegExpNode* node = alternatives_->at(i).node(); 2408 if (node == ignore_this_node) continue; 2409 int node_eats_at_least = 2410 node->EatsAtLeast(still_to_find, budget, not_at_start); 2411 if (node_eats_at_least < min) min = node_eats_at_least; 2412 if (min == 0) return 0; 2413 } 2414 return min; 2415 } 2416 2417 2418 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2419 int budget, 2420 bool not_at_start) { 2421 return EatsAtLeastHelper(still_to_find, 2422 budget - 1, 2423 loop_node_, 2424 not_at_start); 2425 } 2426 2427 2428 int ChoiceNode::EatsAtLeast(int still_to_find, 2429 int budget, 2430 bool not_at_start) { 2431 return EatsAtLeastHelper(still_to_find, 2432 budget, 2433 NULL, 2434 not_at_start); 2435 } 2436 2437 2438 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2439 static inline uint32_t SmearBitsRight(uint32_t v) { 2440 v |= v >> 1; 2441 v |= v >> 2; 2442 v |= v >> 4; 2443 v |= v >> 8; 2444 v |= v >> 16; 2445 return v; 2446 } 2447 2448 2449 bool QuickCheckDetails::Rationalize(bool asc) { 2450 bool found_useful_op = false; 2451 uint32_t char_mask; 2452 if (asc) { 2453 char_mask = String::kMaxOneByteCharCode; 2454 } else { 2455 char_mask = String::kMaxUtf16CodeUnit; 2456 } 2457 mask_ = 0; 2458 value_ = 0; 2459 int char_shift = 0; 2460 for (int i = 0; i < characters_; i++) { 2461 Position* pos = &positions_[i]; 2462 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2463 found_useful_op = true; 2464 } 2465 mask_ |= (pos->mask & char_mask) << char_shift; 2466 value_ |= (pos->value & char_mask) << char_shift; 2467 char_shift += asc ? 8 : 16; 2468 } 2469 return found_useful_op; 2470 } 2471 2472 2473 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2474 Trace* trace, 2475 bool preload_has_checked_bounds, 2476 Label* on_possible_success, 2477 QuickCheckDetails* details, 2478 bool fall_through_on_failure) { 2479 if (details->characters() == 0) return false; 2480 GetQuickCheckDetails( 2481 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2482 if (details->cannot_match()) return false; 2483 if (!details->Rationalize(compiler->ascii())) return false; 2484 ASSERT(details->characters() == 1 || 2485 compiler->macro_assembler()->CanReadUnaligned()); 2486 uint32_t mask = details->mask(); 2487 uint32_t value = details->value(); 2488 2489 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2490 2491 if (trace->characters_preloaded() != details->characters()) { 2492 assembler->LoadCurrentCharacter(trace->cp_offset(), 2493 trace->backtrack(), 2494 !preload_has_checked_bounds, 2495 details->characters()); 2496 } 2497 2498 2499 bool need_mask = true; 2500 2501 if (details->characters() == 1) { 2502 // If number of characters preloaded is 1 then we used a byte or 16 bit 2503 // load so the value is already masked down. 2504 uint32_t char_mask; 2505 if (compiler->ascii()) { 2506 char_mask = String::kMaxOneByteCharCode; 2507 } else { 2508 char_mask = String::kMaxUtf16CodeUnit; 2509 } 2510 if ((mask & char_mask) == char_mask) need_mask = false; 2511 mask &= char_mask; 2512 } else { 2513 // For 2-character preloads in ASCII mode or 1-character preloads in 2514 // TWO_BYTE mode we also use a 16 bit load with zero extend. 2515 if (details->characters() == 2 && compiler->ascii()) { 2516 if ((mask & 0xffff) == 0xffff) need_mask = false; 2517 } else if (details->characters() == 1 && !compiler->ascii()) { 2518 if ((mask & 0xffff) == 0xffff) need_mask = false; 2519 } else { 2520 if (mask == 0xffffffff) need_mask = false; 2521 } 2522 } 2523 2524 if (fall_through_on_failure) { 2525 if (need_mask) { 2526 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2527 } else { 2528 assembler->CheckCharacter(value, on_possible_success); 2529 } 2530 } else { 2531 if (need_mask) { 2532 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2533 } else { 2534 assembler->CheckNotCharacter(value, trace->backtrack()); 2535 } 2536 } 2537 return true; 2538 } 2539 2540 2541 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2542 // super-class in the .h file). 2543 // 2544 // We iterate along the text object, building up for each character a 2545 // mask and value that can be used to test for a quick failure to match. 2546 // The masks and values for the positions will be combined into a single 2547 // machine word for the current character width in order to be used in 2548 // generating a quick check. 2549 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2550 RegExpCompiler* compiler, 2551 int characters_filled_in, 2552 bool not_at_start) { 2553 Isolate* isolate = Isolate::Current(); 2554 ASSERT(characters_filled_in < details->characters()); 2555 int characters = details->characters(); 2556 int char_mask; 2557 if (compiler->ascii()) { 2558 char_mask = String::kMaxOneByteCharCode; 2559 } else { 2560 char_mask = String::kMaxUtf16CodeUnit; 2561 } 2562 for (int k = 0; k < elms_->length(); k++) { 2563 TextElement elm = elms_->at(k); 2564 if (elm.text_type == TextElement::ATOM) { 2565 Vector<const uc16> quarks = elm.data.u_atom->data(); 2566 for (int i = 0; i < characters && i < quarks.length(); i++) { 2567 QuickCheckDetails::Position* pos = 2568 details->positions(characters_filled_in); 2569 uc16 c = quarks[i]; 2570 if (c > char_mask) { 2571 // If we expect a non-ASCII character from an ASCII string, 2572 // there is no way we can match. Not even case independent 2573 // matching can turn an ASCII character into non-ASCII or 2574 // vice versa. 2575 details->set_cannot_match(); 2576 pos->determines_perfectly = false; 2577 return; 2578 } 2579 if (compiler->ignore_case()) { 2580 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2581 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 2582 chars); 2583 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 2584 if (length == 1) { 2585 // This letter has no case equivalents, so it's nice and simple 2586 // and the mask-compare will determine definitely whether we have 2587 // a match at this character position. 2588 pos->mask = char_mask; 2589 pos->value = c; 2590 pos->determines_perfectly = true; 2591 } else { 2592 uint32_t common_bits = char_mask; 2593 uint32_t bits = chars[0]; 2594 for (int j = 1; j < length; j++) { 2595 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2596 common_bits ^= differing_bits; 2597 bits &= common_bits; 2598 } 2599 // If length is 2 and common bits has only one zero in it then 2600 // our mask and compare instruction will determine definitely 2601 // whether we have a match at this character position. Otherwise 2602 // it can only be an approximate check. 2603 uint32_t one_zero = (common_bits | ~char_mask); 2604 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2605 pos->determines_perfectly = true; 2606 } 2607 pos->mask = common_bits; 2608 pos->value = bits; 2609 } 2610 } else { 2611 // Don't ignore case. Nice simple case where the mask-compare will 2612 // determine definitely whether we have a match at this character 2613 // position. 2614 pos->mask = char_mask; 2615 pos->value = c; 2616 pos->determines_perfectly = true; 2617 } 2618 characters_filled_in++; 2619 ASSERT(characters_filled_in <= details->characters()); 2620 if (characters_filled_in == details->characters()) { 2621 return; 2622 } 2623 } 2624 } else { 2625 QuickCheckDetails::Position* pos = 2626 details->positions(characters_filled_in); 2627 RegExpCharacterClass* tree = elm.data.u_char_class; 2628 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2629 if (tree->is_negated()) { 2630 // A quick check uses multi-character mask and compare. There is no 2631 // useful way to incorporate a negative char class into this scheme 2632 // so we just conservatively create a mask and value that will always 2633 // succeed. 2634 pos->mask = 0; 2635 pos->value = 0; 2636 } else { 2637 int first_range = 0; 2638 while (ranges->at(first_range).from() > char_mask) { 2639 first_range++; 2640 if (first_range == ranges->length()) { 2641 details->set_cannot_match(); 2642 pos->determines_perfectly = false; 2643 return; 2644 } 2645 } 2646 CharacterRange range = ranges->at(first_range); 2647 uc16 from = range.from(); 2648 uc16 to = range.to(); 2649 if (to > char_mask) { 2650 to = char_mask; 2651 } 2652 uint32_t differing_bits = (from ^ to); 2653 // A mask and compare is only perfect if the differing bits form a 2654 // number like 00011111 with one single block of trailing 1s. 2655 if ((differing_bits & (differing_bits + 1)) == 0 && 2656 from + differing_bits == to) { 2657 pos->determines_perfectly = true; 2658 } 2659 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2660 uint32_t bits = (from & common_bits); 2661 for (int i = first_range + 1; i < ranges->length(); i++) { 2662 CharacterRange range = ranges->at(i); 2663 uc16 from = range.from(); 2664 uc16 to = range.to(); 2665 if (from > char_mask) continue; 2666 if (to > char_mask) to = char_mask; 2667 // Here we are combining more ranges into the mask and compare 2668 // value. With each new range the mask becomes more sparse and 2669 // so the chances of a false positive rise. A character class 2670 // with multiple ranges is assumed never to be equivalent to a 2671 // mask and compare operation. 2672 pos->determines_perfectly = false; 2673 uint32_t new_common_bits = (from ^ to); 2674 new_common_bits = ~SmearBitsRight(new_common_bits); 2675 common_bits &= new_common_bits; 2676 bits &= new_common_bits; 2677 uint32_t differing_bits = (from & common_bits) ^ bits; 2678 common_bits ^= differing_bits; 2679 bits &= common_bits; 2680 } 2681 pos->mask = common_bits; 2682 pos->value = bits; 2683 } 2684 characters_filled_in++; 2685 ASSERT(characters_filled_in <= details->characters()); 2686 if (characters_filled_in == details->characters()) { 2687 return; 2688 } 2689 } 2690 } 2691 ASSERT(characters_filled_in != details->characters()); 2692 if (!details->cannot_match()) { 2693 on_success()-> GetQuickCheckDetails(details, 2694 compiler, 2695 characters_filled_in, 2696 true); 2697 } 2698 } 2699 2700 2701 void QuickCheckDetails::Clear() { 2702 for (int i = 0; i < characters_; i++) { 2703 positions_[i].mask = 0; 2704 positions_[i].value = 0; 2705 positions_[i].determines_perfectly = false; 2706 } 2707 characters_ = 0; 2708 } 2709 2710 2711 void QuickCheckDetails::Advance(int by, bool ascii) { 2712 ASSERT(by >= 0); 2713 if (by >= characters_) { 2714 Clear(); 2715 return; 2716 } 2717 for (int i = 0; i < characters_ - by; i++) { 2718 positions_[i] = positions_[by + i]; 2719 } 2720 for (int i = characters_ - by; i < characters_; i++) { 2721 positions_[i].mask = 0; 2722 positions_[i].value = 0; 2723 positions_[i].determines_perfectly = false; 2724 } 2725 characters_ -= by; 2726 // We could change mask_ and value_ here but we would never advance unless 2727 // they had already been used in a check and they won't be used again because 2728 // it would gain us nothing. So there's no point. 2729 } 2730 2731 2732 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2733 ASSERT(characters_ == other->characters_); 2734 if (other->cannot_match_) { 2735 return; 2736 } 2737 if (cannot_match_) { 2738 *this = *other; 2739 return; 2740 } 2741 for (int i = from_index; i < characters_; i++) { 2742 QuickCheckDetails::Position* pos = positions(i); 2743 QuickCheckDetails::Position* other_pos = other->positions(i); 2744 if (pos->mask != other_pos->mask || 2745 pos->value != other_pos->value || 2746 !other_pos->determines_perfectly) { 2747 // Our mask-compare operation will be approximate unless we have the 2748 // exact same operation on both sides of the alternation. 2749 pos->determines_perfectly = false; 2750 } 2751 pos->mask &= other_pos->mask; 2752 pos->value &= pos->mask; 2753 other_pos->value &= pos->mask; 2754 uc16 differing_bits = (pos->value ^ other_pos->value); 2755 pos->mask &= ~differing_bits; 2756 pos->value &= pos->mask; 2757 } 2758 } 2759 2760 2761 class VisitMarker { 2762 public: 2763 explicit VisitMarker(NodeInfo* info) : info_(info) { 2764 ASSERT(!info->visited); 2765 info->visited = true; 2766 } 2767 ~VisitMarker() { 2768 info_->visited = false; 2769 } 2770 private: 2771 NodeInfo* info_; 2772 }; 2773 2774 2775 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { 2776 if (info()->replacement_calculated) return replacement(); 2777 if (depth < 0) return this; 2778 ASSERT(!info()->visited); 2779 VisitMarker marker(info()); 2780 return FilterSuccessor(depth - 1, ignore_case); 2781 } 2782 2783 2784 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2785 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); 2786 if (next == NULL) return set_replacement(NULL); 2787 on_success_ = next; 2788 return set_replacement(this); 2789 } 2790 2791 2792 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2793 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2794 // TODO(dcarney): this could be a lot more efficient. 2795 return range.Contains(0x39c) || 2796 range.Contains(0x3bc) || range.Contains(0x178); 2797 } 2798 2799 2800 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2801 for (int i = 0; i < ranges->length(); i++) { 2802 // TODO(dcarney): this could be a lot more efficient. 2803 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2804 } 2805 return false; 2806 } 2807 2808 2809 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { 2810 if (info()->replacement_calculated) return replacement(); 2811 if (depth < 0) return this; 2812 ASSERT(!info()->visited); 2813 VisitMarker marker(info()); 2814 int element_count = elms_->length(); 2815 for (int i = 0; i < element_count; i++) { 2816 TextElement elm = elms_->at(i); 2817 if (elm.text_type == TextElement::ATOM) { 2818 Vector<const uc16> quarks = elm.data.u_atom->data(); 2819 for (int j = 0; j < quarks.length(); j++) { 2820 uint16_t c = quarks[j]; 2821 if (c <= String::kMaxOneByteCharCode) continue; 2822 if (!ignore_case) return set_replacement(NULL); 2823 // Here, we need to check for characters whose upper and lower cases 2824 // are outside the Latin-1 range. 2825 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2826 // Character is outside Latin-1 completely 2827 if (converted == 0) return set_replacement(NULL); 2828 // Convert quark to Latin-1 in place. 2829 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2830 copy[j] = converted; 2831 } 2832 } else { 2833 ASSERT(elm.text_type == TextElement::CHAR_CLASS); 2834 RegExpCharacterClass* cc = elm.data.u_char_class; 2835 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2836 if (!CharacterRange::IsCanonical(ranges)) { 2837 CharacterRange::Canonicalize(ranges); 2838 } 2839 // Now they are in order so we only need to look at the first. 2840 int range_count = ranges->length(); 2841 if (cc->is_negated()) { 2842 if (range_count != 0 && 2843 ranges->at(0).from() == 0 && 2844 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2845 // This will be handled in a later filter. 2846 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2847 return set_replacement(NULL); 2848 } 2849 } else { 2850 if (range_count == 0 || 2851 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2852 // This will be handled in a later filter. 2853 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2854 return set_replacement(NULL); 2855 } 2856 } 2857 } 2858 } 2859 return FilterSuccessor(depth - 1, ignore_case); 2860 } 2861 2862 2863 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { 2864 if (info()->replacement_calculated) return replacement(); 2865 if (depth < 0) return this; 2866 if (info()->visited) return this; 2867 { 2868 VisitMarker marker(info()); 2869 2870 RegExpNode* continue_replacement = 2871 continue_node_->FilterASCII(depth - 1, ignore_case); 2872 // If we can't continue after the loop then there is no sense in doing the 2873 // loop. 2874 if (continue_replacement == NULL) return set_replacement(NULL); 2875 } 2876 2877 return ChoiceNode::FilterASCII(depth - 1, ignore_case); 2878 } 2879 2880 2881 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { 2882 if (info()->replacement_calculated) return replacement(); 2883 if (depth < 0) return this; 2884 if (info()->visited) return this; 2885 VisitMarker marker(info()); 2886 int choice_count = alternatives_->length(); 2887 2888 for (int i = 0; i < choice_count; i++) { 2889 GuardedAlternative alternative = alternatives_->at(i); 2890 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2891 set_replacement(this); 2892 return this; 2893 } 2894 } 2895 2896 int surviving = 0; 2897 RegExpNode* survivor = NULL; 2898 for (int i = 0; i < choice_count; i++) { 2899 GuardedAlternative alternative = alternatives_->at(i); 2900 RegExpNode* replacement = 2901 alternative.node()->FilterASCII(depth - 1, ignore_case); 2902 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. 2903 if (replacement != NULL) { 2904 alternatives_->at(i).set_node(replacement); 2905 surviving++; 2906 survivor = replacement; 2907 } 2908 } 2909 if (surviving < 2) return set_replacement(survivor); 2910 2911 set_replacement(this); 2912 if (surviving == choice_count) { 2913 return this; 2914 } 2915 // Only some of the nodes survived the filtering. We need to rebuild the 2916 // alternatives list. 2917 ZoneList<GuardedAlternative>* new_alternatives = 2918 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2919 for (int i = 0; i < choice_count; i++) { 2920 RegExpNode* replacement = 2921 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); 2922 if (replacement != NULL) { 2923 alternatives_->at(i).set_node(replacement); 2924 new_alternatives->Add(alternatives_->at(i), zone()); 2925 } 2926 } 2927 alternatives_ = new_alternatives; 2928 return this; 2929 } 2930 2931 2932 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, 2933 bool ignore_case) { 2934 if (info()->replacement_calculated) return replacement(); 2935 if (depth < 0) return this; 2936 if (info()->visited) return this; 2937 VisitMarker marker(info()); 2938 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2939 // afterwards. 2940 RegExpNode* node = alternatives_->at(1).node(); 2941 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); 2942 if (replacement == NULL) return set_replacement(NULL); 2943 alternatives_->at(1).set_node(replacement); 2944 2945 RegExpNode* neg_node = alternatives_->at(0).node(); 2946 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); 2947 // If the negative lookahead is always going to fail then 2948 // we don't need to check it. 2949 if (neg_replacement == NULL) return set_replacement(replacement); 2950 alternatives_->at(0).set_node(neg_replacement); 2951 return set_replacement(this); 2952 } 2953 2954 2955 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2956 RegExpCompiler* compiler, 2957 int characters_filled_in, 2958 bool not_at_start) { 2959 if (body_can_be_zero_length_ || info()->visited) return; 2960 VisitMarker marker(info()); 2961 return ChoiceNode::GetQuickCheckDetails(details, 2962 compiler, 2963 characters_filled_in, 2964 not_at_start); 2965 } 2966 2967 2968 void LoopChoiceNode::FillInBMInfo(int offset, 2969 int budget, 2970 BoyerMooreLookahead* bm, 2971 bool not_at_start) { 2972 if (body_can_be_zero_length_ || budget <= 0) { 2973 bm->SetRest(offset); 2974 SaveBMInfo(bm, not_at_start, offset); 2975 return; 2976 } 2977 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start); 2978 SaveBMInfo(bm, not_at_start, offset); 2979 } 2980 2981 2982 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2983 RegExpCompiler* compiler, 2984 int characters_filled_in, 2985 bool not_at_start) { 2986 not_at_start = (not_at_start || not_at_start_); 2987 int choice_count = alternatives_->length(); 2988 ASSERT(choice_count > 0); 2989 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2990 compiler, 2991 characters_filled_in, 2992 not_at_start); 2993 for (int i = 1; i < choice_count; i++) { 2994 QuickCheckDetails new_details(details->characters()); 2995 RegExpNode* node = alternatives_->at(i).node(); 2996 node->GetQuickCheckDetails(&new_details, compiler, 2997 characters_filled_in, 2998 not_at_start); 2999 // Here we merge the quick match details of the two branches. 3000 details->Merge(&new_details, characters_filled_in); 3001 } 3002 } 3003 3004 3005 // Check for [0-9A-Z_a-z]. 3006 static void EmitWordCheck(RegExpMacroAssembler* assembler, 3007 Label* word, 3008 Label* non_word, 3009 bool fall_through_on_word) { 3010 if (assembler->CheckSpecialCharacterClass( 3011 fall_through_on_word ? 'w' : 'W', 3012 fall_through_on_word ? non_word : word)) { 3013 // Optimized implementation available. 3014 return; 3015 } 3016 assembler->CheckCharacterGT('z', non_word); 3017 assembler->CheckCharacterLT('0', non_word); 3018 assembler->CheckCharacterGT('a' - 1, word); 3019 assembler->CheckCharacterLT('9' + 1, word); 3020 assembler->CheckCharacterLT('A', non_word); 3021 assembler->CheckCharacterLT('Z' + 1, word); 3022 if (fall_through_on_word) { 3023 assembler->CheckNotCharacter('_', non_word); 3024 } else { 3025 assembler->CheckCharacter('_', word); 3026 } 3027 } 3028 3029 3030 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 3031 // that matches newline or the start of input). 3032 static void EmitHat(RegExpCompiler* compiler, 3033 RegExpNode* on_success, 3034 Trace* trace) { 3035 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3036 // We will be loading the previous character into the current character 3037 // register. 3038 Trace new_trace(*trace); 3039 new_trace.InvalidateCurrentCharacter(); 3040 3041 Label ok; 3042 if (new_trace.cp_offset() == 0) { 3043 // The start of input counts as a newline in this context, so skip to 3044 // ok if we are at the start. 3045 assembler->CheckAtStart(&ok); 3046 } 3047 // We already checked that we are not at the start of input so it must be 3048 // OK to load the previous character. 3049 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3050 new_trace.backtrack(), 3051 false); 3052 if (!assembler->CheckSpecialCharacterClass('n', 3053 new_trace.backtrack())) { 3054 // Newline means \n, \r, 0x2028 or 0x2029. 3055 if (!compiler->ascii()) { 3056 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3057 } 3058 assembler->CheckCharacter('\n', &ok); 3059 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3060 } 3061 assembler->Bind(&ok); 3062 on_success->Emit(compiler, &new_trace); 3063 } 3064 3065 3066 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3067 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3068 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3069 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3070 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3071 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3072 if (lookahead == NULL) { 3073 int eats_at_least = 3074 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3075 kRecursionBudget, 3076 not_at_start)); 3077 if (eats_at_least >= 1) { 3078 BoyerMooreLookahead* bm = 3079 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3080 FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 3081 if (bm->at(0)->is_non_word()) 3082 next_is_word_character = Trace::FALSE_VALUE; 3083 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3084 } 3085 } else { 3086 if (lookahead->at(0)->is_non_word()) 3087 next_is_word_character = Trace::FALSE_VALUE; 3088 if (lookahead->at(0)->is_word()) 3089 next_is_word_character = Trace::TRUE_VALUE; 3090 } 3091 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3092 if (next_is_word_character == Trace::UNKNOWN) { 3093 Label before_non_word; 3094 Label before_word; 3095 if (trace->characters_preloaded() != 1) { 3096 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3097 } 3098 // Fall through on non-word. 3099 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3100 // Next character is not a word character. 3101 assembler->Bind(&before_non_word); 3102 Label ok; 3103 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3104 assembler->GoTo(&ok); 3105 3106 assembler->Bind(&before_word); 3107 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3108 assembler->Bind(&ok); 3109 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3110 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3111 } else { 3112 ASSERT(next_is_word_character == Trace::FALSE_VALUE); 3113 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3114 } 3115 } 3116 3117 3118 void AssertionNode::BacktrackIfPrevious( 3119 RegExpCompiler* compiler, 3120 Trace* trace, 3121 AssertionNode::IfPrevious backtrack_if_previous) { 3122 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3123 Trace new_trace(*trace); 3124 new_trace.InvalidateCurrentCharacter(); 3125 3126 Label fall_through, dummy; 3127 3128 Label* non_word = backtrack_if_previous == kIsNonWord ? 3129 new_trace.backtrack() : 3130 &fall_through; 3131 Label* word = backtrack_if_previous == kIsNonWord ? 3132 &fall_through : 3133 new_trace.backtrack(); 3134 3135 if (new_trace.cp_offset() == 0) { 3136 // The start of input counts as a non-word character, so the question is 3137 // decided if we are at the start. 3138 assembler->CheckAtStart(non_word); 3139 } 3140 // We already checked that we are not at the start of input so it must be 3141 // OK to load the previous character. 3142 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3143 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3144 3145 assembler->Bind(&fall_through); 3146 on_success()->Emit(compiler, &new_trace); 3147 } 3148 3149 3150 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3151 RegExpCompiler* compiler, 3152 int filled_in, 3153 bool not_at_start) { 3154 if (assertion_type_ == AT_START && not_at_start) { 3155 details->set_cannot_match(); 3156 return; 3157 } 3158 return on_success()->GetQuickCheckDetails(details, 3159 compiler, 3160 filled_in, 3161 not_at_start); 3162 } 3163 3164 3165 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3166 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3167 switch (assertion_type_) { 3168 case AT_END: { 3169 Label ok; 3170 assembler->CheckPosition(trace->cp_offset(), &ok); 3171 assembler->GoTo(trace->backtrack()); 3172 assembler->Bind(&ok); 3173 break; 3174 } 3175 case AT_START: { 3176 if (trace->at_start() == Trace::FALSE_VALUE) { 3177 assembler->GoTo(trace->backtrack()); 3178 return; 3179 } 3180 if (trace->at_start() == Trace::UNKNOWN) { 3181 assembler->CheckNotAtStart(trace->backtrack()); 3182 Trace at_start_trace = *trace; 3183 at_start_trace.set_at_start(true); 3184 on_success()->Emit(compiler, &at_start_trace); 3185 return; 3186 } 3187 } 3188 break; 3189 case AFTER_NEWLINE: 3190 EmitHat(compiler, on_success(), trace); 3191 return; 3192 case AT_BOUNDARY: 3193 case AT_NON_BOUNDARY: { 3194 EmitBoundaryCheck(compiler, trace); 3195 return; 3196 } 3197 } 3198 on_success()->Emit(compiler, trace); 3199 } 3200 3201 3202 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3203 if (quick_check == NULL) return false; 3204 if (offset >= quick_check->characters()) return false; 3205 return quick_check->positions(offset)->determines_perfectly; 3206 } 3207 3208 3209 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3210 if (index > *checked_up_to) { 3211 *checked_up_to = index; 3212 } 3213 } 3214 3215 3216 // We call this repeatedly to generate code for each pass over the text node. 3217 // The passes are in increasing order of difficulty because we hope one 3218 // of the first passes will fail in which case we are saved the work of the 3219 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3220 // we will check the '%' in the first pass, the case independent 'a' in the 3221 // second pass and the character class in the last pass. 3222 // 3223 // The passes are done from right to left, so for example to test for /bar/ 3224 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3225 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3226 // bounds check most of the time. In the example we only need to check for 3227 // end-of-input when loading the putative 'r'. 3228 // 3229 // A slight complication involves the fact that the first character may already 3230 // be fetched into a register by the previous node. In this case we want to 3231 // do the test for that character first. We do this in separate passes. The 3232 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3233 // pass has been performed then subsequent passes will have true in 3234 // first_element_checked to indicate that that character does not need to be 3235 // checked again. 3236 // 3237 // In addition to all this we are passed a Trace, which can 3238 // contain an AlternativeGeneration object. In this AlternativeGeneration 3239 // object we can see details of any quick check that was already passed in 3240 // order to get to the code we are now generating. The quick check can involve 3241 // loading characters, which means we do not need to recheck the bounds 3242 // up to the limit the quick check already checked. In addition the quick 3243 // check can have involved a mask and compare operation which may simplify 3244 // or obviate the need for further checks at some character positions. 3245 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3246 TextEmitPassType pass, 3247 bool preloaded, 3248 Trace* trace, 3249 bool first_element_checked, 3250 int* checked_up_to) { 3251 Isolate* isolate = Isolate::Current(); 3252 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3253 bool ascii = compiler->ascii(); 3254 Label* backtrack = trace->backtrack(); 3255 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3256 int element_count = elms_->length(); 3257 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3258 TextElement elm = elms_->at(i); 3259 int cp_offset = trace->cp_offset() + elm.cp_offset; 3260 if (elm.text_type == TextElement::ATOM) { 3261 Vector<const uc16> quarks = elm.data.u_atom->data(); 3262 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3263 if (first_element_checked && i == 0 && j == 0) continue; 3264 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue; 3265 EmitCharacterFunction* emit_function = NULL; 3266 switch (pass) { 3267 case NON_ASCII_MATCH: 3268 ASSERT(ascii); 3269 if (quarks[j] > String::kMaxOneByteCharCode) { 3270 assembler->GoTo(backtrack); 3271 return; 3272 } 3273 break; 3274 case NON_LETTER_CHARACTER_MATCH: 3275 emit_function = &EmitAtomNonLetter; 3276 break; 3277 case SIMPLE_CHARACTER_MATCH: 3278 emit_function = &EmitSimpleCharacter; 3279 break; 3280 case CASE_CHARACTER_MATCH: 3281 emit_function = &EmitAtomLetter; 3282 break; 3283 default: 3284 break; 3285 } 3286 if (emit_function != NULL) { 3287 bool bound_checked = emit_function(isolate, 3288 compiler, 3289 quarks[j], 3290 backtrack, 3291 cp_offset + j, 3292 *checked_up_to < cp_offset + j, 3293 preloaded); 3294 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3295 } 3296 } 3297 } else { 3298 ASSERT_EQ(elm.text_type, TextElement::CHAR_CLASS); 3299 if (pass == CHARACTER_CLASS_MATCH) { 3300 if (first_element_checked && i == 0) continue; 3301 if (DeterminedAlready(quick_check, elm.cp_offset)) continue; 3302 RegExpCharacterClass* cc = elm.data.u_char_class; 3303 EmitCharClass(assembler, 3304 cc, 3305 ascii, 3306 backtrack, 3307 cp_offset, 3308 *checked_up_to < cp_offset, 3309 preloaded, 3310 zone()); 3311 UpdateBoundsCheck(cp_offset, checked_up_to); 3312 } 3313 } 3314 } 3315 } 3316 3317 3318 int TextNode::Length() { 3319 TextElement elm = elms_->last(); 3320 ASSERT(elm.cp_offset >= 0); 3321 if (elm.text_type == TextElement::ATOM) { 3322 return elm.cp_offset + elm.data.u_atom->data().length(); 3323 } else { 3324 return elm.cp_offset + 1; 3325 } 3326 } 3327 3328 3329 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3330 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3331 if (ignore_case) { 3332 return pass == SIMPLE_CHARACTER_MATCH; 3333 } else { 3334 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3335 } 3336 } 3337 3338 3339 // This generates the code to match a text node. A text node can contain 3340 // straight character sequences (possibly to be matched in a case-independent 3341 // way) and character classes. For efficiency we do not do this in a single 3342 // pass from left to right. Instead we pass over the text node several times, 3343 // emitting code for some character positions every time. See the comment on 3344 // TextEmitPass for details. 3345 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3346 LimitResult limit_result = LimitVersions(compiler, trace); 3347 if (limit_result == DONE) return; 3348 ASSERT(limit_result == CONTINUE); 3349 3350 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3351 compiler->SetRegExpTooBig(); 3352 return; 3353 } 3354 3355 if (compiler->ascii()) { 3356 int dummy = 0; 3357 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 3358 } 3359 3360 bool first_elt_done = false; 3361 int bound_checked_to = trace->cp_offset() - 1; 3362 bound_checked_to += trace->bound_checked_up_to(); 3363 3364 // If a character is preloaded into the current character register then 3365 // check that now. 3366 if (trace->characters_preloaded() == 1) { 3367 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3368 if (!SkipPass(pass, compiler->ignore_case())) { 3369 TextEmitPass(compiler, 3370 static_cast<TextEmitPassType>(pass), 3371 true, 3372 trace, 3373 false, 3374 &bound_checked_to); 3375 } 3376 } 3377 first_elt_done = true; 3378 } 3379 3380 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3381 if (!SkipPass(pass, compiler->ignore_case())) { 3382 TextEmitPass(compiler, 3383 static_cast<TextEmitPassType>(pass), 3384 false, 3385 trace, 3386 first_elt_done, 3387 &bound_checked_to); 3388 } 3389 } 3390 3391 Trace successor_trace(*trace); 3392 successor_trace.set_at_start(false); 3393 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 3394 RecursionCheck rc(compiler); 3395 on_success()->Emit(compiler, &successor_trace); 3396 } 3397 3398 3399 void Trace::InvalidateCurrentCharacter() { 3400 characters_preloaded_ = 0; 3401 } 3402 3403 3404 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3405 ASSERT(by > 0); 3406 // We don't have an instruction for shifting the current character register 3407 // down or for using a shifted value for anything so lets just forget that 3408 // we preloaded any characters into it. 3409 characters_preloaded_ = 0; 3410 // Adjust the offsets of the quick check performed information. This 3411 // information is used to find out what we already determined about the 3412 // characters by means of mask and compare. 3413 quick_check_performed_.Advance(by, compiler->ascii()); 3414 cp_offset_ += by; 3415 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3416 compiler->SetRegExpTooBig(); 3417 cp_offset_ = 0; 3418 } 3419 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3420 } 3421 3422 3423 void TextNode::MakeCaseIndependent(bool is_ascii) { 3424 int element_count = elms_->length(); 3425 for (int i = 0; i < element_count; i++) { 3426 TextElement elm = elms_->at(i); 3427 if (elm.text_type == TextElement::CHAR_CLASS) { 3428 RegExpCharacterClass* cc = elm.data.u_char_class; 3429 // None of the standard character classes is different in the case 3430 // independent case and it slows us down if we don't know that. 3431 if (cc->is_standard(zone())) continue; 3432 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3433 int range_count = ranges->length(); 3434 for (int j = 0; j < range_count; j++) { 3435 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone()); 3436 } 3437 } 3438 } 3439 } 3440 3441 3442 int TextNode::GreedyLoopTextLength() { 3443 TextElement elm = elms_->at(elms_->length() - 1); 3444 if (elm.text_type == TextElement::CHAR_CLASS) { 3445 return elm.cp_offset + 1; 3446 } else { 3447 return elm.cp_offset + elm.data.u_atom->data().length(); 3448 } 3449 } 3450 3451 3452 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3453 RegExpCompiler* compiler) { 3454 if (elms_->length() != 1) return NULL; 3455 TextElement elm = elms_->at(0); 3456 if (elm.text_type != TextElement::CHAR_CLASS) return NULL; 3457 RegExpCharacterClass* node = elm.data.u_char_class; 3458 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3459 if (!CharacterRange::IsCanonical(ranges)) { 3460 CharacterRange::Canonicalize(ranges); 3461 } 3462 if (node->is_negated()) { 3463 return ranges->length() == 0 ? on_success() : NULL; 3464 } 3465 if (ranges->length() != 1) return NULL; 3466 uint32_t max_char; 3467 if (compiler->ascii()) { 3468 max_char = String::kMaxOneByteCharCode; 3469 } else { 3470 max_char = String::kMaxUtf16CodeUnit; 3471 } 3472 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3473 } 3474 3475 3476 // Finds the fixed match length of a sequence of nodes that goes from 3477 // this alternative and back to this choice node. If there are variable 3478 // length nodes or other complications in the way then return a sentinel 3479 // value indicating that a greedy loop cannot be constructed. 3480 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3481 GuardedAlternative* alternative) { 3482 int length = 0; 3483 RegExpNode* node = alternative->node(); 3484 // Later we will generate code for all these text nodes using recursion 3485 // so we have to limit the max number. 3486 int recursion_depth = 0; 3487 while (node != this) { 3488 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3489 return kNodeIsTooComplexForGreedyLoops; 3490 } 3491 int node_length = node->GreedyLoopTextLength(); 3492 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3493 return kNodeIsTooComplexForGreedyLoops; 3494 } 3495 length += node_length; 3496 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3497 node = seq_node->on_success(); 3498 } 3499 return length; 3500 } 3501 3502 3503 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3504 ASSERT_EQ(loop_node_, NULL); 3505 AddAlternative(alt); 3506 loop_node_ = alt.node(); 3507 } 3508 3509 3510 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3511 ASSERT_EQ(continue_node_, NULL); 3512 AddAlternative(alt); 3513 continue_node_ = alt.node(); 3514 } 3515 3516 3517 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3518 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3519 if (trace->stop_node() == this) { 3520 int text_length = 3521 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3522 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 3523 // Update the counter-based backtracking info on the stack. This is an 3524 // optimization for greedy loops (see below). 3525 ASSERT(trace->cp_offset() == text_length); 3526 macro_assembler->AdvanceCurrentPosition(text_length); 3527 macro_assembler->GoTo(trace->loop_label()); 3528 return; 3529 } 3530 ASSERT(trace->stop_node() == NULL); 3531 if (!trace->is_trivial()) { 3532 trace->Flush(compiler, this); 3533 return; 3534 } 3535 ChoiceNode::Emit(compiler, trace); 3536 } 3537 3538 3539 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3540 int eats_at_least) { 3541 int preload_characters = Min(4, eats_at_least); 3542 if (compiler->macro_assembler()->CanReadUnaligned()) { 3543 bool ascii = compiler->ascii(); 3544 if (ascii) { 3545 if (preload_characters > 4) preload_characters = 4; 3546 // We can't preload 3 characters because there is no machine instruction 3547 // to do that. We can't just load 4 because we could be reading 3548 // beyond the end of the string, which could cause a memory fault. 3549 if (preload_characters == 3) preload_characters = 2; 3550 } else { 3551 if (preload_characters > 2) preload_characters = 2; 3552 } 3553 } else { 3554 if (preload_characters > 1) preload_characters = 1; 3555 } 3556 return preload_characters; 3557 } 3558 3559 3560 // This class is used when generating the alternatives in a choice node. It 3561 // records the way the alternative is being code generated. 3562 class AlternativeGeneration: public Malloced { 3563 public: 3564 AlternativeGeneration() 3565 : possible_success(), 3566 expects_preload(false), 3567 after(), 3568 quick_check_details() { } 3569 Label possible_success; 3570 bool expects_preload; 3571 Label after; 3572 QuickCheckDetails quick_check_details; 3573 }; 3574 3575 3576 // Creates a list of AlternativeGenerations. If the list has a reasonable 3577 // size then it is on the stack, otherwise the excess is on the heap. 3578 class AlternativeGenerationList { 3579 public: 3580 AlternativeGenerationList(int count, Zone* zone) 3581 : alt_gens_(count, zone) { 3582 for (int i = 0; i < count && i < kAFew; i++) { 3583 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3584 } 3585 for (int i = kAFew; i < count; i++) { 3586 alt_gens_.Add(new AlternativeGeneration(), zone); 3587 } 3588 } 3589 ~AlternativeGenerationList() { 3590 for (int i = kAFew; i < alt_gens_.length(); i++) { 3591 delete alt_gens_[i]; 3592 alt_gens_[i] = NULL; 3593 } 3594 } 3595 3596 AlternativeGeneration* at(int i) { 3597 return alt_gens_[i]; 3598 } 3599 3600 private: 3601 static const int kAFew = 10; 3602 ZoneList<AlternativeGeneration*> alt_gens_; 3603 AlternativeGeneration a_few_alt_gens_[kAFew]; 3604 }; 3605 3606 3607 // The '2' variant is has inclusive from and exclusive to. 3608 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 3609 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 3610 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 }; 3611 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); 3612 3613 static const int kWordRanges[] = { 3614 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 3615 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); 3616 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 3617 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); 3618 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; 3619 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); 3620 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 3621 0x2028, 0x202A, 0x10000 }; 3622 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); 3623 3624 3625 void BoyerMoorePositionInfo::Set(int character) { 3626 SetInterval(Interval(character, character)); 3627 } 3628 3629 3630 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3631 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3632 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3633 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3634 surrogate_ = 3635 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3636 if (interval.to() - interval.from() >= kMapSize - 1) { 3637 if (map_count_ != kMapSize) { 3638 map_count_ = kMapSize; 3639 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3640 } 3641 return; 3642 } 3643 for (int i = interval.from(); i <= interval.to(); i++) { 3644 int mod_character = (i & kMask); 3645 if (!map_->at(mod_character)) { 3646 map_count_++; 3647 map_->at(mod_character) = true; 3648 } 3649 if (map_count_ == kMapSize) return; 3650 } 3651 } 3652 3653 3654 void BoyerMoorePositionInfo::SetAll() { 3655 s_ = w_ = d_ = kLatticeUnknown; 3656 if (map_count_ != kMapSize) { 3657 map_count_ = kMapSize; 3658 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3659 } 3660 } 3661 3662 3663 BoyerMooreLookahead::BoyerMooreLookahead( 3664 int length, RegExpCompiler* compiler, Zone* zone) 3665 : length_(length), 3666 compiler_(compiler) { 3667 if (compiler->ascii()) { 3668 max_char_ = String::kMaxOneByteCharCode; 3669 } else { 3670 max_char_ = String::kMaxUtf16CodeUnit; 3671 } 3672 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3673 for (int i = 0; i < length; i++) { 3674 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3675 } 3676 } 3677 3678 3679 // Find the longest range of lookahead that has the fewest number of different 3680 // characters that can occur at a given position. Since we are optimizing two 3681 // different parameters at once this is a tradeoff. 3682 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3683 int biggest_points = 0; 3684 // If more than 32 characters out of 128 can occur it is unlikely that we can 3685 // be lucky enough to step forwards much of the time. 3686 const int kMaxMax = 32; 3687 for (int max_number_of_chars = 4; 3688 max_number_of_chars < kMaxMax; 3689 max_number_of_chars *= 2) { 3690 biggest_points = 3691 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3692 } 3693 if (biggest_points == 0) return false; 3694 return true; 3695 } 3696 3697 3698 // Find the highest-points range between 0 and length_ where the character 3699 // information is not too vague. 'Too vague' means that there are more than 3700 // max_number_of_chars that can occur at this position. Calculates the number 3701 // of points as the product of width-of-the-range and 3702 // probability-of-finding-one-of-the-characters, where the probability is 3703 // calculated using the frequency distribution of the sample subject string. 3704 int BoyerMooreLookahead::FindBestInterval( 3705 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3706 int biggest_points = old_biggest_points; 3707 static const int kSize = RegExpMacroAssembler::kTableSize; 3708 for (int i = 0; i < length_; ) { 3709 while (i < length_ && Count(i) > max_number_of_chars) i++; 3710 if (i == length_) break; 3711 int remembered_from = i; 3712 bool union_map[kSize]; 3713 for (int j = 0; j < kSize; j++) union_map[j] = false; 3714 while (i < length_ && Count(i) <= max_number_of_chars) { 3715 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3716 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3717 i++; 3718 } 3719 int frequency = 0; 3720 for (int j = 0; j < kSize; j++) { 3721 if (union_map[j]) { 3722 // Add 1 to the frequency to give a small per-character boost for 3723 // the cases where our sampling is not good enough and many 3724 // characters have a frequency of zero. This means the frequency 3725 // can theoretically be up to 2*kSize though we treat it mostly as 3726 // a fraction of kSize. 3727 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3728 } 3729 } 3730 // We use the probability of skipping times the distance we are skipping to 3731 // judge the effectiveness of this. Actually we have a cut-off: By 3732 // dividing by 2 we switch off the skipping if the probability of skipping 3733 // is less than 50%. This is because the multibyte mask-and-compare 3734 // skipping in quickcheck is more likely to do well on this case. 3735 bool in_quickcheck_range = ((i - remembered_from < 4) || 3736 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); 3737 // Called 'probability' but it is only a rough estimate and can actually 3738 // be outside the 0-kSize range. 3739 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3740 int points = (i - remembered_from) * probability; 3741 if (points > biggest_points) { 3742 *from = remembered_from; 3743 *to = i - 1; 3744 biggest_points = points; 3745 } 3746 } 3747 return biggest_points; 3748 } 3749 3750 3751 // Take all the characters that will not prevent a successful match if they 3752 // occur in the subject string in the range between min_lookahead and 3753 // max_lookahead (inclusive) measured from the current position. If the 3754 // character at max_lookahead offset is not one of these characters, then we 3755 // can safely skip forwards by the number of characters in the range. 3756 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3757 int max_lookahead, 3758 Handle<ByteArray> boolean_skip_table) { 3759 const int kSize = RegExpMacroAssembler::kTableSize; 3760 3761 const int kSkipArrayEntry = 0; 3762 const int kDontSkipArrayEntry = 1; 3763 3764 for (int i = 0; i < kSize; i++) { 3765 boolean_skip_table->set(i, kSkipArrayEntry); 3766 } 3767 int skip = max_lookahead + 1 - min_lookahead; 3768 3769 for (int i = max_lookahead; i >= min_lookahead; i--) { 3770 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3771 for (int j = 0; j < kSize; j++) { 3772 if (map->at(j)) { 3773 boolean_skip_table->set(j, kDontSkipArrayEntry); 3774 } 3775 } 3776 } 3777 3778 return skip; 3779 } 3780 3781 3782 // See comment above on the implementation of GetSkipTable. 3783 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3784 const int kSize = RegExpMacroAssembler::kTableSize; 3785 3786 int min_lookahead = 0; 3787 int max_lookahead = 0; 3788 3789 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; 3790 3791 bool found_single_character = false; 3792 int single_character = 0; 3793 for (int i = max_lookahead; i >= min_lookahead; i--) { 3794 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3795 if (map->map_count() > 1 || 3796 (found_single_character && map->map_count() != 0)) { 3797 found_single_character = false; 3798 break; 3799 } 3800 for (int j = 0; j < kSize; j++) { 3801 if (map->at(j)) { 3802 found_single_character = true; 3803 single_character = j; 3804 break; 3805 } 3806 } 3807 } 3808 3809 int lookahead_width = max_lookahead + 1 - min_lookahead; 3810 3811 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3812 // The mask-compare can probably handle this better. 3813 return false; 3814 } 3815 3816 if (found_single_character) { 3817 Label cont, again; 3818 masm->Bind(&again); 3819 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3820 if (max_char_ > kSize) { 3821 masm->CheckCharacterAfterAnd(single_character, 3822 RegExpMacroAssembler::kTableMask, 3823 &cont); 3824 } else { 3825 masm->CheckCharacter(single_character, &cont); 3826 } 3827 masm->AdvanceCurrentPosition(lookahead_width); 3828 masm->GoTo(&again); 3829 masm->Bind(&cont); 3830 return true; 3831 } 3832 3833 Factory* factory = Isolate::Current()->factory(); 3834 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3835 int skip_distance = GetSkipTable( 3836 min_lookahead, max_lookahead, boolean_skip_table); 3837 ASSERT(skip_distance != 0); 3838 3839 Label cont, again; 3840 masm->Bind(&again); 3841 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3842 masm->CheckBitInTable(boolean_skip_table, &cont); 3843 masm->AdvanceCurrentPosition(skip_distance); 3844 masm->GoTo(&again); 3845 masm->Bind(&cont); 3846 3847 return true; 3848 } 3849 3850 3851 /* Code generation for choice nodes. 3852 * 3853 * We generate quick checks that do a mask and compare to eliminate a 3854 * choice. If the quick check succeeds then it jumps to the continuation to 3855 * do slow checks and check subsequent nodes. If it fails (the common case) 3856 * it falls through to the next choice. 3857 * 3858 * Here is the desired flow graph. Nodes directly below each other imply 3859 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3860 * 3 doesn't have a quick check so we have to call the slow check. 3861 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3862 * regexp continuation is generated directly after the Sn node, up to the 3863 * next GoTo if we decide to reuse some already generated code. Some 3864 * nodes expect preload_characters to be preloaded into the current 3865 * character register. R nodes do this preloading. Vertices are marked 3866 * F for failures and S for success (possible success in the case of quick 3867 * nodes). L, V, < and > are used as arrow heads. 3868 * 3869 * ----------> R 3870 * | 3871 * V 3872 * Q1 -----> S1 3873 * | S / 3874 * F| / 3875 * | F/ 3876 * | / 3877 * | R 3878 * | / 3879 * V L 3880 * Q2 -----> S2 3881 * | S / 3882 * F| / 3883 * | F/ 3884 * | / 3885 * | R 3886 * | / 3887 * V L 3888 * S3 3889 * | 3890 * F| 3891 * | 3892 * R 3893 * | 3894 * backtrack V 3895 * <----------Q4 3896 * \ F | 3897 * \ |S 3898 * \ F V 3899 * \-----S4 3900 * 3901 * For greedy loops we reverse our expectation and expect to match rather 3902 * than fail. Therefore we want the loop code to look like this (U is the 3903 * unwind code that steps back in the greedy loop). The following alternatives 3904 * look the same as above. 3905 * _____ 3906 * / \ 3907 * V | 3908 * ----------> S1 | 3909 * /| | 3910 * / |S | 3911 * F/ \_____/ 3912 * / 3913 * |<----------- 3914 * | \ 3915 * V \ 3916 * Q2 ---> S2 \ 3917 * | S / | 3918 * F| / | 3919 * | F/ | 3920 * | / | 3921 * | R | 3922 * | / | 3923 * F VL | 3924 * <------U | 3925 * back |S | 3926 * \______________/ 3927 */ 3928 3929 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3930 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3931 int choice_count = alternatives_->length(); 3932 #ifdef DEBUG 3933 for (int i = 0; i < choice_count - 1; i++) { 3934 GuardedAlternative alternative = alternatives_->at(i); 3935 ZoneList<Guard*>* guards = alternative.guards(); 3936 int guard_count = (guards == NULL) ? 0 : guards->length(); 3937 for (int j = 0; j < guard_count; j++) { 3938 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 3939 } 3940 } 3941 #endif 3942 3943 LimitResult limit_result = LimitVersions(compiler, trace); 3944 if (limit_result == DONE) return; 3945 ASSERT(limit_result == CONTINUE); 3946 3947 int new_flush_budget = trace->flush_budget() / choice_count; 3948 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3949 trace->Flush(compiler, this); 3950 return; 3951 } 3952 3953 RecursionCheck rc(compiler); 3954 3955 Trace* current_trace = trace; 3956 3957 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3958 bool greedy_loop = false; 3959 Label greedy_loop_label; 3960 Trace counter_backtrack_trace; 3961 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 3962 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 3963 3964 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3965 // Here we have special handling for greedy loops containing only text nodes 3966 // and other simple nodes. These are handled by pushing the current 3967 // position on the stack and then incrementing the current position each 3968 // time around the switch. On backtrack we decrement the current position 3969 // and check it against the pushed value. This avoids pushing backtrack 3970 // information for each iteration of the loop, which could take up a lot of 3971 // space. 3972 greedy_loop = true; 3973 ASSERT(trace->stop_node() == NULL); 3974 macro_assembler->PushCurrentPosition(); 3975 current_trace = &counter_backtrack_trace; 3976 Label greedy_match_failed; 3977 Trace greedy_match_trace; 3978 if (not_at_start()) greedy_match_trace.set_at_start(false); 3979 greedy_match_trace.set_backtrack(&greedy_match_failed); 3980 Label loop_label; 3981 macro_assembler->Bind(&loop_label); 3982 greedy_match_trace.set_stop_node(this); 3983 greedy_match_trace.set_loop_label(&loop_label); 3984 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 3985 macro_assembler->Bind(&greedy_match_failed); 3986 } 3987 3988 Label second_choice; // For use in greedy matches. 3989 macro_assembler->Bind(&second_choice); 3990 3991 int first_normal_choice = greedy_loop ? 1 : 0; 3992 3993 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE; 3994 const int kEatsAtLeastNotYetInitialized = -1; 3995 int eats_at_least = kEatsAtLeastNotYetInitialized; 3996 3997 bool skip_was_emitted = false; 3998 3999 if (!greedy_loop && choice_count == 2) { 4000 GuardedAlternative alt1 = alternatives_->at(1); 4001 if (alt1.guards() == NULL || alt1.guards()->length() == 0) { 4002 RegExpNode* eats_anything_node = alt1.node(); 4003 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == 4004 this) { 4005 // At this point we know that we are at a non-greedy loop that will eat 4006 // any character one at a time. Any non-anchored regexp has such a 4007 // loop prepended to it in order to find where it starts. We look for 4008 // a pattern of the form ...abc... where we can look 6 characters ahead 4009 // and step forwards 3 if the character is not one of abc. Abc need 4010 // not be atoms, they can be any reasonably limited character class or 4011 // small alternation. 4012 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. 4013 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 4014 if (lookahead == NULL) { 4015 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4016 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4017 kRecursionBudget, 4018 not_at_start)); 4019 if (eats_at_least >= 1) { 4020 BoyerMooreLookahead* bm = 4021 new(zone()) BoyerMooreLookahead(eats_at_least, 4022 compiler, 4023 zone()); 4024 GuardedAlternative alt0 = alternatives_->at(0); 4025 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 4026 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); 4027 } 4028 } else { 4029 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); 4030 } 4031 } 4032 } 4033 } 4034 4035 if (eats_at_least == kEatsAtLeastNotYetInitialized) { 4036 // Save some time by looking at most one machine word ahead. 4037 eats_at_least = 4038 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start); 4039 } 4040 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); 4041 4042 bool preload_is_current = !skip_was_emitted && 4043 (current_trace->characters_preloaded() == preload_characters); 4044 bool preload_has_checked_bounds = preload_is_current; 4045 4046 AlternativeGenerationList alt_gens(choice_count, zone()); 4047 4048 // For now we just call all choices one after the other. The idea ultimately 4049 // is to use the Dispatch table to try only the relevant ones. 4050 for (int i = first_normal_choice; i < choice_count; i++) { 4051 GuardedAlternative alternative = alternatives_->at(i); 4052 AlternativeGeneration* alt_gen = alt_gens.at(i); 4053 alt_gen->quick_check_details.set_characters(preload_characters); 4054 ZoneList<Guard*>* guards = alternative.guards(); 4055 int guard_count = (guards == NULL) ? 0 : guards->length(); 4056 Trace new_trace(*current_trace); 4057 new_trace.set_characters_preloaded(preload_is_current ? 4058 preload_characters : 4059 0); 4060 if (preload_has_checked_bounds) { 4061 new_trace.set_bound_checked_up_to(preload_characters); 4062 } 4063 new_trace.quick_check_performed()->Clear(); 4064 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4065 alt_gen->expects_preload = preload_is_current; 4066 bool generate_full_check_inline = false; 4067 if (FLAG_regexp_optimization && 4068 try_to_emit_quick_check_for_alternative(i) && 4069 alternative.node()->EmitQuickCheck(compiler, 4070 &new_trace, 4071 preload_has_checked_bounds, 4072 &alt_gen->possible_success, 4073 &alt_gen->quick_check_details, 4074 i < choice_count - 1)) { 4075 // Quick check was generated for this choice. 4076 preload_is_current = true; 4077 preload_has_checked_bounds = true; 4078 // On the last choice in the ChoiceNode we generated the quick 4079 // check to fall through on possible success. So now we need to 4080 // generate the full check inline. 4081 if (i == choice_count - 1) { 4082 macro_assembler->Bind(&alt_gen->possible_success); 4083 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4084 new_trace.set_characters_preloaded(preload_characters); 4085 new_trace.set_bound_checked_up_to(preload_characters); 4086 generate_full_check_inline = true; 4087 } 4088 } else if (alt_gen->quick_check_details.cannot_match()) { 4089 if (i == choice_count - 1 && !greedy_loop) { 4090 macro_assembler->GoTo(trace->backtrack()); 4091 } 4092 continue; 4093 } else { 4094 // No quick check was generated. Put the full code here. 4095 // If this is not the first choice then there could be slow checks from 4096 // previous cases that go here when they fail. There's no reason to 4097 // insist that they preload characters since the slow check we are about 4098 // to generate probably can't use it. 4099 if (i != first_normal_choice) { 4100 alt_gen->expects_preload = false; 4101 new_trace.InvalidateCurrentCharacter(); 4102 } 4103 if (i < choice_count - 1) { 4104 new_trace.set_backtrack(&alt_gen->after); 4105 } 4106 generate_full_check_inline = true; 4107 } 4108 if (generate_full_check_inline) { 4109 if (new_trace.actions() != NULL) { 4110 new_trace.set_flush_budget(new_flush_budget); 4111 } 4112 for (int j = 0; j < guard_count; j++) { 4113 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4114 } 4115 alternative.node()->Emit(compiler, &new_trace); 4116 preload_is_current = false; 4117 } 4118 macro_assembler->Bind(&alt_gen->after); 4119 } 4120 if (greedy_loop) { 4121 macro_assembler->Bind(&greedy_loop_label); 4122 // If we have unwound to the bottom then backtrack. 4123 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4124 // Otherwise try the second priority at an earlier position. 4125 macro_assembler->AdvanceCurrentPosition(-text_length); 4126 macro_assembler->GoTo(&second_choice); 4127 } 4128 4129 // At this point we need to generate slow checks for the alternatives where 4130 // the quick check was inlined. We can recognize these because the associated 4131 // label was bound. 4132 for (int i = first_normal_choice; i < choice_count - 1; i++) { 4133 AlternativeGeneration* alt_gen = alt_gens.at(i); 4134 Trace new_trace(*current_trace); 4135 // If there are actions to be flushed we have to limit how many times 4136 // they are flushed. Take the budget of the parent trace and distribute 4137 // it fairly amongst the children. 4138 if (new_trace.actions() != NULL) { 4139 new_trace.set_flush_budget(new_flush_budget); 4140 } 4141 EmitOutOfLineContinuation(compiler, 4142 &new_trace, 4143 alternatives_->at(i), 4144 alt_gen, 4145 preload_characters, 4146 alt_gens.at(i + 1)->expects_preload); 4147 } 4148 } 4149 4150 4151 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4152 Trace* trace, 4153 GuardedAlternative alternative, 4154 AlternativeGeneration* alt_gen, 4155 int preload_characters, 4156 bool next_expects_preload) { 4157 if (!alt_gen->possible_success.is_linked()) return; 4158 4159 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4160 macro_assembler->Bind(&alt_gen->possible_success); 4161 Trace out_of_line_trace(*trace); 4162 out_of_line_trace.set_characters_preloaded(preload_characters); 4163 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4164 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4165 ZoneList<Guard*>* guards = alternative.guards(); 4166 int guard_count = (guards == NULL) ? 0 : guards->length(); 4167 if (next_expects_preload) { 4168 Label reload_current_char; 4169 out_of_line_trace.set_backtrack(&reload_current_char); 4170 for (int j = 0; j < guard_count; j++) { 4171 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4172 } 4173 alternative.node()->Emit(compiler, &out_of_line_trace); 4174 macro_assembler->Bind(&reload_current_char); 4175 // Reload the current character, since the next quick check expects that. 4176 // We don't need to check bounds here because we only get into this 4177 // code through a quick check which already did the checked load. 4178 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4179 NULL, 4180 false, 4181 preload_characters); 4182 macro_assembler->GoTo(&(alt_gen->after)); 4183 } else { 4184 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4185 for (int j = 0; j < guard_count; j++) { 4186 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4187 } 4188 alternative.node()->Emit(compiler, &out_of_line_trace); 4189 } 4190 } 4191 4192 4193 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4194 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4195 LimitResult limit_result = LimitVersions(compiler, trace); 4196 if (limit_result == DONE) return; 4197 ASSERT(limit_result == CONTINUE); 4198 4199 RecursionCheck rc(compiler); 4200 4201 switch (action_type_) { 4202 case STORE_POSITION: { 4203 Trace::DeferredCapture 4204 new_capture(data_.u_position_register.reg, 4205 data_.u_position_register.is_capture, 4206 trace); 4207 Trace new_trace = *trace; 4208 new_trace.add_action(&new_capture); 4209 on_success()->Emit(compiler, &new_trace); 4210 break; 4211 } 4212 case INCREMENT_REGISTER: { 4213 Trace::DeferredIncrementRegister 4214 new_increment(data_.u_increment_register.reg); 4215 Trace new_trace = *trace; 4216 new_trace.add_action(&new_increment); 4217 on_success()->Emit(compiler, &new_trace); 4218 break; 4219 } 4220 case SET_REGISTER: { 4221 Trace::DeferredSetRegister 4222 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4223 Trace new_trace = *trace; 4224 new_trace.add_action(&new_set); 4225 on_success()->Emit(compiler, &new_trace); 4226 break; 4227 } 4228 case CLEAR_CAPTURES: { 4229 Trace::DeferredClearCaptures 4230 new_capture(Interval(data_.u_clear_captures.range_from, 4231 data_.u_clear_captures.range_to)); 4232 Trace new_trace = *trace; 4233 new_trace.add_action(&new_capture); 4234 on_success()->Emit(compiler, &new_trace); 4235 break; 4236 } 4237 case BEGIN_SUBMATCH: 4238 if (!trace->is_trivial()) { 4239 trace->Flush(compiler, this); 4240 } else { 4241 assembler->WriteCurrentPositionToRegister( 4242 data_.u_submatch.current_position_register, 0); 4243 assembler->WriteStackPointerToRegister( 4244 data_.u_submatch.stack_pointer_register); 4245 on_success()->Emit(compiler, trace); 4246 } 4247 break; 4248 case EMPTY_MATCH_CHECK: { 4249 int start_pos_reg = data_.u_empty_match_check.start_register; 4250 int stored_pos = 0; 4251 int rep_reg = data_.u_empty_match_check.repetition_register; 4252 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4253 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4254 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4255 // If we know we haven't advanced and there is no minimum we 4256 // can just backtrack immediately. 4257 assembler->GoTo(trace->backtrack()); 4258 } else if (know_dist && stored_pos < trace->cp_offset()) { 4259 // If we know we've advanced we can generate the continuation 4260 // immediately. 4261 on_success()->Emit(compiler, trace); 4262 } else if (!trace->is_trivial()) { 4263 trace->Flush(compiler, this); 4264 } else { 4265 Label skip_empty_check; 4266 // If we have a minimum number of repetitions we check the current 4267 // number first and skip the empty check if it's not enough. 4268 if (has_minimum) { 4269 int limit = data_.u_empty_match_check.repetition_limit; 4270 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4271 } 4272 // If the match is empty we bail out, otherwise we fall through 4273 // to the on-success continuation. 4274 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4275 trace->backtrack()); 4276 assembler->Bind(&skip_empty_check); 4277 on_success()->Emit(compiler, trace); 4278 } 4279 break; 4280 } 4281 case POSITIVE_SUBMATCH_SUCCESS: { 4282 if (!trace->is_trivial()) { 4283 trace->Flush(compiler, this); 4284 return; 4285 } 4286 assembler->ReadCurrentPositionFromRegister( 4287 data_.u_submatch.current_position_register); 4288 assembler->ReadStackPointerFromRegister( 4289 data_.u_submatch.stack_pointer_register); 4290 int clear_register_count = data_.u_submatch.clear_register_count; 4291 if (clear_register_count == 0) { 4292 on_success()->Emit(compiler, trace); 4293 return; 4294 } 4295 int clear_registers_from = data_.u_submatch.clear_register_from; 4296 Label clear_registers_backtrack; 4297 Trace new_trace = *trace; 4298 new_trace.set_backtrack(&clear_registers_backtrack); 4299 on_success()->Emit(compiler, &new_trace); 4300 4301 assembler->Bind(&clear_registers_backtrack); 4302 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4303 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4304 4305 ASSERT(trace->backtrack() == NULL); 4306 assembler->Backtrack(); 4307 return; 4308 } 4309 default: 4310 UNREACHABLE(); 4311 } 4312 } 4313 4314 4315 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4316 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4317 if (!trace->is_trivial()) { 4318 trace->Flush(compiler, this); 4319 return; 4320 } 4321 4322 LimitResult limit_result = LimitVersions(compiler, trace); 4323 if (limit_result == DONE) return; 4324 ASSERT(limit_result == CONTINUE); 4325 4326 RecursionCheck rc(compiler); 4327 4328 ASSERT_EQ(start_reg_ + 1, end_reg_); 4329 if (compiler->ignore_case()) { 4330 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 4331 trace->backtrack()); 4332 } else { 4333 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 4334 } 4335 on_success()->Emit(compiler, trace); 4336 } 4337 4338 4339 // ------------------------------------------------------------------- 4340 // Dot/dotty output 4341 4342 4343 #ifdef DEBUG 4344 4345 4346 class DotPrinter: public NodeVisitor { 4347 public: 4348 explicit DotPrinter(bool ignore_case) 4349 : ignore_case_(ignore_case), 4350 stream_(&alloc_) { } 4351 void PrintNode(const char* label, RegExpNode* node); 4352 void Visit(RegExpNode* node); 4353 void PrintAttributes(RegExpNode* from); 4354 StringStream* stream() { return &stream_; } 4355 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4356 #define DECLARE_VISIT(Type) \ 4357 virtual void Visit##Type(Type##Node* that); 4358 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4359 #undef DECLARE_VISIT 4360 private: 4361 bool ignore_case_; 4362 HeapStringAllocator alloc_; 4363 StringStream stream_; 4364 }; 4365 4366 4367 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4368 stream()->Add("digraph G {\n graph [label=\""); 4369 for (int i = 0; label[i]; i++) { 4370 switch (label[i]) { 4371 case '\\': 4372 stream()->Add("\\\\"); 4373 break; 4374 case '"': 4375 stream()->Add("\""); 4376 break; 4377 default: 4378 stream()->Put(label[i]); 4379 break; 4380 } 4381 } 4382 stream()->Add("\"];\n"); 4383 Visit(node); 4384 stream()->Add("}\n"); 4385 printf("%s", *(stream()->ToCString())); 4386 } 4387 4388 4389 void DotPrinter::Visit(RegExpNode* node) { 4390 if (node->info()->visited) return; 4391 node->info()->visited = true; 4392 node->Accept(this); 4393 } 4394 4395 4396 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4397 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 4398 Visit(on_failure); 4399 } 4400 4401 4402 class TableEntryBodyPrinter { 4403 public: 4404 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 4405 : stream_(stream), choice_(choice) { } 4406 void Call(uc16 from, DispatchTable::Entry entry) { 4407 OutSet* out_set = entry.out_set(); 4408 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4409 if (out_set->Get(i)) { 4410 stream()->Add(" n%p:s%io%i -> n%p;\n", 4411 choice(), 4412 from, 4413 i, 4414 choice()->alternatives()->at(i).node()); 4415 } 4416 } 4417 } 4418 private: 4419 StringStream* stream() { return stream_; } 4420 ChoiceNode* choice() { return choice_; } 4421 StringStream* stream_; 4422 ChoiceNode* choice_; 4423 }; 4424 4425 4426 class TableEntryHeaderPrinter { 4427 public: 4428 explicit TableEntryHeaderPrinter(StringStream* stream) 4429 : first_(true), stream_(stream) { } 4430 void Call(uc16 from, DispatchTable::Entry entry) { 4431 if (first_) { 4432 first_ = false; 4433 } else { 4434 stream()->Add("|"); 4435 } 4436 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 4437 OutSet* out_set = entry.out_set(); 4438 int priority = 0; 4439 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4440 if (out_set->Get(i)) { 4441 if (priority > 0) stream()->Add("|"); 4442 stream()->Add("<s%io%i> %i", from, i, priority); 4443 priority++; 4444 } 4445 } 4446 stream()->Add("}}"); 4447 } 4448 4449 private: 4450 bool first_; 4451 StringStream* stream() { return stream_; } 4452 StringStream* stream_; 4453 }; 4454 4455 4456 class AttributePrinter { 4457 public: 4458 explicit AttributePrinter(DotPrinter* out) 4459 : out_(out), first_(true) { } 4460 void PrintSeparator() { 4461 if (first_) { 4462 first_ = false; 4463 } else { 4464 out_->stream()->Add("|"); 4465 } 4466 } 4467 void PrintBit(const char* name, bool value) { 4468 if (!value) return; 4469 PrintSeparator(); 4470 out_->stream()->Add("{%s}", name); 4471 } 4472 void PrintPositive(const char* name, int value) { 4473 if (value < 0) return; 4474 PrintSeparator(); 4475 out_->stream()->Add("{%s|%x}", name, value); 4476 } 4477 private: 4478 DotPrinter* out_; 4479 bool first_; 4480 }; 4481 4482 4483 void DotPrinter::PrintAttributes(RegExpNode* that) { 4484 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 4485 "margin=0.1, fontsize=10, label=\"{", 4486 that); 4487 AttributePrinter printer(this); 4488 NodeInfo* info = that->info(); 4489 printer.PrintBit("NI", info->follows_newline_interest); 4490 printer.PrintBit("WI", info->follows_word_interest); 4491 printer.PrintBit("SI", info->follows_start_interest); 4492 Label* label = that->label(); 4493 if (label->is_bound()) 4494 printer.PrintPositive("@", label->pos()); 4495 stream()->Add("}\"];\n"); 4496 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 4497 "arrowhead=none];\n", that, that); 4498 } 4499 4500 4501 static const bool kPrintDispatchTable = false; 4502 void DotPrinter::VisitChoice(ChoiceNode* that) { 4503 if (kPrintDispatchTable) { 4504 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 4505 TableEntryHeaderPrinter header_printer(stream()); 4506 that->GetTable(ignore_case_)->ForEach(&header_printer); 4507 stream()->Add("\"]\n", that); 4508 PrintAttributes(that); 4509 TableEntryBodyPrinter body_printer(stream(), that); 4510 that->GetTable(ignore_case_)->ForEach(&body_printer); 4511 } else { 4512 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 4513 for (int i = 0; i < that->alternatives()->length(); i++) { 4514 GuardedAlternative alt = that->alternatives()->at(i); 4515 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 4516 } 4517 } 4518 for (int i = 0; i < that->alternatives()->length(); i++) { 4519 GuardedAlternative alt = that->alternatives()->at(i); 4520 alt.node()->Accept(this); 4521 } 4522 } 4523 4524 4525 void DotPrinter::VisitText(TextNode* that) { 4526 Zone* zone = that->zone(); 4527 stream()->Add(" n%p [label=\"", that); 4528 for (int i = 0; i < that->elements()->length(); i++) { 4529 if (i > 0) stream()->Add(" "); 4530 TextElement elm = that->elements()->at(i); 4531 switch (elm.text_type) { 4532 case TextElement::ATOM: { 4533 stream()->Add("'%w'", elm.data.u_atom->data()); 4534 break; 4535 } 4536 case TextElement::CHAR_CLASS: { 4537 RegExpCharacterClass* node = elm.data.u_char_class; 4538 stream()->Add("["); 4539 if (node->is_negated()) 4540 stream()->Add("^"); 4541 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4542 CharacterRange range = node->ranges(zone)->at(j); 4543 stream()->Add("%k-%k", range.from(), range.to()); 4544 } 4545 stream()->Add("]"); 4546 break; 4547 } 4548 default: 4549 UNREACHABLE(); 4550 } 4551 } 4552 stream()->Add("\", shape=box, peripheries=2];\n"); 4553 PrintAttributes(that); 4554 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 4555 Visit(that->on_success()); 4556 } 4557 4558 4559 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4560 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 4561 that, 4562 that->start_register(), 4563 that->end_register()); 4564 PrintAttributes(that); 4565 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 4566 Visit(that->on_success()); 4567 } 4568 4569 4570 void DotPrinter::VisitEnd(EndNode* that) { 4571 stream()->Add(" n%p [style=bold, shape=point];\n", that); 4572 PrintAttributes(that); 4573 } 4574 4575 4576 void DotPrinter::VisitAssertion(AssertionNode* that) { 4577 stream()->Add(" n%p [", that); 4578 switch (that->assertion_type()) { 4579 case AssertionNode::AT_END: 4580 stream()->Add("label=\"$\", shape=septagon"); 4581 break; 4582 case AssertionNode::AT_START: 4583 stream()->Add("label=\"^\", shape=septagon"); 4584 break; 4585 case AssertionNode::AT_BOUNDARY: 4586 stream()->Add("label=\"\\b\", shape=septagon"); 4587 break; 4588 case AssertionNode::AT_NON_BOUNDARY: 4589 stream()->Add("label=\"\\B\", shape=septagon"); 4590 break; 4591 case AssertionNode::AFTER_NEWLINE: 4592 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 4593 break; 4594 } 4595 stream()->Add("];\n"); 4596 PrintAttributes(that); 4597 RegExpNode* successor = that->on_success(); 4598 stream()->Add(" n%p -> n%p;\n", that, successor); 4599 Visit(successor); 4600 } 4601 4602 4603 void DotPrinter::VisitAction(ActionNode* that) { 4604 stream()->Add(" n%p [", that); 4605 switch (that->action_type_) { 4606 case ActionNode::SET_REGISTER: 4607 stream()->Add("label=\"$%i:=%i\", shape=octagon", 4608 that->data_.u_store_register.reg, 4609 that->data_.u_store_register.value); 4610 break; 4611 case ActionNode::INCREMENT_REGISTER: 4612 stream()->Add("label=\"$%i++\", shape=octagon", 4613 that->data_.u_increment_register.reg); 4614 break; 4615 case ActionNode::STORE_POSITION: 4616 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 4617 that->data_.u_position_register.reg); 4618 break; 4619 case ActionNode::BEGIN_SUBMATCH: 4620 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 4621 that->data_.u_submatch.current_position_register); 4622 break; 4623 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4624 stream()->Add("label=\"escape\", shape=septagon"); 4625 break; 4626 case ActionNode::EMPTY_MATCH_CHECK: 4627 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 4628 that->data_.u_empty_match_check.start_register, 4629 that->data_.u_empty_match_check.repetition_register, 4630 that->data_.u_empty_match_check.repetition_limit); 4631 break; 4632 case ActionNode::CLEAR_CAPTURES: { 4633 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 4634 that->data_.u_clear_captures.range_from, 4635 that->data_.u_clear_captures.range_to); 4636 break; 4637 } 4638 } 4639 stream()->Add("];\n"); 4640 PrintAttributes(that); 4641 RegExpNode* successor = that->on_success(); 4642 stream()->Add(" n%p -> n%p;\n", that, successor); 4643 Visit(successor); 4644 } 4645 4646 4647 class DispatchTableDumper { 4648 public: 4649 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 4650 void Call(uc16 key, DispatchTable::Entry entry); 4651 StringStream* stream() { return stream_; } 4652 private: 4653 StringStream* stream_; 4654 }; 4655 4656 4657 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4658 stream()->Add("[%k-%k]: {", key, entry.to()); 4659 OutSet* set = entry.out_set(); 4660 bool first = true; 4661 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4662 if (set->Get(i)) { 4663 if (first) { 4664 first = false; 4665 } else { 4666 stream()->Add(", "); 4667 } 4668 stream()->Add("%i", i); 4669 } 4670 } 4671 stream()->Add("}\n"); 4672 } 4673 4674 4675 void DispatchTable::Dump() { 4676 HeapStringAllocator alloc; 4677 StringStream stream(&alloc); 4678 DispatchTableDumper dumper(&stream); 4679 tree()->ForEach(&dumper); 4680 OS::PrintError("%s", *stream.ToCString()); 4681 } 4682 4683 4684 void RegExpEngine::DotPrint(const char* label, 4685 RegExpNode* node, 4686 bool ignore_case) { 4687 DotPrinter printer(ignore_case); 4688 printer.PrintNode(label, node); 4689 } 4690 4691 4692 #endif // DEBUG 4693 4694 4695 // ------------------------------------------------------------------- 4696 // Tree to graph conversion 4697 4698 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4699 RegExpNode* on_success) { 4700 ZoneList<TextElement>* elms = 4701 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4702 elms->Add(TextElement::Atom(this), compiler->zone()); 4703 return new(compiler->zone()) TextNode(elms, on_success); 4704 } 4705 4706 4707 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4708 RegExpNode* on_success) { 4709 return new(compiler->zone()) TextNode(elements(), on_success); 4710 } 4711 4712 4713 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4714 const int* special_class, 4715 int length) { 4716 length--; // Remove final 0x10000. 4717 ASSERT(special_class[length] == 0x10000); 4718 ASSERT(ranges->length() != 0); 4719 ASSERT(length != 0); 4720 ASSERT(special_class[0] != 0); 4721 if (ranges->length() != (length >> 1) + 1) { 4722 return false; 4723 } 4724 CharacterRange range = ranges->at(0); 4725 if (range.from() != 0) { 4726 return false; 4727 } 4728 for (int i = 0; i < length; i += 2) { 4729 if (special_class[i] != (range.to() + 1)) { 4730 return false; 4731 } 4732 range = ranges->at((i >> 1) + 1); 4733 if (special_class[i+1] != range.from()) { 4734 return false; 4735 } 4736 } 4737 if (range.to() != 0xffff) { 4738 return false; 4739 } 4740 return true; 4741 } 4742 4743 4744 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4745 const int* special_class, 4746 int length) { 4747 length--; // Remove final 0x10000. 4748 ASSERT(special_class[length] == 0x10000); 4749 if (ranges->length() * 2 != length) { 4750 return false; 4751 } 4752 for (int i = 0; i < length; i += 2) { 4753 CharacterRange range = ranges->at(i >> 1); 4754 if (range.from() != special_class[i] || 4755 range.to() != special_class[i + 1] - 1) { 4756 return false; 4757 } 4758 } 4759 return true; 4760 } 4761 4762 4763 bool RegExpCharacterClass::is_standard(Zone* zone) { 4764 // TODO(lrn): Remove need for this function, by not throwing away information 4765 // along the way. 4766 if (is_negated_) { 4767 return false; 4768 } 4769 if (set_.is_standard()) { 4770 return true; 4771 } 4772 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4773 set_.set_standard_set_type('s'); 4774 return true; 4775 } 4776 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4777 set_.set_standard_set_type('S'); 4778 return true; 4779 } 4780 if (CompareInverseRanges(set_.ranges(zone), 4781 kLineTerminatorRanges, 4782 kLineTerminatorRangeCount)) { 4783 set_.set_standard_set_type('.'); 4784 return true; 4785 } 4786 if (CompareRanges(set_.ranges(zone), 4787 kLineTerminatorRanges, 4788 kLineTerminatorRangeCount)) { 4789 set_.set_standard_set_type('n'); 4790 return true; 4791 } 4792 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4793 set_.set_standard_set_type('w'); 4794 return true; 4795 } 4796 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4797 set_.set_standard_set_type('W'); 4798 return true; 4799 } 4800 return false; 4801 } 4802 4803 4804 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4805 RegExpNode* on_success) { 4806 return new(compiler->zone()) TextNode(this, on_success); 4807 } 4808 4809 4810 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4811 RegExpNode* on_success) { 4812 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4813 int length = alternatives->length(); 4814 ChoiceNode* result = 4815 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 4816 for (int i = 0; i < length; i++) { 4817 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 4818 on_success)); 4819 result->AddAlternative(alternative); 4820 } 4821 return result; 4822 } 4823 4824 4825 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 4826 RegExpNode* on_success) { 4827 return ToNode(min(), 4828 max(), 4829 is_greedy(), 4830 body(), 4831 compiler, 4832 on_success); 4833 } 4834 4835 4836 // Scoped object to keep track of how much we unroll quantifier loops in the 4837 // regexp graph generator. 4838 class RegExpExpansionLimiter { 4839 public: 4840 static const int kMaxExpansionFactor = 6; 4841 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 4842 : compiler_(compiler), 4843 saved_expansion_factor_(compiler->current_expansion_factor()), 4844 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 4845 ASSERT(factor > 0); 4846 if (ok_to_expand_) { 4847 if (factor > kMaxExpansionFactor) { 4848 // Avoid integer overflow of the current expansion factor. 4849 ok_to_expand_ = false; 4850 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 4851 } else { 4852 int new_factor = saved_expansion_factor_ * factor; 4853 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 4854 compiler->set_current_expansion_factor(new_factor); 4855 } 4856 } 4857 } 4858 4859 ~RegExpExpansionLimiter() { 4860 compiler_->set_current_expansion_factor(saved_expansion_factor_); 4861 } 4862 4863 bool ok_to_expand() { return ok_to_expand_; } 4864 4865 private: 4866 RegExpCompiler* compiler_; 4867 int saved_expansion_factor_; 4868 bool ok_to_expand_; 4869 4870 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 4871 }; 4872 4873 4874 RegExpNode* RegExpQuantifier::ToNode(int min, 4875 int max, 4876 bool is_greedy, 4877 RegExpTree* body, 4878 RegExpCompiler* compiler, 4879 RegExpNode* on_success, 4880 bool not_at_start) { 4881 // x{f, t} becomes this: 4882 // 4883 // (r++)<-. 4884 // | ` 4885 // | (x) 4886 // v ^ 4887 // (r=0)-->(?)---/ [if r < t] 4888 // | 4889 // [if r >= f] \----> ... 4890 // 4891 4892 // 15.10.2.5 RepeatMatcher algorithm. 4893 // The parser has already eliminated the case where max is 0. In the case 4894 // where max_match is zero the parser has removed the quantifier if min was 4895 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 4896 4897 // If we know that we cannot match zero length then things are a little 4898 // simpler since we don't need to make the special zero length match check 4899 // from step 2.1. If the min and max are small we can unroll a little in 4900 // this case. 4901 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 4902 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 4903 if (max == 0) return on_success; // This can happen due to recursion. 4904 bool body_can_be_empty = (body->min_match() == 0); 4905 int body_start_reg = RegExpCompiler::kNoRegister; 4906 Interval capture_registers = body->CaptureRegisters(); 4907 bool needs_capture_clearing = !capture_registers.is_empty(); 4908 Zone* zone = compiler->zone(); 4909 4910 if (body_can_be_empty) { 4911 body_start_reg = compiler->AllocateRegister(); 4912 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 4913 // Only unroll if there are no captures and the body can't be 4914 // empty. 4915 { 4916 RegExpExpansionLimiter limiter( 4917 compiler, min + ((max != min) ? 1 : 0)); 4918 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 4919 int new_max = (max == kInfinity) ? max : max - min; 4920 // Recurse once to get the loop or optional matches after the fixed 4921 // ones. 4922 RegExpNode* answer = ToNode( 4923 0, new_max, is_greedy, body, compiler, on_success, true); 4924 // Unroll the forced matches from 0 to min. This can cause chains of 4925 // TextNodes (which the parser does not generate). These should be 4926 // combined if it turns out they hinder good code generation. 4927 for (int i = 0; i < min; i++) { 4928 answer = body->ToNode(compiler, answer); 4929 } 4930 return answer; 4931 } 4932 } 4933 if (max <= kMaxUnrolledMaxMatches && min == 0) { 4934 ASSERT(max > 0); // Due to the 'if' above. 4935 RegExpExpansionLimiter limiter(compiler, max); 4936 if (limiter.ok_to_expand()) { 4937 // Unroll the optional matches up to max. 4938 RegExpNode* answer = on_success; 4939 for (int i = 0; i < max; i++) { 4940 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 4941 if (is_greedy) { 4942 alternation->AddAlternative( 4943 GuardedAlternative(body->ToNode(compiler, answer))); 4944 alternation->AddAlternative(GuardedAlternative(on_success)); 4945 } else { 4946 alternation->AddAlternative(GuardedAlternative(on_success)); 4947 alternation->AddAlternative( 4948 GuardedAlternative(body->ToNode(compiler, answer))); 4949 } 4950 answer = alternation; 4951 if (not_at_start) alternation->set_not_at_start(); 4952 } 4953 return answer; 4954 } 4955 } 4956 } 4957 bool has_min = min > 0; 4958 bool has_max = max < RegExpTree::kInfinity; 4959 bool needs_counter = has_min || has_max; 4960 int reg_ctr = needs_counter 4961 ? compiler->AllocateRegister() 4962 : RegExpCompiler::kNoRegister; 4963 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0, 4964 zone); 4965 if (not_at_start) center->set_not_at_start(); 4966 RegExpNode* loop_return = needs_counter 4967 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 4968 : static_cast<RegExpNode*>(center); 4969 if (body_can_be_empty) { 4970 // If the body can be empty we need to check if it was and then 4971 // backtrack. 4972 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 4973 reg_ctr, 4974 min, 4975 loop_return); 4976 } 4977 RegExpNode* body_node = body->ToNode(compiler, loop_return); 4978 if (body_can_be_empty) { 4979 // If the body can be empty we need to store the start position 4980 // so we can bail out if it was empty. 4981 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 4982 } 4983 if (needs_capture_clearing) { 4984 // Before entering the body of this loop we need to clear captures. 4985 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 4986 } 4987 GuardedAlternative body_alt(body_node); 4988 if (has_max) { 4989 Guard* body_guard = 4990 new(zone) Guard(reg_ctr, Guard::LT, max); 4991 body_alt.AddGuard(body_guard, zone); 4992 } 4993 GuardedAlternative rest_alt(on_success); 4994 if (has_min) { 4995 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 4996 rest_alt.AddGuard(rest_guard, zone); 4997 } 4998 if (is_greedy) { 4999 center->AddLoopAlternative(body_alt); 5000 center->AddContinueAlternative(rest_alt); 5001 } else { 5002 center->AddContinueAlternative(rest_alt); 5003 center->AddLoopAlternative(body_alt); 5004 } 5005 if (needs_counter) { 5006 return ActionNode::SetRegister(reg_ctr, 0, center); 5007 } else { 5008 return center; 5009 } 5010 } 5011 5012 5013 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5014 RegExpNode* on_success) { 5015 NodeInfo info; 5016 Zone* zone = compiler->zone(); 5017 5018 switch (assertion_type()) { 5019 case START_OF_LINE: 5020 return AssertionNode::AfterNewline(on_success); 5021 case START_OF_INPUT: 5022 return AssertionNode::AtStart(on_success); 5023 case BOUNDARY: 5024 return AssertionNode::AtBoundary(on_success); 5025 case NON_BOUNDARY: 5026 return AssertionNode::AtNonBoundary(on_success); 5027 case END_OF_INPUT: 5028 return AssertionNode::AtEnd(on_success); 5029 case END_OF_LINE: { 5030 // Compile $ in multiline regexps as an alternation with a positive 5031 // lookahead in one side and an end-of-input on the other side. 5032 // We need two registers for the lookahead. 5033 int stack_pointer_register = compiler->AllocateRegister(); 5034 int position_register = compiler->AllocateRegister(); 5035 // The ChoiceNode to distinguish between a newline and end-of-input. 5036 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5037 // Create a newline atom. 5038 ZoneList<CharacterRange>* newline_ranges = 5039 new(zone) ZoneList<CharacterRange>(3, zone); 5040 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5041 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n'); 5042 TextNode* newline_matcher = new(zone) TextNode( 5043 newline_atom, 5044 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5045 position_register, 5046 0, // No captures inside. 5047 -1, // Ignored if no captures. 5048 on_success)); 5049 // Create an end-of-input matcher. 5050 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5051 stack_pointer_register, 5052 position_register, 5053 newline_matcher); 5054 // Add the two alternatives to the ChoiceNode. 5055 GuardedAlternative eol_alternative(end_of_line); 5056 result->AddAlternative(eol_alternative); 5057 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5058 result->AddAlternative(end_alternative); 5059 return result; 5060 } 5061 default: 5062 UNREACHABLE(); 5063 } 5064 return on_success; 5065 } 5066 5067 5068 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5069 RegExpNode* on_success) { 5070 return new(compiler->zone()) 5071 BackReferenceNode(RegExpCapture::StartRegister(index()), 5072 RegExpCapture::EndRegister(index()), 5073 on_success); 5074 } 5075 5076 5077 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5078 RegExpNode* on_success) { 5079 return on_success; 5080 } 5081 5082 5083 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 5084 RegExpNode* on_success) { 5085 int stack_pointer_register = compiler->AllocateRegister(); 5086 int position_register = compiler->AllocateRegister(); 5087 5088 const int registers_per_capture = 2; 5089 const int register_of_first_capture = 2; 5090 int register_count = capture_count_ * registers_per_capture; 5091 int register_start = 5092 register_of_first_capture + capture_from_ * registers_per_capture; 5093 5094 RegExpNode* success; 5095 if (is_positive()) { 5096 RegExpNode* node = ActionNode::BeginSubmatch( 5097 stack_pointer_register, 5098 position_register, 5099 body()->ToNode( 5100 compiler, 5101 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5102 position_register, 5103 register_count, 5104 register_start, 5105 on_success))); 5106 return node; 5107 } else { 5108 // We use a ChoiceNode for a negative lookahead because it has most of 5109 // the characteristics we need. It has the body of the lookahead as its 5110 // first alternative and the expression after the lookahead of the second 5111 // alternative. If the first alternative succeeds then the 5112 // NegativeSubmatchSuccess will unwind the stack including everything the 5113 // choice node set up and backtrack. If the first alternative fails then 5114 // the second alternative is tried, which is exactly the desired result 5115 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 5116 // ChoiceNode that knows to ignore the first exit when calculating quick 5117 // checks. 5118 Zone* zone = compiler->zone(); 5119 5120 GuardedAlternative body_alt( 5121 body()->ToNode( 5122 compiler, 5123 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register, 5124 position_register, 5125 register_count, 5126 register_start, 5127 zone))); 5128 ChoiceNode* choice_node = 5129 new(zone) NegativeLookaheadChoiceNode(body_alt, 5130 GuardedAlternative(on_success), 5131 zone); 5132 return ActionNode::BeginSubmatch(stack_pointer_register, 5133 position_register, 5134 choice_node); 5135 } 5136 } 5137 5138 5139 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5140 RegExpNode* on_success) { 5141 return ToNode(body(), index(), compiler, on_success); 5142 } 5143 5144 5145 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5146 int index, 5147 RegExpCompiler* compiler, 5148 RegExpNode* on_success) { 5149 int start_reg = RegExpCapture::StartRegister(index); 5150 int end_reg = RegExpCapture::EndRegister(index); 5151 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5152 RegExpNode* body_node = body->ToNode(compiler, store_end); 5153 return ActionNode::StorePosition(start_reg, true, body_node); 5154 } 5155 5156 5157 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5158 RegExpNode* on_success) { 5159 ZoneList<RegExpTree*>* children = nodes(); 5160 RegExpNode* current = on_success; 5161 for (int i = children->length() - 1; i >= 0; i--) { 5162 current = children->at(i)->ToNode(compiler, current); 5163 } 5164 return current; 5165 } 5166 5167 5168 static void AddClass(const int* elmv, 5169 int elmc, 5170 ZoneList<CharacterRange>* ranges, 5171 Zone* zone) { 5172 elmc--; 5173 ASSERT(elmv[elmc] == 0x10000); 5174 for (int i = 0; i < elmc; i += 2) { 5175 ASSERT(elmv[i] < elmv[i + 1]); 5176 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone); 5177 } 5178 } 5179 5180 5181 static void AddClassNegated(const int *elmv, 5182 int elmc, 5183 ZoneList<CharacterRange>* ranges, 5184 Zone* zone) { 5185 elmc--; 5186 ASSERT(elmv[elmc] == 0x10000); 5187 ASSERT(elmv[0] != 0x0000); 5188 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 5189 uc16 last = 0x0000; 5190 for (int i = 0; i < elmc; i += 2) { 5191 ASSERT(last <= elmv[i] - 1); 5192 ASSERT(elmv[i] < elmv[i + 1]); 5193 ranges->Add(CharacterRange(last, elmv[i] - 1), zone); 5194 last = elmv[i + 1]; 5195 } 5196 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone); 5197 } 5198 5199 5200 void CharacterRange::AddClassEscape(uc16 type, 5201 ZoneList<CharacterRange>* ranges, 5202 Zone* zone) { 5203 switch (type) { 5204 case 's': 5205 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5206 break; 5207 case 'S': 5208 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5209 break; 5210 case 'w': 5211 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5212 break; 5213 case 'W': 5214 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5215 break; 5216 case 'd': 5217 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5218 break; 5219 case 'D': 5220 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5221 break; 5222 case '.': 5223 AddClassNegated(kLineTerminatorRanges, 5224 kLineTerminatorRangeCount, 5225 ranges, 5226 zone); 5227 break; 5228 // This is not a character range as defined by the spec but a 5229 // convenient shorthand for a character class that matches any 5230 // character. 5231 case '*': 5232 ranges->Add(CharacterRange::Everything(), zone); 5233 break; 5234 // This is the set of characters matched by the $ and ^ symbols 5235 // in multiline mode. 5236 case 'n': 5237 AddClass(kLineTerminatorRanges, 5238 kLineTerminatorRangeCount, 5239 ranges, 5240 zone); 5241 break; 5242 default: 5243 UNREACHABLE(); 5244 } 5245 } 5246 5247 5248 Vector<const int> CharacterRange::GetWordBounds() { 5249 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5250 } 5251 5252 5253 class CharacterRangeSplitter { 5254 public: 5255 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 5256 ZoneList<CharacterRange>** excluded, 5257 Zone* zone) 5258 : included_(included), 5259 excluded_(excluded), 5260 zone_(zone) { } 5261 void Call(uc16 from, DispatchTable::Entry entry); 5262 5263 static const int kInBase = 0; 5264 static const int kInOverlay = 1; 5265 5266 private: 5267 ZoneList<CharacterRange>** included_; 5268 ZoneList<CharacterRange>** excluded_; 5269 Zone* zone_; 5270 }; 5271 5272 5273 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 5274 if (!entry.out_set()->Get(kInBase)) return; 5275 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 5276 ? included_ 5277 : excluded_; 5278 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_); 5279 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_); 5280 } 5281 5282 5283 void CharacterRange::Split(ZoneList<CharacterRange>* base, 5284 Vector<const int> overlay, 5285 ZoneList<CharacterRange>** included, 5286 ZoneList<CharacterRange>** excluded, 5287 Zone* zone) { 5288 ASSERT_EQ(NULL, *included); 5289 ASSERT_EQ(NULL, *excluded); 5290 DispatchTable table(zone); 5291 for (int i = 0; i < base->length(); i++) 5292 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone); 5293 for (int i = 0; i < overlay.length(); i += 2) { 5294 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), 5295 CharacterRangeSplitter::kInOverlay, zone); 5296 } 5297 CharacterRangeSplitter callback(included, excluded, zone); 5298 table.ForEach(&callback); 5299 } 5300 5301 5302 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 5303 bool is_ascii, 5304 Zone* zone) { 5305 Isolate* isolate = Isolate::Current(); 5306 uc16 bottom = from(); 5307 uc16 top = to(); 5308 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { 5309 if (bottom > String::kMaxOneByteCharCode) return; 5310 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5311 } 5312 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5313 if (top == bottom) { 5314 // If this is a singleton we just expand the one character. 5315 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5316 for (int i = 0; i < length; i++) { 5317 uc32 chr = chars[i]; 5318 if (chr != bottom) { 5319 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5320 } 5321 } 5322 } else { 5323 // If this is a range we expand the characters block by block, 5324 // expanding contiguous subranges (blocks) one at a time. 5325 // The approach is as follows. For a given start character we 5326 // look up the remainder of the block that contains it (represented 5327 // by the end point), for instance we find 'z' if the character 5328 // is 'c'. A block is characterized by the property 5329 // that all characters uncanonicalize in the same way, except that 5330 // each entry in the result is incremented by the distance from the first 5331 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 5332 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 5333 // Once we've found the end point we look up its uncanonicalization 5334 // and produce a range for each element. For instance for [c-f] 5335 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 5336 // add a range if it is not already contained in the input, so [c-f] 5337 // will be skipped but [C-F] will be added. If this range is not 5338 // completely contained in a block we do this for all the blocks 5339 // covered by the range (handling characters that is not in a block 5340 // as a "singleton block"). 5341 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5342 int pos = bottom; 5343 while (pos <= top) { 5344 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 5345 uc16 block_end; 5346 if (length == 0) { 5347 block_end = pos; 5348 } else { 5349 ASSERT_EQ(1, length); 5350 block_end = range[0]; 5351 } 5352 int end = (block_end > top) ? top : block_end; 5353 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 5354 for (int i = 0; i < length; i++) { 5355 uc32 c = range[i]; 5356 uc16 range_from = c - (block_end - pos); 5357 uc16 range_to = c - (block_end - end); 5358 if (!(bottom <= range_from && range_to <= top)) { 5359 ranges->Add(CharacterRange(range_from, range_to), zone); 5360 } 5361 } 5362 pos = end + 1; 5363 } 5364 } 5365 } 5366 5367 5368 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5369 ASSERT_NOT_NULL(ranges); 5370 int n = ranges->length(); 5371 if (n <= 1) return true; 5372 int max = ranges->at(0).to(); 5373 for (int i = 1; i < n; i++) { 5374 CharacterRange next_range = ranges->at(i); 5375 if (next_range.from() <= max + 1) return false; 5376 max = next_range.to(); 5377 } 5378 return true; 5379 } 5380 5381 5382 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5383 if (ranges_ == NULL) { 5384 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5385 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5386 } 5387 return ranges_; 5388 } 5389 5390 5391 // Move a number of elements in a zonelist to another position 5392 // in the same list. Handles overlapping source and target areas. 5393 static void MoveRanges(ZoneList<CharacterRange>* list, 5394 int from, 5395 int to, 5396 int count) { 5397 // Ranges are potentially overlapping. 5398 if (from < to) { 5399 for (int i = count - 1; i >= 0; i--) { 5400 list->at(to + i) = list->at(from + i); 5401 } 5402 } else { 5403 for (int i = 0; i < count; i++) { 5404 list->at(to + i) = list->at(from + i); 5405 } 5406 } 5407 } 5408 5409 5410 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5411 int count, 5412 CharacterRange insert) { 5413 // Inserts a range into list[0..count[, which must be sorted 5414 // by from value and non-overlapping and non-adjacent, using at most 5415 // list[0..count] for the result. Returns the number of resulting 5416 // canonicalized ranges. Inserting a range may collapse existing ranges into 5417 // fewer ranges, so the return value can be anything in the range 1..count+1. 5418 uc16 from = insert.from(); 5419 uc16 to = insert.to(); 5420 int start_pos = 0; 5421 int end_pos = count; 5422 for (int i = count - 1; i >= 0; i--) { 5423 CharacterRange current = list->at(i); 5424 if (current.from() > to + 1) { 5425 end_pos = i; 5426 } else if (current.to() + 1 < from) { 5427 start_pos = i + 1; 5428 break; 5429 } 5430 } 5431 5432 // Inserted range overlaps, or is adjacent to, ranges at positions 5433 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 5434 // not affected by the insertion. 5435 // If start_pos == end_pos, the range must be inserted before start_pos. 5436 // if start_pos < end_pos, the entire range from start_pos to end_pos 5437 // must be merged with the insert range. 5438 5439 if (start_pos == end_pos) { 5440 // Insert between existing ranges at position start_pos. 5441 if (start_pos < count) { 5442 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 5443 } 5444 list->at(start_pos) = insert; 5445 return count + 1; 5446 } 5447 if (start_pos + 1 == end_pos) { 5448 // Replace single existing range at position start_pos. 5449 CharacterRange to_replace = list->at(start_pos); 5450 int new_from = Min(to_replace.from(), from); 5451 int new_to = Max(to_replace.to(), to); 5452 list->at(start_pos) = CharacterRange(new_from, new_to); 5453 return count; 5454 } 5455 // Replace a number of existing ranges from start_pos to end_pos - 1. 5456 // Move the remaining ranges down. 5457 5458 int new_from = Min(list->at(start_pos).from(), from); 5459 int new_to = Max(list->at(end_pos - 1).to(), to); 5460 if (end_pos < count) { 5461 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 5462 } 5463 list->at(start_pos) = CharacterRange(new_from, new_to); 5464 return count - (end_pos - start_pos) + 1; 5465 } 5466 5467 5468 void CharacterSet::Canonicalize() { 5469 // Special/default classes are always considered canonical. The result 5470 // of calling ranges() will be sorted. 5471 if (ranges_ == NULL) return; 5472 CharacterRange::Canonicalize(ranges_); 5473 } 5474 5475 5476 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 5477 if (character_ranges->length() <= 1) return; 5478 // Check whether ranges are already canonical (increasing, non-overlapping, 5479 // non-adjacent). 5480 int n = character_ranges->length(); 5481 int max = character_ranges->at(0).to(); 5482 int i = 1; 5483 while (i < n) { 5484 CharacterRange current = character_ranges->at(i); 5485 if (current.from() <= max + 1) { 5486 break; 5487 } 5488 max = current.to(); 5489 i++; 5490 } 5491 // Canonical until the i'th range. If that's all of them, we are done. 5492 if (i == n) return; 5493 5494 // The ranges at index i and forward are not canonicalized. Make them so by 5495 // doing the equivalent of insertion sort (inserting each into the previous 5496 // list, in order). 5497 // Notice that inserting a range can reduce the number of ranges in the 5498 // result due to combining of adjacent and overlapping ranges. 5499 int read = i; // Range to insert. 5500 int num_canonical = i; // Length of canonicalized part of list. 5501 do { 5502 num_canonical = InsertRangeInCanonicalList(character_ranges, 5503 num_canonical, 5504 character_ranges->at(read)); 5505 read++; 5506 } while (read < n); 5507 character_ranges->Rewind(num_canonical); 5508 5509 ASSERT(CharacterRange::IsCanonical(character_ranges)); 5510 } 5511 5512 5513 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5514 ZoneList<CharacterRange>* negated_ranges, 5515 Zone* zone) { 5516 ASSERT(CharacterRange::IsCanonical(ranges)); 5517 ASSERT_EQ(0, negated_ranges->length()); 5518 int range_count = ranges->length(); 5519 uc16 from = 0; 5520 int i = 0; 5521 if (range_count > 0 && ranges->at(0).from() == 0) { 5522 from = ranges->at(0).to(); 5523 i = 1; 5524 } 5525 while (i < range_count) { 5526 CharacterRange range = ranges->at(i); 5527 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone); 5528 from = range.to(); 5529 i++; 5530 } 5531 if (from < String::kMaxUtf16CodeUnit) { 5532 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit), 5533 zone); 5534 } 5535 } 5536 5537 5538 // ------------------------------------------------------------------- 5539 // Splay tree 5540 5541 5542 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 5543 if (Get(value)) 5544 return this; 5545 if (successors(zone) != NULL) { 5546 for (int i = 0; i < successors(zone)->length(); i++) { 5547 OutSet* successor = successors(zone)->at(i); 5548 if (successor->Get(value)) 5549 return successor; 5550 } 5551 } else { 5552 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 5553 } 5554 OutSet* result = new(zone) OutSet(first_, remaining_); 5555 result->Set(value, zone); 5556 successors(zone)->Add(result, zone); 5557 return result; 5558 } 5559 5560 5561 void OutSet::Set(unsigned value, Zone *zone) { 5562 if (value < kFirstLimit) { 5563 first_ |= (1 << value); 5564 } else { 5565 if (remaining_ == NULL) 5566 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 5567 if (remaining_->is_empty() || !remaining_->Contains(value)) 5568 remaining_->Add(value, zone); 5569 } 5570 } 5571 5572 5573 bool OutSet::Get(unsigned value) { 5574 if (value < kFirstLimit) { 5575 return (first_ & (1 << value)) != 0; 5576 } else if (remaining_ == NULL) { 5577 return false; 5578 } else { 5579 return remaining_->Contains(value); 5580 } 5581 } 5582 5583 5584 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 5585 5586 5587 void DispatchTable::AddRange(CharacterRange full_range, int value, 5588 Zone* zone) { 5589 CharacterRange current = full_range; 5590 if (tree()->is_empty()) { 5591 // If this is the first range we just insert into the table. 5592 ZoneSplayTree<Config>::Locator loc; 5593 ASSERT_RESULT(tree()->Insert(current.from(), &loc)); 5594 loc.set_value(Entry(current.from(), current.to(), 5595 empty()->Extend(value, zone))); 5596 return; 5597 } 5598 // First see if there is a range to the left of this one that 5599 // overlaps. 5600 ZoneSplayTree<Config>::Locator loc; 5601 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 5602 Entry* entry = &loc.value(); 5603 // If we've found a range that overlaps with this one, and it 5604 // starts strictly to the left of this one, we have to fix it 5605 // because the following code only handles ranges that start on 5606 // or after the start point of the range we're adding. 5607 if (entry->from() < current.from() && entry->to() >= current.from()) { 5608 // Snap the overlapping range in half around the start point of 5609 // the range we're adding. 5610 CharacterRange left(entry->from(), current.from() - 1); 5611 CharacterRange right(current.from(), entry->to()); 5612 // The left part of the overlapping range doesn't overlap. 5613 // Truncate the whole entry to be just the left part. 5614 entry->set_to(left.to()); 5615 // The right part is the one that overlaps. We add this part 5616 // to the map and let the next step deal with merging it with 5617 // the range we're adding. 5618 ZoneSplayTree<Config>::Locator loc; 5619 ASSERT_RESULT(tree()->Insert(right.from(), &loc)); 5620 loc.set_value(Entry(right.from(), 5621 right.to(), 5622 entry->out_set())); 5623 } 5624 } 5625 while (current.is_valid()) { 5626 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 5627 (loc.value().from() <= current.to()) && 5628 (loc.value().to() >= current.from())) { 5629 Entry* entry = &loc.value(); 5630 // We have overlap. If there is space between the start point of 5631 // the range we're adding and where the overlapping range starts 5632 // then we have to add a range covering just that space. 5633 if (current.from() < entry->from()) { 5634 ZoneSplayTree<Config>::Locator ins; 5635 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 5636 ins.set_value(Entry(current.from(), 5637 entry->from() - 1, 5638 empty()->Extend(value, zone))); 5639 current.set_from(entry->from()); 5640 } 5641 ASSERT_EQ(current.from(), entry->from()); 5642 // If the overlapping range extends beyond the one we want to add 5643 // we have to snap the right part off and add it separately. 5644 if (entry->to() > current.to()) { 5645 ZoneSplayTree<Config>::Locator ins; 5646 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); 5647 ins.set_value(Entry(current.to() + 1, 5648 entry->to(), 5649 entry->out_set())); 5650 entry->set_to(current.to()); 5651 } 5652 ASSERT(entry->to() <= current.to()); 5653 // The overlapping range is now completely contained by the range 5654 // we're adding so we can just update it and move the start point 5655 // of the range we're adding just past it. 5656 entry->AddValue(value, zone); 5657 // Bail out if the last interval ended at 0xFFFF since otherwise 5658 // adding 1 will wrap around to 0. 5659 if (entry->to() == String::kMaxUtf16CodeUnit) 5660 break; 5661 ASSERT(entry->to() + 1 > current.from()); 5662 current.set_from(entry->to() + 1); 5663 } else { 5664 // There is no overlap so we can just add the range 5665 ZoneSplayTree<Config>::Locator ins; 5666 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 5667 ins.set_value(Entry(current.from(), 5668 current.to(), 5669 empty()->Extend(value, zone))); 5670 break; 5671 } 5672 } 5673 } 5674 5675 5676 OutSet* DispatchTable::Get(uc16 value) { 5677 ZoneSplayTree<Config>::Locator loc; 5678 if (!tree()->FindGreatestLessThan(value, &loc)) 5679 return empty(); 5680 Entry* entry = &loc.value(); 5681 if (value <= entry->to()) 5682 return entry->out_set(); 5683 else 5684 return empty(); 5685 } 5686 5687 5688 // ------------------------------------------------------------------- 5689 // Analysis 5690 5691 5692 void Analysis::EnsureAnalyzed(RegExpNode* that) { 5693 StackLimitCheck check(Isolate::Current()); 5694 if (check.HasOverflowed()) { 5695 fail("Stack overflow"); 5696 return; 5697 } 5698 if (that->info()->been_analyzed || that->info()->being_analyzed) 5699 return; 5700 that->info()->being_analyzed = true; 5701 that->Accept(this); 5702 that->info()->being_analyzed = false; 5703 that->info()->been_analyzed = true; 5704 } 5705 5706 5707 void Analysis::VisitEnd(EndNode* that) { 5708 // nothing to do 5709 } 5710 5711 5712 void TextNode::CalculateOffsets() { 5713 int element_count = elements()->length(); 5714 // Set up the offsets of the elements relative to the start. This is a fixed 5715 // quantity since a TextNode can only contain fixed-width things. 5716 int cp_offset = 0; 5717 for (int i = 0; i < element_count; i++) { 5718 TextElement& elm = elements()->at(i); 5719 elm.cp_offset = cp_offset; 5720 if (elm.text_type == TextElement::ATOM) { 5721 cp_offset += elm.data.u_atom->data().length(); 5722 } else { 5723 cp_offset++; 5724 } 5725 } 5726 } 5727 5728 5729 void Analysis::VisitText(TextNode* that) { 5730 if (ignore_case_) { 5731 that->MakeCaseIndependent(is_ascii_); 5732 } 5733 EnsureAnalyzed(that->on_success()); 5734 if (!has_failed()) { 5735 that->CalculateOffsets(); 5736 } 5737 } 5738 5739 5740 void Analysis::VisitAction(ActionNode* that) { 5741 RegExpNode* target = that->on_success(); 5742 EnsureAnalyzed(target); 5743 if (!has_failed()) { 5744 // If the next node is interested in what it follows then this node 5745 // has to be interested too so it can pass the information on. 5746 that->info()->AddFromFollowing(target->info()); 5747 } 5748 } 5749 5750 5751 void Analysis::VisitChoice(ChoiceNode* that) { 5752 NodeInfo* info = that->info(); 5753 for (int i = 0; i < that->alternatives()->length(); i++) { 5754 RegExpNode* node = that->alternatives()->at(i).node(); 5755 EnsureAnalyzed(node); 5756 if (has_failed()) return; 5757 // Anything the following nodes need to know has to be known by 5758 // this node also, so it can pass it on. 5759 info->AddFromFollowing(node->info()); 5760 } 5761 } 5762 5763 5764 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 5765 NodeInfo* info = that->info(); 5766 for (int i = 0; i < that->alternatives()->length(); i++) { 5767 RegExpNode* node = that->alternatives()->at(i).node(); 5768 if (node != that->loop_node()) { 5769 EnsureAnalyzed(node); 5770 if (has_failed()) return; 5771 info->AddFromFollowing(node->info()); 5772 } 5773 } 5774 // Check the loop last since it may need the value of this node 5775 // to get a correct result. 5776 EnsureAnalyzed(that->loop_node()); 5777 if (!has_failed()) { 5778 info->AddFromFollowing(that->loop_node()->info()); 5779 } 5780 } 5781 5782 5783 void Analysis::VisitBackReference(BackReferenceNode* that) { 5784 EnsureAnalyzed(that->on_success()); 5785 } 5786 5787 5788 void Analysis::VisitAssertion(AssertionNode* that) { 5789 EnsureAnalyzed(that->on_success()); 5790 } 5791 5792 5793 void BackReferenceNode::FillInBMInfo(int offset, 5794 int budget, 5795 BoyerMooreLookahead* bm, 5796 bool not_at_start) { 5797 // Working out the set of characters that a backreference can match is too 5798 // hard, so we just say that any character can match. 5799 bm->SetRest(offset); 5800 SaveBMInfo(bm, not_at_start, offset); 5801 } 5802 5803 5804 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 5805 RegExpMacroAssembler::kTableSize); 5806 5807 5808 void ChoiceNode::FillInBMInfo(int offset, 5809 int budget, 5810 BoyerMooreLookahead* bm, 5811 bool not_at_start) { 5812 ZoneList<GuardedAlternative>* alts = alternatives(); 5813 budget = (budget - 1) / alts->length(); 5814 for (int i = 0; i < alts->length(); i++) { 5815 GuardedAlternative& alt = alts->at(i); 5816 if (alt.guards() != NULL && alt.guards()->length() != 0) { 5817 bm->SetRest(offset); // Give up trying to fill in info. 5818 SaveBMInfo(bm, not_at_start, offset); 5819 return; 5820 } 5821 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start); 5822 } 5823 SaveBMInfo(bm, not_at_start, offset); 5824 } 5825 5826 5827 void TextNode::FillInBMInfo(int initial_offset, 5828 int budget, 5829 BoyerMooreLookahead* bm, 5830 bool not_at_start) { 5831 if (initial_offset >= bm->length()) return; 5832 int offset = initial_offset; 5833 int max_char = bm->max_char(); 5834 for (int i = 0; i < elements()->length(); i++) { 5835 if (offset >= bm->length()) { 5836 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5837 return; 5838 } 5839 TextElement text = elements()->at(i); 5840 if (text.text_type == TextElement::ATOM) { 5841 RegExpAtom* atom = text.data.u_atom; 5842 for (int j = 0; j < atom->length(); j++, offset++) { 5843 if (offset >= bm->length()) { 5844 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5845 return; 5846 } 5847 uc16 character = atom->data()[j]; 5848 if (bm->compiler()->ignore_case()) { 5849 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5850 int length = GetCaseIndependentLetters( 5851 ISOLATE, 5852 character, 5853 bm->max_char() == String::kMaxOneByteCharCode, 5854 chars); 5855 for (int j = 0; j < length; j++) { 5856 bm->Set(offset, chars[j]); 5857 } 5858 } else { 5859 if (character <= max_char) bm->Set(offset, character); 5860 } 5861 } 5862 } else { 5863 ASSERT(text.text_type == TextElement::CHAR_CLASS); 5864 RegExpCharacterClass* char_class = text.data.u_char_class; 5865 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 5866 if (char_class->is_negated()) { 5867 bm->SetAll(offset); 5868 } else { 5869 for (int k = 0; k < ranges->length(); k++) { 5870 CharacterRange& range = ranges->at(k); 5871 if (range.from() > max_char) continue; 5872 int to = Min(max_char, static_cast<int>(range.to())); 5873 bm->SetInterval(offset, Interval(range.from(), to)); 5874 } 5875 } 5876 offset++; 5877 } 5878 } 5879 if (offset >= bm->length()) { 5880 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5881 return; 5882 } 5883 on_success()->FillInBMInfo(offset, 5884 budget - 1, 5885 bm, 5886 true); // Not at start after a text node. 5887 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5888 } 5889 5890 5891 // ------------------------------------------------------------------- 5892 // Dispatch table construction 5893 5894 5895 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5896 AddRange(CharacterRange::Everything()); 5897 } 5898 5899 5900 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5901 node->set_being_calculated(true); 5902 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5903 for (int i = 0; i < alternatives->length(); i++) { 5904 set_choice_index(i); 5905 alternatives->at(i).node()->Accept(this); 5906 } 5907 node->set_being_calculated(false); 5908 } 5909 5910 5911 class AddDispatchRange { 5912 public: 5913 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5914 : constructor_(constructor) { } 5915 void Call(uc32 from, DispatchTable::Entry entry); 5916 private: 5917 DispatchTableConstructor* constructor_; 5918 }; 5919 5920 5921 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5922 CharacterRange range(from, entry.to()); 5923 constructor_->AddRange(range); 5924 } 5925 5926 5927 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5928 if (node->being_calculated()) 5929 return; 5930 DispatchTable* table = node->GetTable(ignore_case_); 5931 AddDispatchRange adder(this); 5932 table->ForEach(&adder); 5933 } 5934 5935 5936 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5937 // TODO(160): Find the node that we refer back to and propagate its start 5938 // set back to here. For now we just accept anything. 5939 AddRange(CharacterRange::Everything()); 5940 } 5941 5942 5943 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5944 RegExpNode* target = that->on_success(); 5945 target->Accept(this); 5946 } 5947 5948 5949 static int CompareRangeByFrom(const CharacterRange* a, 5950 const CharacterRange* b) { 5951 return Compare<uc16>(a->from(), b->from()); 5952 } 5953 5954 5955 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5956 ranges->Sort(CompareRangeByFrom); 5957 uc16 last = 0; 5958 for (int i = 0; i < ranges->length(); i++) { 5959 CharacterRange range = ranges->at(i); 5960 if (last < range.from()) 5961 AddRange(CharacterRange(last, range.from() - 1)); 5962 if (range.to() >= last) { 5963 if (range.to() == String::kMaxUtf16CodeUnit) { 5964 return; 5965 } else { 5966 last = range.to() + 1; 5967 } 5968 } 5969 } 5970 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit)); 5971 } 5972 5973 5974 void DispatchTableConstructor::VisitText(TextNode* that) { 5975 TextElement elm = that->elements()->at(0); 5976 switch (elm.text_type) { 5977 case TextElement::ATOM: { 5978 uc16 c = elm.data.u_atom->data()[0]; 5979 AddRange(CharacterRange(c, c)); 5980 break; 5981 } 5982 case TextElement::CHAR_CLASS: { 5983 RegExpCharacterClass* tree = elm.data.u_char_class; 5984 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 5985 if (tree->is_negated()) { 5986 AddInverse(ranges); 5987 } else { 5988 for (int i = 0; i < ranges->length(); i++) 5989 AddRange(ranges->at(i)); 5990 } 5991 break; 5992 } 5993 default: { 5994 UNIMPLEMENTED(); 5995 } 5996 } 5997 } 5998 5999 6000 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6001 RegExpNode* target = that->on_success(); 6002 target->Accept(this); 6003 } 6004 6005 6006 RegExpEngine::CompilationResult RegExpEngine::Compile( 6007 RegExpCompileData* data, 6008 bool ignore_case, 6009 bool is_global, 6010 bool is_multiline, 6011 Handle<String> pattern, 6012 Handle<String> sample_subject, 6013 bool is_ascii, 6014 Zone* zone) { 6015 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6016 return IrregexpRegExpTooBig(); 6017 } 6018 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone); 6019 6020 // Sample some characters from the middle of the string. 6021 static const int kSampleSize = 128; 6022 6023 FlattenString(sample_subject); 6024 int chars_sampled = 0; 6025 int half_way = (sample_subject->length() - kSampleSize) / 2; 6026 for (int i = Max(0, half_way); 6027 i < sample_subject->length() && chars_sampled < kSampleSize; 6028 i++, chars_sampled++) { 6029 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6030 } 6031 6032 // Wrap the body of the regexp in capture #0. 6033 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6034 0, 6035 &compiler, 6036 compiler.accept()); 6037 RegExpNode* node = captured_body; 6038 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6039 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6040 int max_length = data->tree->max_match(); 6041 if (!is_start_anchored) { 6042 // Add a .*? at the beginning, outside the body capture, unless 6043 // this expression is anchored at the beginning. 6044 RegExpNode* loop_node = 6045 RegExpQuantifier::ToNode(0, 6046 RegExpTree::kInfinity, 6047 false, 6048 new(zone) RegExpCharacterClass('*'), 6049 &compiler, 6050 captured_body, 6051 data->contains_anchor); 6052 6053 if (data->contains_anchor) { 6054 // Unroll loop once, to take care of the case that might start 6055 // at the start of input. 6056 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6057 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6058 first_step_node->AddAlternative(GuardedAlternative( 6059 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); 6060 node = first_step_node; 6061 } else { 6062 node = loop_node; 6063 } 6064 } 6065 if (is_ascii) { 6066 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); 6067 // Do it again to propagate the new nodes to places where they were not 6068 // put because they had not been calculated yet. 6069 if (node != NULL) { 6070 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); 6071 } 6072 } 6073 6074 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6075 data->node = node; 6076 Analysis analysis(ignore_case, is_ascii); 6077 analysis.EnsureAnalyzed(node); 6078 if (analysis.has_failed()) { 6079 const char* error_message = analysis.error_message(); 6080 return CompilationResult(error_message); 6081 } 6082 6083 // Create the correct assembler for the architecture. 6084 #ifndef V8_INTERPRETED_REGEXP 6085 // Native regexp implementation. 6086 6087 NativeRegExpMacroAssembler::Mode mode = 6088 is_ascii ? NativeRegExpMacroAssembler::ASCII 6089 : NativeRegExpMacroAssembler::UC16; 6090 6091 #if V8_TARGET_ARCH_IA32 6092 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, 6093 zone); 6094 #elif V8_TARGET_ARCH_X64 6095 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, 6096 zone); 6097 #elif V8_TARGET_ARCH_ARM 6098 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, 6099 zone); 6100 #elif V8_TARGET_ARCH_MIPS 6101 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2, 6102 zone); 6103 #endif 6104 6105 #else // V8_INTERPRETED_REGEXP 6106 // Interpreted regexp implementation. 6107 EmbeddedVector<byte, 1024> codes; 6108 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone); 6109 #endif // V8_INTERPRETED_REGEXP 6110 6111 // Inserted here, instead of in Assembler, because it depends on information 6112 // in the AST that isn't replicated in the Node structure. 6113 static const int kMaxBacksearchLimit = 1024; 6114 if (is_end_anchored && 6115 !is_start_anchored && 6116 max_length < kMaxBacksearchLimit) { 6117 macro_assembler.SetCurrentPositionFromEnd(max_length); 6118 } 6119 6120 if (is_global) { 6121 macro_assembler.set_global_mode( 6122 (data->tree->min_match() > 0) 6123 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK 6124 : RegExpMacroAssembler::GLOBAL); 6125 } 6126 6127 return compiler.Assemble(¯o_assembler, 6128 node, 6129 data->capture_count, 6130 pattern); 6131 } 6132 6133 6134 }} // namespace v8::internal 6135