1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/v8.h" 6 7 #include "src/ast.h" 8 #include "src/compiler.h" 9 #include "src/execution.h" 10 #include "src/factory.h" 11 #include "src/jsregexp.h" 12 #include "src/jsregexp-inl.h" 13 #include "src/platform.h" 14 #include "src/string-search.h" 15 #include "src/runtime.h" 16 #include "src/compilation-cache.h" 17 #include "src/string-stream.h" 18 #include "src/parser.h" 19 #include "src/regexp-macro-assembler.h" 20 #include "src/regexp-macro-assembler-tracer.h" 21 #include "src/regexp-macro-assembler-irregexp.h" 22 #include "src/regexp-stack.h" 23 24 #ifndef V8_INTERPRETED_REGEXP 25 #if V8_TARGET_ARCH_IA32 26 #include "src/ia32/regexp-macro-assembler-ia32.h" 27 #elif V8_TARGET_ARCH_X64 28 #include "src/x64/regexp-macro-assembler-x64.h" 29 #elif V8_TARGET_ARCH_ARM64 30 #include "src/arm64/regexp-macro-assembler-arm64.h" 31 #elif V8_TARGET_ARCH_ARM 32 #include "src/arm/regexp-macro-assembler-arm.h" 33 #elif V8_TARGET_ARCH_MIPS 34 #include "src/mips/regexp-macro-assembler-mips.h" 35 #elif V8_TARGET_ARCH_X87 36 #include "src/x87/regexp-macro-assembler-x87.h" 37 #else 38 #error Unsupported target architecture. 39 #endif 40 #endif 41 42 #include "src/interpreter-irregexp.h" 43 44 45 namespace v8 { 46 namespace internal { 47 48 MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral( 49 Handle<JSFunction> constructor, 50 Handle<String> pattern, 51 Handle<String> flags) { 52 // Call the construct code with 2 arguments. 53 Handle<Object> argv[] = { pattern, flags }; 54 return Execution::New(constructor, ARRAY_SIZE(argv), argv); 55 } 56 57 58 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 59 int flags = JSRegExp::NONE; 60 for (int i = 0; i < str->length(); i++) { 61 switch (str->Get(i)) { 62 case 'i': 63 flags |= JSRegExp::IGNORE_CASE; 64 break; 65 case 'g': 66 flags |= JSRegExp::GLOBAL; 67 break; 68 case 'm': 69 flags |= JSRegExp::MULTILINE; 70 break; 71 } 72 } 73 return JSRegExp::Flags(flags); 74 } 75 76 77 MUST_USE_RESULT 78 static inline MaybeHandle<Object> ThrowRegExpException( 79 Handle<JSRegExp> re, 80 Handle<String> pattern, 81 Handle<String> error_text, 82 const char* message) { 83 Isolate* isolate = re->GetIsolate(); 84 Factory* factory = isolate->factory(); 85 Handle<FixedArray> elements = factory->NewFixedArray(2); 86 elements->set(0, *pattern); 87 elements->set(1, *error_text); 88 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 89 Handle<Object> regexp_err = factory->NewSyntaxError(message, array); 90 return isolate->Throw<Object>(regexp_err); 91 } 92 93 94 ContainedInLattice AddRange(ContainedInLattice containment, 95 const int* ranges, 96 int ranges_length, 97 Interval new_range) { 98 ASSERT((ranges_length & 1) == 1); 99 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); 100 if (containment == kLatticeUnknown) return containment; 101 bool inside = false; 102 int last = 0; 103 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 104 // Consider the range from last to ranges[i]. 105 // We haven't got to the new range yet. 106 if (ranges[i] <= new_range.from()) continue; 107 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 108 // inclusive, but the values in ranges are not. 109 if (last <= new_range.from() && new_range.to() < ranges[i]) { 110 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 111 } 112 return kLatticeUnknown; 113 } 114 return containment; 115 } 116 117 118 // More makes code generation slower, less makes V8 benchmark score lower. 119 const int kMaxLookaheadForBoyerMoore = 8; 120 // In a 3-character pattern you can maximally step forwards 3 characters 121 // at a time, which is not always enough to pay for the extra logic. 122 const int kPatternTooShortForBoyerMoore = 2; 123 124 125 // Identifies the sort of regexps where the regexp engine is faster 126 // than the code used for atom matches. 127 static bool HasFewDifferentCharacters(Handle<String> pattern) { 128 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 129 if (length <= kPatternTooShortForBoyerMoore) return false; 130 const int kMod = 128; 131 bool character_found[kMod]; 132 int different = 0; 133 memset(&character_found[0], 0, sizeof(character_found)); 134 for (int i = 0; i < length; i++) { 135 int ch = (pattern->Get(i) & (kMod - 1)); 136 if (!character_found[ch]) { 137 character_found[ch] = true; 138 different++; 139 // We declare a regexp low-alphabet if it has at least 3 times as many 140 // characters as it has different characters. 141 if (different * 3 > length) return false; 142 } 143 } 144 return true; 145 } 146 147 148 // Generic RegExp methods. Dispatches to implementation specific methods. 149 150 151 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 152 Handle<String> pattern, 153 Handle<String> flag_str) { 154 Isolate* isolate = re->GetIsolate(); 155 Zone zone(isolate); 156 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 157 CompilationCache* compilation_cache = isolate->compilation_cache(); 158 MaybeHandle<FixedArray> maybe_cached = 159 compilation_cache->LookupRegExp(pattern, flags); 160 Handle<FixedArray> cached; 161 bool in_cache = maybe_cached.ToHandle(&cached); 162 LOG(isolate, RegExpCompileEvent(re, in_cache)); 163 164 Handle<Object> result; 165 if (in_cache) { 166 re->set_data(*cached); 167 return re; 168 } 169 pattern = String::Flatten(pattern); 170 PostponeInterruptsScope postpone(isolate); 171 RegExpCompileData parse_result; 172 FlatStringReader reader(isolate, pattern); 173 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 174 &parse_result, &zone)) { 175 // Throw an exception if we fail to parse the pattern. 176 return ThrowRegExpException(re, 177 pattern, 178 parse_result.error, 179 "malformed_regexp"); 180 } 181 182 bool has_been_compiled = false; 183 184 if (parse_result.simple && 185 !flags.is_ignore_case() && 186 !HasFewDifferentCharacters(pattern)) { 187 // Parse-tree is a single atom that is equal to the pattern. 188 AtomCompile(re, pattern, flags, pattern); 189 has_been_compiled = true; 190 } else if (parse_result.tree->IsAtom() && 191 !flags.is_ignore_case() && 192 parse_result.capture_count == 0) { 193 RegExpAtom* atom = parse_result.tree->AsAtom(); 194 Vector<const uc16> atom_pattern = atom->data(); 195 Handle<String> atom_string; 196 ASSIGN_RETURN_ON_EXCEPTION( 197 isolate, atom_string, 198 isolate->factory()->NewStringFromTwoByte(atom_pattern), 199 Object); 200 if (!HasFewDifferentCharacters(atom_string)) { 201 AtomCompile(re, pattern, flags, atom_string); 202 has_been_compiled = true; 203 } 204 } 205 if (!has_been_compiled) { 206 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 207 } 208 ASSERT(re->data()->IsFixedArray()); 209 // Compilation succeeded so the data is set on the regexp 210 // and we can store it in the cache. 211 Handle<FixedArray> data(FixedArray::cast(re->data())); 212 compilation_cache->PutRegExp(pattern, flags, data); 213 214 return re; 215 } 216 217 218 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 219 Handle<String> subject, 220 int index, 221 Handle<JSArray> last_match_info) { 222 switch (regexp->TypeTag()) { 223 case JSRegExp::ATOM: 224 return AtomExec(regexp, subject, index, last_match_info); 225 case JSRegExp::IRREGEXP: { 226 return IrregexpExec(regexp, subject, index, last_match_info); 227 } 228 default: 229 UNREACHABLE(); 230 return MaybeHandle<Object>(); 231 } 232 } 233 234 235 // RegExp Atom implementation: Simple string search using indexOf. 236 237 238 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 239 Handle<String> pattern, 240 JSRegExp::Flags flags, 241 Handle<String> match_pattern) { 242 re->GetIsolate()->factory()->SetRegExpAtomData(re, 243 JSRegExp::ATOM, 244 pattern, 245 flags, 246 match_pattern); 247 } 248 249 250 static void SetAtomLastCapture(FixedArray* array, 251 String* subject, 252 int from, 253 int to) { 254 SealHandleScope shs(array->GetIsolate()); 255 RegExpImpl::SetLastCaptureCount(array, 2); 256 RegExpImpl::SetLastSubject(array, subject); 257 RegExpImpl::SetLastInput(array, subject); 258 RegExpImpl::SetCapture(array, 0, from); 259 RegExpImpl::SetCapture(array, 1, to); 260 } 261 262 263 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 264 Handle<String> subject, 265 int index, 266 int32_t* output, 267 int output_size) { 268 Isolate* isolate = regexp->GetIsolate(); 269 270 ASSERT(0 <= index); 271 ASSERT(index <= subject->length()); 272 273 subject = String::Flatten(subject); 274 DisallowHeapAllocation no_gc; // ensure vectors stay valid 275 276 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 277 int needle_len = needle->length(); 278 ASSERT(needle->IsFlat()); 279 ASSERT_LT(0, needle_len); 280 281 if (index + needle_len > subject->length()) { 282 return RegExpImpl::RE_FAILURE; 283 } 284 285 for (int i = 0; i < output_size; i += 2) { 286 String::FlatContent needle_content = needle->GetFlatContent(); 287 String::FlatContent subject_content = subject->GetFlatContent(); 288 ASSERT(needle_content.IsFlat()); 289 ASSERT(subject_content.IsFlat()); 290 // dispatch on type of strings 291 index = (needle_content.IsAscii() 292 ? (subject_content.IsAscii() 293 ? SearchString(isolate, 294 subject_content.ToOneByteVector(), 295 needle_content.ToOneByteVector(), 296 index) 297 : SearchString(isolate, 298 subject_content.ToUC16Vector(), 299 needle_content.ToOneByteVector(), 300 index)) 301 : (subject_content.IsAscii() 302 ? SearchString(isolate, 303 subject_content.ToOneByteVector(), 304 needle_content.ToUC16Vector(), 305 index) 306 : SearchString(isolate, 307 subject_content.ToUC16Vector(), 308 needle_content.ToUC16Vector(), 309 index))); 310 if (index == -1) { 311 return i / 2; // Return number of matches. 312 } else { 313 output[i] = index; 314 output[i+1] = index + needle_len; 315 index += needle_len; 316 } 317 } 318 return output_size / 2; 319 } 320 321 322 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 323 Handle<String> subject, 324 int index, 325 Handle<JSArray> last_match_info) { 326 Isolate* isolate = re->GetIsolate(); 327 328 static const int kNumRegisters = 2; 329 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 330 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 331 332 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 333 334 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 335 336 ASSERT_EQ(res, RegExpImpl::RE_SUCCESS); 337 SealHandleScope shs(isolate); 338 FixedArray* array = FixedArray::cast(last_match_info->elements()); 339 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); 340 return last_match_info; 341 } 342 343 344 // Irregexp implementation. 345 346 // Ensures that the regexp object contains a compiled version of the 347 // source for either ASCII or non-ASCII strings. 348 // If the compiled version doesn't already exist, it is compiled 349 // from the source pattern. 350 // If compilation fails, an exception is thrown and this function 351 // returns false. 352 bool RegExpImpl::EnsureCompiledIrregexp( 353 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { 354 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 355 #ifdef V8_INTERPRETED_REGEXP 356 if (compiled_code->IsByteArray()) return true; 357 #else // V8_INTERPRETED_REGEXP (RegExp native code) 358 if (compiled_code->IsCode()) return true; 359 #endif 360 // We could potentially have marked this as flushable, but have kept 361 // a saved version if we did not flush it yet. 362 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 363 if (saved_code->IsCode()) { 364 // Reinstate the code in the original place. 365 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); 366 ASSERT(compiled_code->IsSmi()); 367 return true; 368 } 369 return CompileIrregexp(re, sample_subject, is_ascii); 370 } 371 372 373 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 374 bool is_ascii, 375 Handle<String> error_message, 376 Isolate* isolate) { 377 Factory* factory = isolate->factory(); 378 Handle<FixedArray> elements = factory->NewFixedArray(2); 379 elements->set(0, re->Pattern()); 380 elements->set(1, *error_message); 381 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 382 Handle<Object> regexp_err = 383 factory->NewSyntaxError("malformed_regexp", array); 384 isolate->Throw(*regexp_err); 385 return false; 386 } 387 388 389 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 390 Handle<String> sample_subject, 391 bool is_ascii) { 392 // Compile the RegExp. 393 Isolate* isolate = re->GetIsolate(); 394 Zone zone(isolate); 395 PostponeInterruptsScope postpone(isolate); 396 // If we had a compilation error the last time this is saved at the 397 // saved code index. 398 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 399 // When arriving here entry can only be a smi, either representing an 400 // uncompiled regexp, a previous compilation error, or code that has 401 // been flushed. 402 ASSERT(entry->IsSmi()); 403 int entry_value = Smi::cast(entry)->value(); 404 ASSERT(entry_value == JSRegExp::kUninitializedValue || 405 entry_value == JSRegExp::kCompilationErrorValue || 406 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 407 408 if (entry_value == JSRegExp::kCompilationErrorValue) { 409 // A previous compilation failed and threw an error which we store in 410 // the saved code index (we store the error message, not the actual 411 // error). Recreate the error object and throw it. 412 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 413 ASSERT(error_string->IsString()); 414 Handle<String> error_message(String::cast(error_string)); 415 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 416 return false; 417 } 418 419 JSRegExp::Flags flags = re->GetFlags(); 420 421 Handle<String> pattern(re->Pattern()); 422 pattern = String::Flatten(pattern); 423 RegExpCompileData compile_data; 424 FlatStringReader reader(isolate, pattern); 425 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 426 &compile_data, 427 &zone)) { 428 // Throw an exception if we fail to parse the pattern. 429 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 430 USE(ThrowRegExpException(re, 431 pattern, 432 compile_data.error, 433 "malformed_regexp")); 434 return false; 435 } 436 RegExpEngine::CompilationResult result = 437 RegExpEngine::Compile(&compile_data, 438 flags.is_ignore_case(), 439 flags.is_global(), 440 flags.is_multiline(), 441 pattern, 442 sample_subject, 443 is_ascii, 444 &zone); 445 if (result.error_message != NULL) { 446 // Unable to compile regexp. 447 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 448 CStrVector(result.error_message)).ToHandleChecked(); 449 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 450 return false; 451 } 452 453 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 454 data->set(JSRegExp::code_index(is_ascii), result.code); 455 int register_max = IrregexpMaxRegisterCount(*data); 456 if (result.num_registers > register_max) { 457 SetIrregexpMaxRegisterCount(*data, result.num_registers); 458 } 459 460 return true; 461 } 462 463 464 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 465 return Smi::cast( 466 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 467 } 468 469 470 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 471 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 472 } 473 474 475 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 476 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 477 } 478 479 480 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 481 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 482 } 483 484 485 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { 486 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); 487 } 488 489 490 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { 491 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); 492 } 493 494 495 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 496 Handle<String> pattern, 497 JSRegExp::Flags flags, 498 int capture_count) { 499 // Initialize compiled code entries to null. 500 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 501 JSRegExp::IRREGEXP, 502 pattern, 503 flags, 504 capture_count); 505 } 506 507 508 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 509 Handle<String> subject) { 510 subject = String::Flatten(subject); 511 512 // Check the asciiness of the underlying storage. 513 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); 514 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; 515 516 #ifdef V8_INTERPRETED_REGEXP 517 // Byte-code regexp needs space allocated for all its registers. 518 // The result captures are copied to the start of the registers array 519 // if the match succeeds. This way those registers are not clobbered 520 // when we set the last match info from last successful match. 521 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 522 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 523 #else // V8_INTERPRETED_REGEXP 524 // Native regexp only needs room to output captures. Registers are handled 525 // internally. 526 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 527 #endif // V8_INTERPRETED_REGEXP 528 } 529 530 531 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 532 Handle<String> subject, 533 int index, 534 int32_t* output, 535 int output_size) { 536 Isolate* isolate = regexp->GetIsolate(); 537 538 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 539 540 ASSERT(index >= 0); 541 ASSERT(index <= subject->length()); 542 ASSERT(subject->IsFlat()); 543 544 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); 545 546 #ifndef V8_INTERPRETED_REGEXP 547 ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 548 do { 549 EnsureCompiledIrregexp(regexp, subject, is_ascii); 550 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 551 // The stack is used to allocate registers for the compiled regexp code. 552 // This means that in case of failure, the output registers array is left 553 // untouched and contains the capture results from the previous successful 554 // match. We can use that to set the last match info lazily. 555 NativeRegExpMacroAssembler::Result res = 556 NativeRegExpMacroAssembler::Match(code, 557 subject, 558 output, 559 output_size, 560 index, 561 isolate); 562 if (res != NativeRegExpMacroAssembler::RETRY) { 563 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 564 isolate->has_pending_exception()); 565 STATIC_ASSERT( 566 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 567 STATIC_ASSERT( 568 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 569 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 570 == RE_EXCEPTION); 571 return static_cast<IrregexpResult>(res); 572 } 573 // If result is RETRY, the string has changed representation, and we 574 // must restart from scratch. 575 // In this case, it means we must make sure we are prepared to handle 576 // the, potentially, different subject (the string can switch between 577 // being internal and external, and even between being ASCII and UC16, 578 // but the characters are always the same). 579 IrregexpPrepare(regexp, subject); 580 is_ascii = subject->IsOneByteRepresentationUnderneath(); 581 } while (true); 582 UNREACHABLE(); 583 return RE_EXCEPTION; 584 #else // V8_INTERPRETED_REGEXP 585 586 ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp)); 587 // We must have done EnsureCompiledIrregexp, so we can get the number of 588 // registers. 589 int number_of_capture_registers = 590 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 591 int32_t* raw_output = &output[number_of_capture_registers]; 592 // We do not touch the actual capture result registers until we know there 593 // has been a match so that we can use those capture results to set the 594 // last match info. 595 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 596 raw_output[i] = -1; 597 } 598 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); 599 600 IrregexpResult result = IrregexpInterpreter::Match(isolate, 601 byte_codes, 602 subject, 603 raw_output, 604 index); 605 if (result == RE_SUCCESS) { 606 // Copy capture results to the start of the registers array. 607 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 608 } 609 if (result == RE_EXCEPTION) { 610 ASSERT(!isolate->has_pending_exception()); 611 isolate->StackOverflow(); 612 } 613 return result; 614 #endif // V8_INTERPRETED_REGEXP 615 } 616 617 618 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, 619 Handle<String> subject, 620 int previous_index, 621 Handle<JSArray> last_match_info) { 622 Isolate* isolate = regexp->GetIsolate(); 623 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 624 625 // Prepare space for the return values. 626 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 627 if (FLAG_trace_regexp_bytecodes) { 628 String* pattern = regexp->Pattern(); 629 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 630 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 631 } 632 #endif 633 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 634 if (required_registers < 0) { 635 // Compiling failed with an exception. 636 ASSERT(isolate->has_pending_exception()); 637 return MaybeHandle<Object>(); 638 } 639 640 int32_t* output_registers = NULL; 641 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 642 output_registers = NewArray<int32_t>(required_registers); 643 } 644 SmartArrayPointer<int32_t> auto_release(output_registers); 645 if (output_registers == NULL) { 646 output_registers = isolate->jsregexp_static_offsets_vector(); 647 } 648 649 int res = RegExpImpl::IrregexpExecRaw( 650 regexp, subject, previous_index, output_registers, required_registers); 651 if (res == RE_SUCCESS) { 652 int capture_count = 653 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 654 return SetLastMatchInfo( 655 last_match_info, subject, capture_count, output_registers); 656 } 657 if (res == RE_EXCEPTION) { 658 ASSERT(isolate->has_pending_exception()); 659 return MaybeHandle<Object>(); 660 } 661 ASSERT(res == RE_FAILURE); 662 return isolate->factory()->null_value(); 663 } 664 665 666 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, 667 Handle<String> subject, 668 int capture_count, 669 int32_t* match) { 670 ASSERT(last_match_info->HasFastObjectElements()); 671 int capture_register_count = (capture_count + 1) * 2; 672 JSArray::EnsureSize(last_match_info, 673 capture_register_count + kLastMatchOverhead); 674 DisallowHeapAllocation no_allocation; 675 FixedArray* array = FixedArray::cast(last_match_info->elements()); 676 if (match != NULL) { 677 for (int i = 0; i < capture_register_count; i += 2) { 678 SetCapture(array, i, match[i]); 679 SetCapture(array, i + 1, match[i + 1]); 680 } 681 } 682 SetLastCaptureCount(array, capture_register_count); 683 SetLastSubject(array, *subject); 684 SetLastInput(array, *subject); 685 return last_match_info; 686 } 687 688 689 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 690 Handle<String> subject, 691 bool is_global, 692 Isolate* isolate) 693 : register_array_(NULL), 694 register_array_size_(0), 695 regexp_(regexp), 696 subject_(subject) { 697 #ifdef V8_INTERPRETED_REGEXP 698 bool interpreted = true; 699 #else 700 bool interpreted = false; 701 #endif // V8_INTERPRETED_REGEXP 702 703 if (regexp_->TypeTag() == JSRegExp::ATOM) { 704 static const int kAtomRegistersPerMatch = 2; 705 registers_per_match_ = kAtomRegistersPerMatch; 706 // There is no distinction between interpreted and native for atom regexps. 707 interpreted = false; 708 } else { 709 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 710 if (registers_per_match_ < 0) { 711 num_matches_ = -1; // Signal exception. 712 return; 713 } 714 } 715 716 if (is_global && !interpreted) { 717 register_array_size_ = 718 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 719 max_matches_ = register_array_size_ / registers_per_match_; 720 } else { 721 // Global loop in interpreted regexp is not implemented. We choose 722 // the size of the offsets vector so that it can only store one match. 723 register_array_size_ = registers_per_match_; 724 max_matches_ = 1; 725 } 726 727 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 728 register_array_ = NewArray<int32_t>(register_array_size_); 729 } else { 730 register_array_ = isolate->jsregexp_static_offsets_vector(); 731 } 732 733 // Set state so that fetching the results the first time triggers a call 734 // to the compiled regexp. 735 current_match_index_ = max_matches_ - 1; 736 num_matches_ = max_matches_; 737 ASSERT(registers_per_match_ >= 2); // Each match has at least one capture. 738 ASSERT_GE(register_array_size_, registers_per_match_); 739 int32_t* last_match = 740 ®ister_array_[current_match_index_ * registers_per_match_]; 741 last_match[0] = -1; 742 last_match[1] = 0; 743 } 744 745 746 // ------------------------------------------------------------------- 747 // Implementation of the Irregexp regular expression engine. 748 // 749 // The Irregexp regular expression engine is intended to be a complete 750 // implementation of ECMAScript regular expressions. It generates either 751 // bytecodes or native code. 752 753 // The Irregexp regexp engine is structured in three steps. 754 // 1) The parser generates an abstract syntax tree. See ast.cc. 755 // 2) From the AST a node network is created. The nodes are all 756 // subclasses of RegExpNode. The nodes represent states when 757 // executing a regular expression. Several optimizations are 758 // performed on the node network. 759 // 3) From the nodes we generate either byte codes or native code 760 // that can actually execute the regular expression (perform 761 // the search). The code generation step is described in more 762 // detail below. 763 764 // Code generation. 765 // 766 // The nodes are divided into four main categories. 767 // * Choice nodes 768 // These represent places where the regular expression can 769 // match in more than one way. For example on entry to an 770 // alternation (foo|bar) or a repetition (*, +, ? or {}). 771 // * Action nodes 772 // These represent places where some action should be 773 // performed. Examples include recording the current position 774 // in the input string to a register (in order to implement 775 // captures) or other actions on register for example in order 776 // to implement the counters needed for {} repetitions. 777 // * Matching nodes 778 // These attempt to match some element part of the input string. 779 // Examples of elements include character classes, plain strings 780 // or back references. 781 // * End nodes 782 // These are used to implement the actions required on finding 783 // a successful match or failing to find a match. 784 // 785 // The code generated (whether as byte codes or native code) maintains 786 // some state as it runs. This consists of the following elements: 787 // 788 // * The capture registers. Used for string captures. 789 // * Other registers. Used for counters etc. 790 // * The current position. 791 // * The stack of backtracking information. Used when a matching node 792 // fails to find a match and needs to try an alternative. 793 // 794 // Conceptual regular expression execution model: 795 // 796 // There is a simple conceptual model of regular expression execution 797 // which will be presented first. The actual code generated is a more 798 // efficient simulation of the simple conceptual model: 799 // 800 // * Choice nodes are implemented as follows: 801 // For each choice except the last { 802 // push current position 803 // push backtrack code location 804 // <generate code to test for choice> 805 // backtrack code location: 806 // pop current position 807 // } 808 // <generate code to test for last choice> 809 // 810 // * Actions nodes are generated as follows 811 // <push affected registers on backtrack stack> 812 // <generate code to perform action> 813 // push backtrack code location 814 // <generate code to test for following nodes> 815 // backtrack code location: 816 // <pop affected registers to restore their state> 817 // <pop backtrack location from stack and go to it> 818 // 819 // * Matching nodes are generated as follows: 820 // if input string matches at current position 821 // update current position 822 // <generate code to test for following nodes> 823 // else 824 // <pop backtrack location from stack and go to it> 825 // 826 // Thus it can be seen that the current position is saved and restored 827 // by the choice nodes, whereas the registers are saved and restored by 828 // by the action nodes that manipulate them. 829 // 830 // The other interesting aspect of this model is that nodes are generated 831 // at the point where they are needed by a recursive call to Emit(). If 832 // the node has already been code generated then the Emit() call will 833 // generate a jump to the previously generated code instead. In order to 834 // limit recursion it is possible for the Emit() function to put the node 835 // on a work list for later generation and instead generate a jump. The 836 // destination of the jump is resolved later when the code is generated. 837 // 838 // Actual regular expression code generation. 839 // 840 // Code generation is actually more complicated than the above. In order 841 // to improve the efficiency of the generated code some optimizations are 842 // performed 843 // 844 // * Choice nodes have 1-character lookahead. 845 // A choice node looks at the following character and eliminates some of 846 // the choices immediately based on that character. This is not yet 847 // implemented. 848 // * Simple greedy loops store reduced backtracking information. 849 // A quantifier like /.*foo/m will greedily match the whole input. It will 850 // then need to backtrack to a point where it can match "foo". The naive 851 // implementation of this would push each character position onto the 852 // backtracking stack, then pop them off one by one. This would use space 853 // proportional to the length of the input string. However since the "." 854 // can only match in one way and always has a constant length (in this case 855 // of 1) it suffices to store the current position on the top of the stack 856 // once. Matching now becomes merely incrementing the current position and 857 // backtracking becomes decrementing the current position and checking the 858 // result against the stored current position. This is faster and saves 859 // space. 860 // * The current state is virtualized. 861 // This is used to defer expensive operations until it is clear that they 862 // are needed and to generate code for a node more than once, allowing 863 // specialized an efficient versions of the code to be created. This is 864 // explained in the section below. 865 // 866 // Execution state virtualization. 867 // 868 // Instead of emitting code, nodes that manipulate the state can record their 869 // manipulation in an object called the Trace. The Trace object can record a 870 // current position offset, an optional backtrack code location on the top of 871 // the virtualized backtrack stack and some register changes. When a node is 872 // to be emitted it can flush the Trace or update it. Flushing the Trace 873 // will emit code to bring the actual state into line with the virtual state. 874 // Avoiding flushing the state can postpone some work (e.g. updates of capture 875 // registers). Postponing work can save time when executing the regular 876 // expression since it may be found that the work never has to be done as a 877 // failure to match can occur. In addition it is much faster to jump to a 878 // known backtrack code location than it is to pop an unknown backtrack 879 // location from the stack and jump there. 880 // 881 // The virtual state found in the Trace affects code generation. For example 882 // the virtual state contains the difference between the actual current 883 // position and the virtual current position, and matching code needs to use 884 // this offset to attempt a match in the correct location of the input 885 // string. Therefore code generated for a non-trivial trace is specialized 886 // to that trace. The code generator therefore has the ability to generate 887 // code for each node several times. In order to limit the size of the 888 // generated code there is an arbitrary limit on how many specialized sets of 889 // code may be generated for a given node. If the limit is reached, the 890 // trace is flushed and a generic version of the code for a node is emitted. 891 // This is subsequently used for that node. The code emitted for non-generic 892 // trace is not recorded in the node and so it cannot currently be reused in 893 // the event that code generation is requested for an identical trace. 894 895 896 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 897 UNREACHABLE(); 898 } 899 900 901 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 902 text->AddElement(TextElement::Atom(this), zone); 903 } 904 905 906 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 907 text->AddElement(TextElement::CharClass(this), zone); 908 } 909 910 911 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 912 for (int i = 0; i < elements()->length(); i++) 913 text->AddElement(elements()->at(i), zone); 914 } 915 916 917 TextElement TextElement::Atom(RegExpAtom* atom) { 918 return TextElement(ATOM, atom); 919 } 920 921 922 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 923 return TextElement(CHAR_CLASS, char_class); 924 } 925 926 927 int TextElement::length() const { 928 switch (text_type()) { 929 case ATOM: 930 return atom()->length(); 931 932 case CHAR_CLASS: 933 return 1; 934 } 935 UNREACHABLE(); 936 return 0; 937 } 938 939 940 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 941 if (table_ == NULL) { 942 table_ = new(zone()) DispatchTable(zone()); 943 DispatchTableConstructor cons(table_, ignore_case, zone()); 944 cons.BuildTable(this); 945 } 946 return table_; 947 } 948 949 950 class FrequencyCollator { 951 public: 952 FrequencyCollator() : total_samples_(0) { 953 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 954 frequencies_[i] = CharacterFrequency(i); 955 } 956 } 957 958 void CountCharacter(int character) { 959 int index = (character & RegExpMacroAssembler::kTableMask); 960 frequencies_[index].Increment(); 961 total_samples_++; 962 } 963 964 // Does not measure in percent, but rather per-128 (the table size from the 965 // regexp macro assembler). 966 int Frequency(int in_character) { 967 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); 968 if (total_samples_ < 1) return 1; // Division by zero. 969 int freq_in_per128 = 970 (frequencies_[in_character].counter() * 128) / total_samples_; 971 return freq_in_per128; 972 } 973 974 private: 975 class CharacterFrequency { 976 public: 977 CharacterFrequency() : counter_(0), character_(-1) { } 978 explicit CharacterFrequency(int character) 979 : counter_(0), character_(character) { } 980 981 void Increment() { counter_++; } 982 int counter() { return counter_; } 983 int character() { return character_; } 984 985 private: 986 int counter_; 987 int character_; 988 }; 989 990 991 private: 992 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 993 int total_samples_; 994 }; 995 996 997 class RegExpCompiler { 998 public: 999 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, 1000 Zone* zone); 1001 1002 int AllocateRegister() { 1003 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 1004 reg_exp_too_big_ = true; 1005 return next_register_; 1006 } 1007 return next_register_++; 1008 } 1009 1010 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 1011 RegExpNode* start, 1012 int capture_count, 1013 Handle<String> pattern); 1014 1015 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 1016 1017 static const int kImplementationOffset = 0; 1018 static const int kNumberOfRegistersOffset = 0; 1019 static const int kCodeOffset = 1; 1020 1021 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1022 EndNode* accept() { return accept_; } 1023 1024 static const int kMaxRecursion = 100; 1025 inline int recursion_depth() { return recursion_depth_; } 1026 inline void IncrementRecursionDepth() { recursion_depth_++; } 1027 inline void DecrementRecursionDepth() { recursion_depth_--; } 1028 1029 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1030 1031 inline bool ignore_case() { return ignore_case_; } 1032 inline bool ascii() { return ascii_; } 1033 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1034 1035 int current_expansion_factor() { return current_expansion_factor_; } 1036 void set_current_expansion_factor(int value) { 1037 current_expansion_factor_ = value; 1038 } 1039 1040 Zone* zone() const { return zone_; } 1041 1042 static const int kNoRegister = -1; 1043 1044 private: 1045 EndNode* accept_; 1046 int next_register_; 1047 List<RegExpNode*>* work_list_; 1048 int recursion_depth_; 1049 RegExpMacroAssembler* macro_assembler_; 1050 bool ignore_case_; 1051 bool ascii_; 1052 bool reg_exp_too_big_; 1053 int current_expansion_factor_; 1054 FrequencyCollator frequency_collator_; 1055 Zone* zone_; 1056 }; 1057 1058 1059 class RecursionCheck { 1060 public: 1061 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1062 compiler->IncrementRecursionDepth(); 1063 } 1064 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1065 private: 1066 RegExpCompiler* compiler_; 1067 }; 1068 1069 1070 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1071 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1072 } 1073 1074 1075 // Attempts to compile the regexp using an Irregexp code generator. Returns 1076 // a fixed array or a null handle depending on whether it succeeded. 1077 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii, 1078 Zone* zone) 1079 : next_register_(2 * (capture_count + 1)), 1080 work_list_(NULL), 1081 recursion_depth_(0), 1082 ignore_case_(ignore_case), 1083 ascii_(ascii), 1084 reg_exp_too_big_(false), 1085 current_expansion_factor_(1), 1086 frequency_collator_(), 1087 zone_(zone) { 1088 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1089 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1090 } 1091 1092 1093 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1094 RegExpMacroAssembler* macro_assembler, 1095 RegExpNode* start, 1096 int capture_count, 1097 Handle<String> pattern) { 1098 Heap* heap = pattern->GetHeap(); 1099 1100 bool use_slow_safe_regexp_compiler = false; 1101 if (heap->total_regexp_code_generated() > 1102 RegExpImpl::kRegWxpCompiledLimit && 1103 heap->isolate()->memory_allocator()->SizeExecutable() > 1104 RegExpImpl::kRegExpExecutableMemoryLimit) { 1105 use_slow_safe_regexp_compiler = true; 1106 } 1107 1108 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 1109 1110 #ifdef DEBUG 1111 if (FLAG_trace_regexp_assembler) 1112 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 1113 else 1114 #endif 1115 macro_assembler_ = macro_assembler; 1116 1117 List <RegExpNode*> work_list(0); 1118 work_list_ = &work_list; 1119 Label fail; 1120 macro_assembler_->PushBacktrack(&fail); 1121 Trace new_trace; 1122 start->Emit(this, &new_trace); 1123 macro_assembler_->Bind(&fail); 1124 macro_assembler_->Fail(); 1125 while (!work_list.is_empty()) { 1126 work_list.RemoveLast()->Emit(this, &new_trace); 1127 } 1128 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate()); 1129 1130 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1131 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1132 work_list_ = NULL; 1133 #ifdef DEBUG 1134 if (FLAG_print_code) { 1135 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer()); 1136 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), 1137 trace_scope.file()); 1138 } 1139 if (FLAG_trace_regexp_assembler) { 1140 delete macro_assembler_; 1141 } 1142 #endif 1143 return RegExpEngine::CompilationResult(*code, next_register_); 1144 } 1145 1146 1147 bool Trace::DeferredAction::Mentions(int that) { 1148 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1149 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1150 return range.Contains(that); 1151 } else { 1152 return reg() == that; 1153 } 1154 } 1155 1156 1157 bool Trace::mentions_reg(int reg) { 1158 for (DeferredAction* action = actions_; 1159 action != NULL; 1160 action = action->next()) { 1161 if (action->Mentions(reg)) 1162 return true; 1163 } 1164 return false; 1165 } 1166 1167 1168 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1169 ASSERT_EQ(0, *cp_offset); 1170 for (DeferredAction* action = actions_; 1171 action != NULL; 1172 action = action->next()) { 1173 if (action->Mentions(reg)) { 1174 if (action->action_type() == ActionNode::STORE_POSITION) { 1175 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1176 return true; 1177 } else { 1178 return false; 1179 } 1180 } 1181 } 1182 return false; 1183 } 1184 1185 1186 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1187 Zone* zone) { 1188 int max_register = RegExpCompiler::kNoRegister; 1189 for (DeferredAction* action = actions_; 1190 action != NULL; 1191 action = action->next()) { 1192 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1193 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1194 for (int i = range.from(); i <= range.to(); i++) 1195 affected_registers->Set(i, zone); 1196 if (range.to() > max_register) max_register = range.to(); 1197 } else { 1198 affected_registers->Set(action->reg(), zone); 1199 if (action->reg() > max_register) max_register = action->reg(); 1200 } 1201 } 1202 return max_register; 1203 } 1204 1205 1206 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1207 int max_register, 1208 const OutSet& registers_to_pop, 1209 const OutSet& registers_to_clear) { 1210 for (int reg = max_register; reg >= 0; reg--) { 1211 if (registers_to_pop.Get(reg)) { 1212 assembler->PopRegister(reg); 1213 } else if (registers_to_clear.Get(reg)) { 1214 int clear_to = reg; 1215 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1216 reg--; 1217 } 1218 assembler->ClearRegisters(reg, clear_to); 1219 } 1220 } 1221 } 1222 1223 1224 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1225 int max_register, 1226 const OutSet& affected_registers, 1227 OutSet* registers_to_pop, 1228 OutSet* registers_to_clear, 1229 Zone* zone) { 1230 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1231 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1232 1233 // Count pushes performed to force a stack limit check occasionally. 1234 int pushes = 0; 1235 1236 for (int reg = 0; reg <= max_register; reg++) { 1237 if (!affected_registers.Get(reg)) { 1238 continue; 1239 } 1240 1241 // The chronologically first deferred action in the trace 1242 // is used to infer the action needed to restore a register 1243 // to its previous state (or not, if it's safe to ignore it). 1244 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1245 DeferredActionUndoType undo_action = IGNORE; 1246 1247 int value = 0; 1248 bool absolute = false; 1249 bool clear = false; 1250 int store_position = -1; 1251 // This is a little tricky because we are scanning the actions in reverse 1252 // historical order (newest first). 1253 for (DeferredAction* action = actions_; 1254 action != NULL; 1255 action = action->next()) { 1256 if (action->Mentions(reg)) { 1257 switch (action->action_type()) { 1258 case ActionNode::SET_REGISTER: { 1259 Trace::DeferredSetRegister* psr = 1260 static_cast<Trace::DeferredSetRegister*>(action); 1261 if (!absolute) { 1262 value += psr->value(); 1263 absolute = true; 1264 } 1265 // SET_REGISTER is currently only used for newly introduced loop 1266 // counters. They can have a significant previous value if they 1267 // occour in a loop. TODO(lrn): Propagate this information, so 1268 // we can set undo_action to IGNORE if we know there is no value to 1269 // restore. 1270 undo_action = RESTORE; 1271 ASSERT_EQ(store_position, -1); 1272 ASSERT(!clear); 1273 break; 1274 } 1275 case ActionNode::INCREMENT_REGISTER: 1276 if (!absolute) { 1277 value++; 1278 } 1279 ASSERT_EQ(store_position, -1); 1280 ASSERT(!clear); 1281 undo_action = RESTORE; 1282 break; 1283 case ActionNode::STORE_POSITION: { 1284 Trace::DeferredCapture* pc = 1285 static_cast<Trace::DeferredCapture*>(action); 1286 if (!clear && store_position == -1) { 1287 store_position = pc->cp_offset(); 1288 } 1289 1290 // For captures we know that stores and clears alternate. 1291 // Other register, are never cleared, and if the occur 1292 // inside a loop, they might be assigned more than once. 1293 if (reg <= 1) { 1294 // Registers zero and one, aka "capture zero", is 1295 // always set correctly if we succeed. There is no 1296 // need to undo a setting on backtrack, because we 1297 // will set it again or fail. 1298 undo_action = IGNORE; 1299 } else { 1300 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1301 } 1302 ASSERT(!absolute); 1303 ASSERT_EQ(value, 0); 1304 break; 1305 } 1306 case ActionNode::CLEAR_CAPTURES: { 1307 // Since we're scanning in reverse order, if we've already 1308 // set the position we have to ignore historically earlier 1309 // clearing operations. 1310 if (store_position == -1) { 1311 clear = true; 1312 } 1313 undo_action = RESTORE; 1314 ASSERT(!absolute); 1315 ASSERT_EQ(value, 0); 1316 break; 1317 } 1318 default: 1319 UNREACHABLE(); 1320 break; 1321 } 1322 } 1323 } 1324 // Prepare for the undo-action (e.g., push if it's going to be popped). 1325 if (undo_action == RESTORE) { 1326 pushes++; 1327 RegExpMacroAssembler::StackCheckFlag stack_check = 1328 RegExpMacroAssembler::kNoStackLimitCheck; 1329 if (pushes == push_limit) { 1330 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1331 pushes = 0; 1332 } 1333 1334 assembler->PushRegister(reg, stack_check); 1335 registers_to_pop->Set(reg, zone); 1336 } else if (undo_action == CLEAR) { 1337 registers_to_clear->Set(reg, zone); 1338 } 1339 // Perform the chronologically last action (or accumulated increment) 1340 // for the register. 1341 if (store_position != -1) { 1342 assembler->WriteCurrentPositionToRegister(reg, store_position); 1343 } else if (clear) { 1344 assembler->ClearRegisters(reg, reg); 1345 } else if (absolute) { 1346 assembler->SetRegister(reg, value); 1347 } else if (value != 0) { 1348 assembler->AdvanceRegister(reg, value); 1349 } 1350 } 1351 } 1352 1353 1354 // This is called as we come into a loop choice node and some other tricky 1355 // nodes. It normalizes the state of the code generator to ensure we can 1356 // generate generic code. 1357 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1358 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1359 1360 ASSERT(!is_trivial()); 1361 1362 if (actions_ == NULL && backtrack() == NULL) { 1363 // Here we just have some deferred cp advances to fix and we are back to 1364 // a normal situation. We may also have to forget some information gained 1365 // through a quick check that was already performed. 1366 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1367 // Create a new trivial state and generate the node with that. 1368 Trace new_state; 1369 successor->Emit(compiler, &new_state); 1370 return; 1371 } 1372 1373 // Generate deferred actions here along with code to undo them again. 1374 OutSet affected_registers; 1375 1376 if (backtrack() != NULL) { 1377 // Here we have a concrete backtrack location. These are set up by choice 1378 // nodes and so they indicate that we have a deferred save of the current 1379 // position which we may need to emit here. 1380 assembler->PushCurrentPosition(); 1381 } 1382 1383 int max_register = FindAffectedRegisters(&affected_registers, 1384 compiler->zone()); 1385 OutSet registers_to_pop; 1386 OutSet registers_to_clear; 1387 PerformDeferredActions(assembler, 1388 max_register, 1389 affected_registers, 1390 ®isters_to_pop, 1391 ®isters_to_clear, 1392 compiler->zone()); 1393 if (cp_offset_ != 0) { 1394 assembler->AdvanceCurrentPosition(cp_offset_); 1395 } 1396 1397 // Create a new trivial state and generate the node with that. 1398 Label undo; 1399 assembler->PushBacktrack(&undo); 1400 Trace new_state; 1401 successor->Emit(compiler, &new_state); 1402 1403 // On backtrack we need to restore state. 1404 assembler->Bind(&undo); 1405 RestoreAffectedRegisters(assembler, 1406 max_register, 1407 registers_to_pop, 1408 registers_to_clear); 1409 if (backtrack() == NULL) { 1410 assembler->Backtrack(); 1411 } else { 1412 assembler->PopCurrentPosition(); 1413 assembler->GoTo(backtrack()); 1414 } 1415 } 1416 1417 1418 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1419 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1420 1421 // Omit flushing the trace. We discard the entire stack frame anyway. 1422 1423 if (!label()->is_bound()) { 1424 // We are completely independent of the trace, since we ignore it, 1425 // so this code can be used as the generic version. 1426 assembler->Bind(label()); 1427 } 1428 1429 // Throw away everything on the backtrack stack since the start 1430 // of the negative submatch and restore the character position. 1431 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1432 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1433 if (clear_capture_count_ > 0) { 1434 // Clear any captures that might have been performed during the success 1435 // of the body of the negative look-ahead. 1436 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1437 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1438 } 1439 // Now that we have unwound the stack we find at the top of the stack the 1440 // backtrack that the BeginSubmatch node got. 1441 assembler->Backtrack(); 1442 } 1443 1444 1445 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1446 if (!trace->is_trivial()) { 1447 trace->Flush(compiler, this); 1448 return; 1449 } 1450 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1451 if (!label()->is_bound()) { 1452 assembler->Bind(label()); 1453 } 1454 switch (action_) { 1455 case ACCEPT: 1456 assembler->Succeed(); 1457 return; 1458 case BACKTRACK: 1459 assembler->GoTo(trace->backtrack()); 1460 return; 1461 case NEGATIVE_SUBMATCH_SUCCESS: 1462 // This case is handled in a different virtual method. 1463 UNREACHABLE(); 1464 } 1465 UNIMPLEMENTED(); 1466 } 1467 1468 1469 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1470 if (guards_ == NULL) 1471 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1472 guards_->Add(guard, zone); 1473 } 1474 1475 1476 ActionNode* ActionNode::SetRegister(int reg, 1477 int val, 1478 RegExpNode* on_success) { 1479 ActionNode* result = 1480 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1481 result->data_.u_store_register.reg = reg; 1482 result->data_.u_store_register.value = val; 1483 return result; 1484 } 1485 1486 1487 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1488 ActionNode* result = 1489 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1490 result->data_.u_increment_register.reg = reg; 1491 return result; 1492 } 1493 1494 1495 ActionNode* ActionNode::StorePosition(int reg, 1496 bool is_capture, 1497 RegExpNode* on_success) { 1498 ActionNode* result = 1499 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1500 result->data_.u_position_register.reg = reg; 1501 result->data_.u_position_register.is_capture = is_capture; 1502 return result; 1503 } 1504 1505 1506 ActionNode* ActionNode::ClearCaptures(Interval range, 1507 RegExpNode* on_success) { 1508 ActionNode* result = 1509 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1510 result->data_.u_clear_captures.range_from = range.from(); 1511 result->data_.u_clear_captures.range_to = range.to(); 1512 return result; 1513 } 1514 1515 1516 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1517 int position_reg, 1518 RegExpNode* on_success) { 1519 ActionNode* result = 1520 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1521 result->data_.u_submatch.stack_pointer_register = stack_reg; 1522 result->data_.u_submatch.current_position_register = position_reg; 1523 return result; 1524 } 1525 1526 1527 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1528 int position_reg, 1529 int clear_register_count, 1530 int clear_register_from, 1531 RegExpNode* on_success) { 1532 ActionNode* result = 1533 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1534 result->data_.u_submatch.stack_pointer_register = stack_reg; 1535 result->data_.u_submatch.current_position_register = position_reg; 1536 result->data_.u_submatch.clear_register_count = clear_register_count; 1537 result->data_.u_submatch.clear_register_from = clear_register_from; 1538 return result; 1539 } 1540 1541 1542 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1543 int repetition_register, 1544 int repetition_limit, 1545 RegExpNode* on_success) { 1546 ActionNode* result = 1547 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1548 result->data_.u_empty_match_check.start_register = start_register; 1549 result->data_.u_empty_match_check.repetition_register = repetition_register; 1550 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1551 return result; 1552 } 1553 1554 1555 #define DEFINE_ACCEPT(Type) \ 1556 void Type##Node::Accept(NodeVisitor* visitor) { \ 1557 visitor->Visit##Type(this); \ 1558 } 1559 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1560 #undef DEFINE_ACCEPT 1561 1562 1563 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1564 visitor->VisitLoopChoice(this); 1565 } 1566 1567 1568 // ------------------------------------------------------------------- 1569 // Emit code. 1570 1571 1572 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1573 Guard* guard, 1574 Trace* trace) { 1575 switch (guard->op()) { 1576 case Guard::LT: 1577 ASSERT(!trace->mentions_reg(guard->reg())); 1578 macro_assembler->IfRegisterGE(guard->reg(), 1579 guard->value(), 1580 trace->backtrack()); 1581 break; 1582 case Guard::GEQ: 1583 ASSERT(!trace->mentions_reg(guard->reg())); 1584 macro_assembler->IfRegisterLT(guard->reg(), 1585 guard->value(), 1586 trace->backtrack()); 1587 break; 1588 } 1589 } 1590 1591 1592 // Returns the number of characters in the equivalence class, omitting those 1593 // that cannot occur in the source string because it is ASCII. 1594 static int GetCaseIndependentLetters(Isolate* isolate, 1595 uc16 character, 1596 bool ascii_subject, 1597 unibrow::uchar* letters) { 1598 int length = 1599 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1600 // Unibrow returns 0 or 1 for characters where case independence is 1601 // trivial. 1602 if (length == 0) { 1603 letters[0] = character; 1604 length = 1; 1605 } 1606 if (!ascii_subject || character <= String::kMaxOneByteCharCode) { 1607 return length; 1608 } 1609 // The standard requires that non-ASCII characters cannot have ASCII 1610 // character codes in their equivalence class. 1611 return 0; 1612 } 1613 1614 1615 static inline bool EmitSimpleCharacter(Isolate* isolate, 1616 RegExpCompiler* compiler, 1617 uc16 c, 1618 Label* on_failure, 1619 int cp_offset, 1620 bool check, 1621 bool preloaded) { 1622 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1623 bool bound_checked = false; 1624 if (!preloaded) { 1625 assembler->LoadCurrentCharacter( 1626 cp_offset, 1627 on_failure, 1628 check); 1629 bound_checked = true; 1630 } 1631 assembler->CheckNotCharacter(c, on_failure); 1632 return bound_checked; 1633 } 1634 1635 1636 // Only emits non-letters (things that don't have case). Only used for case 1637 // independent matches. 1638 static inline bool EmitAtomNonLetter(Isolate* isolate, 1639 RegExpCompiler* compiler, 1640 uc16 c, 1641 Label* on_failure, 1642 int cp_offset, 1643 bool check, 1644 bool preloaded) { 1645 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1646 bool ascii = compiler->ascii(); 1647 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1648 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1649 if (length < 1) { 1650 // This can't match. Must be an ASCII subject and a non-ASCII character. 1651 // We do not need to do anything since the ASCII pass already handled this. 1652 return false; // Bounds not checked. 1653 } 1654 bool checked = false; 1655 // We handle the length > 1 case in a later pass. 1656 if (length == 1) { 1657 if (ascii && c > String::kMaxOneByteCharCodeU) { 1658 // Can't match - see above. 1659 return false; // Bounds not checked. 1660 } 1661 if (!preloaded) { 1662 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1663 checked = check; 1664 } 1665 macro_assembler->CheckNotCharacter(c, on_failure); 1666 } 1667 return checked; 1668 } 1669 1670 1671 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1672 bool ascii, 1673 uc16 c1, 1674 uc16 c2, 1675 Label* on_failure) { 1676 uc16 char_mask; 1677 if (ascii) { 1678 char_mask = String::kMaxOneByteCharCode; 1679 } else { 1680 char_mask = String::kMaxUtf16CodeUnit; 1681 } 1682 uc16 exor = c1 ^ c2; 1683 // Check whether exor has only one bit set. 1684 if (((exor - 1) & exor) == 0) { 1685 // If c1 and c2 differ only by one bit. 1686 // Ecma262UnCanonicalize always gives the highest number last. 1687 ASSERT(c2 > c1); 1688 uc16 mask = char_mask ^ exor; 1689 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1690 return true; 1691 } 1692 ASSERT(c2 > c1); 1693 uc16 diff = c2 - c1; 1694 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1695 // If the characters differ by 2^n but don't differ by one bit then 1696 // subtract the difference from the found character, then do the or 1697 // trick. We avoid the theoretical case where negative numbers are 1698 // involved in order to simplify code generation. 1699 uc16 mask = char_mask ^ diff; 1700 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1701 diff, 1702 mask, 1703 on_failure); 1704 return true; 1705 } 1706 return false; 1707 } 1708 1709 1710 typedef bool EmitCharacterFunction(Isolate* isolate, 1711 RegExpCompiler* compiler, 1712 uc16 c, 1713 Label* on_failure, 1714 int cp_offset, 1715 bool check, 1716 bool preloaded); 1717 1718 // Only emits letters (things that have case). Only used for case independent 1719 // matches. 1720 static inline bool EmitAtomLetter(Isolate* isolate, 1721 RegExpCompiler* compiler, 1722 uc16 c, 1723 Label* on_failure, 1724 int cp_offset, 1725 bool check, 1726 bool preloaded) { 1727 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1728 bool ascii = compiler->ascii(); 1729 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1730 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); 1731 if (length <= 1) return false; 1732 // We may not need to check against the end of the input string 1733 // if this character lies before a character that matched. 1734 if (!preloaded) { 1735 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1736 } 1737 Label ok; 1738 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1739 switch (length) { 1740 case 2: { 1741 if (ShortCutEmitCharacterPair(macro_assembler, 1742 ascii, 1743 chars[0], 1744 chars[1], 1745 on_failure)) { 1746 } else { 1747 macro_assembler->CheckCharacter(chars[0], &ok); 1748 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1749 macro_assembler->Bind(&ok); 1750 } 1751 break; 1752 } 1753 case 4: 1754 macro_assembler->CheckCharacter(chars[3], &ok); 1755 // Fall through! 1756 case 3: 1757 macro_assembler->CheckCharacter(chars[0], &ok); 1758 macro_assembler->CheckCharacter(chars[1], &ok); 1759 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1760 macro_assembler->Bind(&ok); 1761 break; 1762 default: 1763 UNREACHABLE(); 1764 break; 1765 } 1766 return true; 1767 } 1768 1769 1770 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1771 int border, 1772 Label* fall_through, 1773 Label* above_or_equal, 1774 Label* below) { 1775 if (below != fall_through) { 1776 masm->CheckCharacterLT(border, below); 1777 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1778 } else { 1779 masm->CheckCharacterGT(border - 1, above_or_equal); 1780 } 1781 } 1782 1783 1784 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1785 int first, 1786 int last, 1787 Label* fall_through, 1788 Label* in_range, 1789 Label* out_of_range) { 1790 if (in_range == fall_through) { 1791 if (first == last) { 1792 masm->CheckNotCharacter(first, out_of_range); 1793 } else { 1794 masm->CheckCharacterNotInRange(first, last, out_of_range); 1795 } 1796 } else { 1797 if (first == last) { 1798 masm->CheckCharacter(first, in_range); 1799 } else { 1800 masm->CheckCharacterInRange(first, last, in_range); 1801 } 1802 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1803 } 1804 } 1805 1806 1807 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1808 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1809 static void EmitUseLookupTable( 1810 RegExpMacroAssembler* masm, 1811 ZoneList<int>* ranges, 1812 int start_index, 1813 int end_index, 1814 int min_char, 1815 Label* fall_through, 1816 Label* even_label, 1817 Label* odd_label) { 1818 static const int kSize = RegExpMacroAssembler::kTableSize; 1819 static const int kMask = RegExpMacroAssembler::kTableMask; 1820 1821 int base = (min_char & ~kMask); 1822 USE(base); 1823 1824 // Assert that everything is on one kTableSize page. 1825 for (int i = start_index; i <= end_index; i++) { 1826 ASSERT_EQ(ranges->at(i) & ~kMask, base); 1827 } 1828 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1829 1830 char templ[kSize]; 1831 Label* on_bit_set; 1832 Label* on_bit_clear; 1833 int bit; 1834 if (even_label == fall_through) { 1835 on_bit_set = odd_label; 1836 on_bit_clear = even_label; 1837 bit = 1; 1838 } else { 1839 on_bit_set = even_label; 1840 on_bit_clear = odd_label; 1841 bit = 0; 1842 } 1843 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1844 templ[i] = bit; 1845 } 1846 int j = 0; 1847 bit ^= 1; 1848 for (int i = start_index; i < end_index; i++) { 1849 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1850 templ[j] = bit; 1851 } 1852 bit ^= 1; 1853 } 1854 for (int i = j; i < kSize; i++) { 1855 templ[i] = bit; 1856 } 1857 Factory* factory = masm->zone()->isolate()->factory(); 1858 // TODO(erikcorry): Cache these. 1859 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1860 for (int i = 0; i < kSize; i++) { 1861 ba->set(i, templ[i]); 1862 } 1863 masm->CheckBitInTable(ba, on_bit_set); 1864 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1865 } 1866 1867 1868 static void CutOutRange(RegExpMacroAssembler* masm, 1869 ZoneList<int>* ranges, 1870 int start_index, 1871 int end_index, 1872 int cut_index, 1873 Label* even_label, 1874 Label* odd_label) { 1875 bool odd = (((cut_index - start_index) & 1) == 1); 1876 Label* in_range_label = odd ? odd_label : even_label; 1877 Label dummy; 1878 EmitDoubleBoundaryTest(masm, 1879 ranges->at(cut_index), 1880 ranges->at(cut_index + 1) - 1, 1881 &dummy, 1882 in_range_label, 1883 &dummy); 1884 ASSERT(!dummy.is_linked()); 1885 // Cut out the single range by rewriting the array. This creates a new 1886 // range that is a merger of the two ranges on either side of the one we 1887 // are cutting out. The oddity of the labels is preserved. 1888 for (int j = cut_index; j > start_index; j--) { 1889 ranges->at(j) = ranges->at(j - 1); 1890 } 1891 for (int j = cut_index + 1; j < end_index; j++) { 1892 ranges->at(j) = ranges->at(j + 1); 1893 } 1894 } 1895 1896 1897 // Unicode case. Split the search space into kSize spaces that are handled 1898 // with recursion. 1899 static void SplitSearchSpace(ZoneList<int>* ranges, 1900 int start_index, 1901 int end_index, 1902 int* new_start_index, 1903 int* new_end_index, 1904 int* border) { 1905 static const int kSize = RegExpMacroAssembler::kTableSize; 1906 static const int kMask = RegExpMacroAssembler::kTableMask; 1907 1908 int first = ranges->at(start_index); 1909 int last = ranges->at(end_index) - 1; 1910 1911 *new_start_index = start_index; 1912 *border = (ranges->at(start_index) & ~kMask) + kSize; 1913 while (*new_start_index < end_index) { 1914 if (ranges->at(*new_start_index) > *border) break; 1915 (*new_start_index)++; 1916 } 1917 // new_start_index is the index of the first edge that is beyond the 1918 // current kSize space. 1919 1920 // For very large search spaces we do a binary chop search of the non-ASCII 1921 // space instead of just going to the end of the current kSize space. The 1922 // heuristics are complicated a little by the fact that any 128-character 1923 // encoding space can be quickly tested with a table lookup, so we don't 1924 // wish to do binary chop search at a smaller granularity than that. A 1925 // 128-character space can take up a lot of space in the ranges array if, 1926 // for example, we only want to match every second character (eg. the lower 1927 // case characters on some Unicode pages). 1928 int binary_chop_index = (end_index + start_index) / 2; 1929 // The first test ensures that we get to the code that handles the ASCII 1930 // range with a single not-taken branch, speeding up this important 1931 // character range (even non-ASCII charset-based text has spaces and 1932 // punctuation). 1933 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. 1934 end_index - start_index > (*new_start_index - start_index) * 2 && 1935 last - first > kSize * 2 && 1936 binary_chop_index > *new_start_index && 1937 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1938 int scan_forward_for_section_border = binary_chop_index;; 1939 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1940 1941 while (scan_forward_for_section_border < end_index) { 1942 if (ranges->at(scan_forward_for_section_border) > new_border) { 1943 *new_start_index = scan_forward_for_section_border; 1944 *border = new_border; 1945 break; 1946 } 1947 scan_forward_for_section_border++; 1948 } 1949 } 1950 1951 ASSERT(*new_start_index > start_index); 1952 *new_end_index = *new_start_index - 1; 1953 if (ranges->at(*new_end_index) == *border) { 1954 (*new_end_index)--; 1955 } 1956 if (*border >= ranges->at(end_index)) { 1957 *border = ranges->at(end_index); 1958 *new_start_index = end_index; // Won't be used. 1959 *new_end_index = end_index - 1; 1960 } 1961 } 1962 1963 1964 // Gets a series of segment boundaries representing a character class. If the 1965 // character is in the range between an even and an odd boundary (counting from 1966 // start_index) then go to even_label, otherwise go to odd_label. We already 1967 // know that the character is in the range of min_char to max_char inclusive. 1968 // Either label can be NULL indicating backtracking. Either label can also be 1969 // equal to the fall_through label. 1970 static void GenerateBranches(RegExpMacroAssembler* masm, 1971 ZoneList<int>* ranges, 1972 int start_index, 1973 int end_index, 1974 uc16 min_char, 1975 uc16 max_char, 1976 Label* fall_through, 1977 Label* even_label, 1978 Label* odd_label) { 1979 int first = ranges->at(start_index); 1980 int last = ranges->at(end_index) - 1; 1981 1982 ASSERT_LT(min_char, first); 1983 1984 // Just need to test if the character is before or on-or-after 1985 // a particular character. 1986 if (start_index == end_index) { 1987 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1988 return; 1989 } 1990 1991 // Another almost trivial case: There is one interval in the middle that is 1992 // different from the end intervals. 1993 if (start_index + 1 == end_index) { 1994 EmitDoubleBoundaryTest( 1995 masm, first, last, fall_through, even_label, odd_label); 1996 return; 1997 } 1998 1999 // It's not worth using table lookup if there are very few intervals in the 2000 // character class. 2001 if (end_index - start_index <= 6) { 2002 // It is faster to test for individual characters, so we look for those 2003 // first, then try arbitrary ranges in the second round. 2004 static int kNoCutIndex = -1; 2005 int cut = kNoCutIndex; 2006 for (int i = start_index; i < end_index; i++) { 2007 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2008 cut = i; 2009 break; 2010 } 2011 } 2012 if (cut == kNoCutIndex) cut = start_index; 2013 CutOutRange( 2014 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2015 ASSERT_GE(end_index - start_index, 2); 2016 GenerateBranches(masm, 2017 ranges, 2018 start_index + 1, 2019 end_index - 1, 2020 min_char, 2021 max_char, 2022 fall_through, 2023 even_label, 2024 odd_label); 2025 return; 2026 } 2027 2028 // If there are a lot of intervals in the regexp, then we will use tables to 2029 // determine whether the character is inside or outside the character class. 2030 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2031 2032 if ((max_char >> kBits) == (min_char >> kBits)) { 2033 EmitUseLookupTable(masm, 2034 ranges, 2035 start_index, 2036 end_index, 2037 min_char, 2038 fall_through, 2039 even_label, 2040 odd_label); 2041 return; 2042 } 2043 2044 if ((min_char >> kBits) != (first >> kBits)) { 2045 masm->CheckCharacterLT(first, odd_label); 2046 GenerateBranches(masm, 2047 ranges, 2048 start_index + 1, 2049 end_index, 2050 first, 2051 max_char, 2052 fall_through, 2053 odd_label, 2054 even_label); 2055 return; 2056 } 2057 2058 int new_start_index = 0; 2059 int new_end_index = 0; 2060 int border = 0; 2061 2062 SplitSearchSpace(ranges, 2063 start_index, 2064 end_index, 2065 &new_start_index, 2066 &new_end_index, 2067 &border); 2068 2069 Label handle_rest; 2070 Label* above = &handle_rest; 2071 if (border == last + 1) { 2072 // We didn't find any section that started after the limit, so everything 2073 // above the border is one of the terminal labels. 2074 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2075 ASSERT(new_end_index == end_index - 1); 2076 } 2077 2078 ASSERT_LE(start_index, new_end_index); 2079 ASSERT_LE(new_start_index, end_index); 2080 ASSERT_LT(start_index, new_start_index); 2081 ASSERT_LT(new_end_index, end_index); 2082 ASSERT(new_end_index + 1 == new_start_index || 2083 (new_end_index + 2 == new_start_index && 2084 border == ranges->at(new_end_index + 1))); 2085 ASSERT_LT(min_char, border - 1); 2086 ASSERT_LT(border, max_char); 2087 ASSERT_LT(ranges->at(new_end_index), border); 2088 ASSERT(border < ranges->at(new_start_index) || 2089 (border == ranges->at(new_start_index) && 2090 new_start_index == end_index && 2091 new_end_index == end_index - 1 && 2092 border == last + 1)); 2093 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2094 2095 masm->CheckCharacterGT(border - 1, above); 2096 Label dummy; 2097 GenerateBranches(masm, 2098 ranges, 2099 start_index, 2100 new_end_index, 2101 min_char, 2102 border - 1, 2103 &dummy, 2104 even_label, 2105 odd_label); 2106 if (handle_rest.is_linked()) { 2107 masm->Bind(&handle_rest); 2108 bool flip = (new_start_index & 1) != (start_index & 1); 2109 GenerateBranches(masm, 2110 ranges, 2111 new_start_index, 2112 end_index, 2113 border, 2114 max_char, 2115 &dummy, 2116 flip ? odd_label : even_label, 2117 flip ? even_label : odd_label); 2118 } 2119 } 2120 2121 2122 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2123 RegExpCharacterClass* cc, 2124 bool ascii, 2125 Label* on_failure, 2126 int cp_offset, 2127 bool check_offset, 2128 bool preloaded, 2129 Zone* zone) { 2130 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2131 if (!CharacterRange::IsCanonical(ranges)) { 2132 CharacterRange::Canonicalize(ranges); 2133 } 2134 2135 int max_char; 2136 if (ascii) { 2137 max_char = String::kMaxOneByteCharCode; 2138 } else { 2139 max_char = String::kMaxUtf16CodeUnit; 2140 } 2141 2142 int range_count = ranges->length(); 2143 2144 int last_valid_range = range_count - 1; 2145 while (last_valid_range >= 0) { 2146 CharacterRange& range = ranges->at(last_valid_range); 2147 if (range.from() <= max_char) { 2148 break; 2149 } 2150 last_valid_range--; 2151 } 2152 2153 if (last_valid_range < 0) { 2154 if (!cc->is_negated()) { 2155 macro_assembler->GoTo(on_failure); 2156 } 2157 if (check_offset) { 2158 macro_assembler->CheckPosition(cp_offset, on_failure); 2159 } 2160 return; 2161 } 2162 2163 if (last_valid_range == 0 && 2164 ranges->at(0).IsEverything(max_char)) { 2165 if (cc->is_negated()) { 2166 macro_assembler->GoTo(on_failure); 2167 } else { 2168 // This is a common case hit by non-anchored expressions. 2169 if (check_offset) { 2170 macro_assembler->CheckPosition(cp_offset, on_failure); 2171 } 2172 } 2173 return; 2174 } 2175 if (last_valid_range == 0 && 2176 !cc->is_negated() && 2177 ranges->at(0).IsEverything(max_char)) { 2178 // This is a common case hit by non-anchored expressions. 2179 if (check_offset) { 2180 macro_assembler->CheckPosition(cp_offset, on_failure); 2181 } 2182 return; 2183 } 2184 2185 if (!preloaded) { 2186 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2187 } 2188 2189 if (cc->is_standard(zone) && 2190 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2191 on_failure)) { 2192 return; 2193 } 2194 2195 2196 // A new list with ascending entries. Each entry is a code unit 2197 // where there is a boundary between code units that are part of 2198 // the class and code units that are not. Normally we insert an 2199 // entry at zero which goes to the failure label, but if there 2200 // was already one there we fall through for success on that entry. 2201 // Subsequent entries have alternating meaning (success/failure). 2202 ZoneList<int>* range_boundaries = 2203 new(zone) ZoneList<int>(last_valid_range, zone); 2204 2205 bool zeroth_entry_is_failure = !cc->is_negated(); 2206 2207 for (int i = 0; i <= last_valid_range; i++) { 2208 CharacterRange& range = ranges->at(i); 2209 if (range.from() == 0) { 2210 ASSERT_EQ(i, 0); 2211 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2212 } else { 2213 range_boundaries->Add(range.from(), zone); 2214 } 2215 range_boundaries->Add(range.to() + 1, zone); 2216 } 2217 int end_index = range_boundaries->length() - 1; 2218 if (range_boundaries->at(end_index) > max_char) { 2219 end_index--; 2220 } 2221 2222 Label fall_through; 2223 GenerateBranches(macro_assembler, 2224 range_boundaries, 2225 0, // start_index. 2226 end_index, 2227 0, // min_char. 2228 max_char, 2229 &fall_through, 2230 zeroth_entry_is_failure ? &fall_through : on_failure, 2231 zeroth_entry_is_failure ? on_failure : &fall_through); 2232 macro_assembler->Bind(&fall_through); 2233 } 2234 2235 2236 RegExpNode::~RegExpNode() { 2237 } 2238 2239 2240 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2241 Trace* trace) { 2242 // If we are generating a greedy loop then don't stop and don't reuse code. 2243 if (trace->stop_node() != NULL) { 2244 return CONTINUE; 2245 } 2246 2247 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2248 if (trace->is_trivial()) { 2249 if (label_.is_bound()) { 2250 // We are being asked to generate a generic version, but that's already 2251 // been done so just go to it. 2252 macro_assembler->GoTo(&label_); 2253 return DONE; 2254 } 2255 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 2256 // To avoid too deep recursion we push the node to the work queue and just 2257 // generate a goto here. 2258 compiler->AddWork(this); 2259 macro_assembler->GoTo(&label_); 2260 return DONE; 2261 } 2262 // Generate generic version of the node and bind the label for later use. 2263 macro_assembler->Bind(&label_); 2264 return CONTINUE; 2265 } 2266 2267 // We are being asked to make a non-generic version. Keep track of how many 2268 // non-generic versions we generate so as not to overdo it. 2269 trace_count_++; 2270 if (FLAG_regexp_optimization && 2271 trace_count_ < kMaxCopiesCodeGenerated && 2272 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 2273 return CONTINUE; 2274 } 2275 2276 // If we get here code has been generated for this node too many times or 2277 // recursion is too deep. Time to switch to a generic version. The code for 2278 // generic versions above can handle deep recursion properly. 2279 trace->Flush(compiler, this); 2280 return DONE; 2281 } 2282 2283 2284 int ActionNode::EatsAtLeast(int still_to_find, 2285 int budget, 2286 bool not_at_start) { 2287 if (budget <= 0) return 0; 2288 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2289 return on_success()->EatsAtLeast(still_to_find, 2290 budget - 1, 2291 not_at_start); 2292 } 2293 2294 2295 void ActionNode::FillInBMInfo(int offset, 2296 int budget, 2297 BoyerMooreLookahead* bm, 2298 bool not_at_start) { 2299 if (action_type_ == BEGIN_SUBMATCH) { 2300 bm->SetRest(offset); 2301 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2302 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2303 } 2304 SaveBMInfo(bm, not_at_start, offset); 2305 } 2306 2307 2308 int AssertionNode::EatsAtLeast(int still_to_find, 2309 int budget, 2310 bool not_at_start) { 2311 if (budget <= 0) return 0; 2312 // If we know we are not at the start and we are asked "how many characters 2313 // will you match if you succeed?" then we can answer anything since false 2314 // implies false. So lets just return the max answer (still_to_find) since 2315 // that won't prevent us from preloading a lot of characters for the other 2316 // branches in the node graph. 2317 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2318 return on_success()->EatsAtLeast(still_to_find, 2319 budget - 1, 2320 not_at_start); 2321 } 2322 2323 2324 void AssertionNode::FillInBMInfo(int offset, 2325 int budget, 2326 BoyerMooreLookahead* bm, 2327 bool not_at_start) { 2328 // Match the behaviour of EatsAtLeast on this node. 2329 if (assertion_type() == AT_START && not_at_start) return; 2330 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2331 SaveBMInfo(bm, not_at_start, offset); 2332 } 2333 2334 2335 int BackReferenceNode::EatsAtLeast(int still_to_find, 2336 int budget, 2337 bool not_at_start) { 2338 if (budget <= 0) return 0; 2339 return on_success()->EatsAtLeast(still_to_find, 2340 budget - 1, 2341 not_at_start); 2342 } 2343 2344 2345 int TextNode::EatsAtLeast(int still_to_find, 2346 int budget, 2347 bool not_at_start) { 2348 int answer = Length(); 2349 if (answer >= still_to_find) return answer; 2350 if (budget <= 0) return answer; 2351 // We are not at start after this node so we set the last argument to 'true'. 2352 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2353 budget - 1, 2354 true); 2355 } 2356 2357 2358 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 2359 int budget, 2360 bool not_at_start) { 2361 if (budget <= 0) return 0; 2362 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2363 // afterwards. 2364 RegExpNode* node = alternatives_->at(1).node(); 2365 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2366 } 2367 2368 2369 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 2370 QuickCheckDetails* details, 2371 RegExpCompiler* compiler, 2372 int filled_in, 2373 bool not_at_start) { 2374 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2375 // afterwards. 2376 RegExpNode* node = alternatives_->at(1).node(); 2377 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2378 } 2379 2380 2381 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2382 int budget, 2383 RegExpNode* ignore_this_node, 2384 bool not_at_start) { 2385 if (budget <= 0) return 0; 2386 int min = 100; 2387 int choice_count = alternatives_->length(); 2388 budget = (budget - 1) / choice_count; 2389 for (int i = 0; i < choice_count; i++) { 2390 RegExpNode* node = alternatives_->at(i).node(); 2391 if (node == ignore_this_node) continue; 2392 int node_eats_at_least = 2393 node->EatsAtLeast(still_to_find, budget, not_at_start); 2394 if (node_eats_at_least < min) min = node_eats_at_least; 2395 if (min == 0) return 0; 2396 } 2397 return min; 2398 } 2399 2400 2401 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2402 int budget, 2403 bool not_at_start) { 2404 return EatsAtLeastHelper(still_to_find, 2405 budget - 1, 2406 loop_node_, 2407 not_at_start); 2408 } 2409 2410 2411 int ChoiceNode::EatsAtLeast(int still_to_find, 2412 int budget, 2413 bool not_at_start) { 2414 return EatsAtLeastHelper(still_to_find, 2415 budget, 2416 NULL, 2417 not_at_start); 2418 } 2419 2420 2421 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2422 static inline uint32_t SmearBitsRight(uint32_t v) { 2423 v |= v >> 1; 2424 v |= v >> 2; 2425 v |= v >> 4; 2426 v |= v >> 8; 2427 v |= v >> 16; 2428 return v; 2429 } 2430 2431 2432 bool QuickCheckDetails::Rationalize(bool asc) { 2433 bool found_useful_op = false; 2434 uint32_t char_mask; 2435 if (asc) { 2436 char_mask = String::kMaxOneByteCharCode; 2437 } else { 2438 char_mask = String::kMaxUtf16CodeUnit; 2439 } 2440 mask_ = 0; 2441 value_ = 0; 2442 int char_shift = 0; 2443 for (int i = 0; i < characters_; i++) { 2444 Position* pos = &positions_[i]; 2445 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2446 found_useful_op = true; 2447 } 2448 mask_ |= (pos->mask & char_mask) << char_shift; 2449 value_ |= (pos->value & char_mask) << char_shift; 2450 char_shift += asc ? 8 : 16; 2451 } 2452 return found_useful_op; 2453 } 2454 2455 2456 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2457 Trace* trace, 2458 bool preload_has_checked_bounds, 2459 Label* on_possible_success, 2460 QuickCheckDetails* details, 2461 bool fall_through_on_failure) { 2462 if (details->characters() == 0) return false; 2463 GetQuickCheckDetails( 2464 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2465 if (details->cannot_match()) return false; 2466 if (!details->Rationalize(compiler->ascii())) return false; 2467 ASSERT(details->characters() == 1 || 2468 compiler->macro_assembler()->CanReadUnaligned()); 2469 uint32_t mask = details->mask(); 2470 uint32_t value = details->value(); 2471 2472 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2473 2474 if (trace->characters_preloaded() != details->characters()) { 2475 assembler->LoadCurrentCharacter(trace->cp_offset(), 2476 trace->backtrack(), 2477 !preload_has_checked_bounds, 2478 details->characters()); 2479 } 2480 2481 2482 bool need_mask = true; 2483 2484 if (details->characters() == 1) { 2485 // If number of characters preloaded is 1 then we used a byte or 16 bit 2486 // load so the value is already masked down. 2487 uint32_t char_mask; 2488 if (compiler->ascii()) { 2489 char_mask = String::kMaxOneByteCharCode; 2490 } else { 2491 char_mask = String::kMaxUtf16CodeUnit; 2492 } 2493 if ((mask & char_mask) == char_mask) need_mask = false; 2494 mask &= char_mask; 2495 } else { 2496 // For 2-character preloads in ASCII mode or 1-character preloads in 2497 // TWO_BYTE mode we also use a 16 bit load with zero extend. 2498 if (details->characters() == 2 && compiler->ascii()) { 2499 if ((mask & 0xffff) == 0xffff) need_mask = false; 2500 } else if (details->characters() == 1 && !compiler->ascii()) { 2501 if ((mask & 0xffff) == 0xffff) need_mask = false; 2502 } else { 2503 if (mask == 0xffffffff) need_mask = false; 2504 } 2505 } 2506 2507 if (fall_through_on_failure) { 2508 if (need_mask) { 2509 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2510 } else { 2511 assembler->CheckCharacter(value, on_possible_success); 2512 } 2513 } else { 2514 if (need_mask) { 2515 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2516 } else { 2517 assembler->CheckNotCharacter(value, trace->backtrack()); 2518 } 2519 } 2520 return true; 2521 } 2522 2523 2524 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2525 // super-class in the .h file). 2526 // 2527 // We iterate along the text object, building up for each character a 2528 // mask and value that can be used to test for a quick failure to match. 2529 // The masks and values for the positions will be combined into a single 2530 // machine word for the current character width in order to be used in 2531 // generating a quick check. 2532 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2533 RegExpCompiler* compiler, 2534 int characters_filled_in, 2535 bool not_at_start) { 2536 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); 2537 ASSERT(characters_filled_in < details->characters()); 2538 int characters = details->characters(); 2539 int char_mask; 2540 if (compiler->ascii()) { 2541 char_mask = String::kMaxOneByteCharCode; 2542 } else { 2543 char_mask = String::kMaxUtf16CodeUnit; 2544 } 2545 for (int k = 0; k < elms_->length(); k++) { 2546 TextElement elm = elms_->at(k); 2547 if (elm.text_type() == TextElement::ATOM) { 2548 Vector<const uc16> quarks = elm.atom()->data(); 2549 for (int i = 0; i < characters && i < quarks.length(); i++) { 2550 QuickCheckDetails::Position* pos = 2551 details->positions(characters_filled_in); 2552 uc16 c = quarks[i]; 2553 if (c > char_mask) { 2554 // If we expect a non-ASCII character from an ASCII string, 2555 // there is no way we can match. Not even case independent 2556 // matching can turn an ASCII character into non-ASCII or 2557 // vice versa. 2558 details->set_cannot_match(); 2559 pos->determines_perfectly = false; 2560 return; 2561 } 2562 if (compiler->ignore_case()) { 2563 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2564 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), 2565 chars); 2566 ASSERT(length != 0); // Can only happen if c > char_mask (see above). 2567 if (length == 1) { 2568 // This letter has no case equivalents, so it's nice and simple 2569 // and the mask-compare will determine definitely whether we have 2570 // a match at this character position. 2571 pos->mask = char_mask; 2572 pos->value = c; 2573 pos->determines_perfectly = true; 2574 } else { 2575 uint32_t common_bits = char_mask; 2576 uint32_t bits = chars[0]; 2577 for (int j = 1; j < length; j++) { 2578 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2579 common_bits ^= differing_bits; 2580 bits &= common_bits; 2581 } 2582 // If length is 2 and common bits has only one zero in it then 2583 // our mask and compare instruction will determine definitely 2584 // whether we have a match at this character position. Otherwise 2585 // it can only be an approximate check. 2586 uint32_t one_zero = (common_bits | ~char_mask); 2587 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2588 pos->determines_perfectly = true; 2589 } 2590 pos->mask = common_bits; 2591 pos->value = bits; 2592 } 2593 } else { 2594 // Don't ignore case. Nice simple case where the mask-compare will 2595 // determine definitely whether we have a match at this character 2596 // position. 2597 pos->mask = char_mask; 2598 pos->value = c; 2599 pos->determines_perfectly = true; 2600 } 2601 characters_filled_in++; 2602 ASSERT(characters_filled_in <= details->characters()); 2603 if (characters_filled_in == details->characters()) { 2604 return; 2605 } 2606 } 2607 } else { 2608 QuickCheckDetails::Position* pos = 2609 details->positions(characters_filled_in); 2610 RegExpCharacterClass* tree = elm.char_class(); 2611 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2612 if (tree->is_negated()) { 2613 // A quick check uses multi-character mask and compare. There is no 2614 // useful way to incorporate a negative char class into this scheme 2615 // so we just conservatively create a mask and value that will always 2616 // succeed. 2617 pos->mask = 0; 2618 pos->value = 0; 2619 } else { 2620 int first_range = 0; 2621 while (ranges->at(first_range).from() > char_mask) { 2622 first_range++; 2623 if (first_range == ranges->length()) { 2624 details->set_cannot_match(); 2625 pos->determines_perfectly = false; 2626 return; 2627 } 2628 } 2629 CharacterRange range = ranges->at(first_range); 2630 uc16 from = range.from(); 2631 uc16 to = range.to(); 2632 if (to > char_mask) { 2633 to = char_mask; 2634 } 2635 uint32_t differing_bits = (from ^ to); 2636 // A mask and compare is only perfect if the differing bits form a 2637 // number like 00011111 with one single block of trailing 1s. 2638 if ((differing_bits & (differing_bits + 1)) == 0 && 2639 from + differing_bits == to) { 2640 pos->determines_perfectly = true; 2641 } 2642 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2643 uint32_t bits = (from & common_bits); 2644 for (int i = first_range + 1; i < ranges->length(); i++) { 2645 CharacterRange range = ranges->at(i); 2646 uc16 from = range.from(); 2647 uc16 to = range.to(); 2648 if (from > char_mask) continue; 2649 if (to > char_mask) to = char_mask; 2650 // Here we are combining more ranges into the mask and compare 2651 // value. With each new range the mask becomes more sparse and 2652 // so the chances of a false positive rise. A character class 2653 // with multiple ranges is assumed never to be equivalent to a 2654 // mask and compare operation. 2655 pos->determines_perfectly = false; 2656 uint32_t new_common_bits = (from ^ to); 2657 new_common_bits = ~SmearBitsRight(new_common_bits); 2658 common_bits &= new_common_bits; 2659 bits &= new_common_bits; 2660 uint32_t differing_bits = (from & common_bits) ^ bits; 2661 common_bits ^= differing_bits; 2662 bits &= common_bits; 2663 } 2664 pos->mask = common_bits; 2665 pos->value = bits; 2666 } 2667 characters_filled_in++; 2668 ASSERT(characters_filled_in <= details->characters()); 2669 if (characters_filled_in == details->characters()) { 2670 return; 2671 } 2672 } 2673 } 2674 ASSERT(characters_filled_in != details->characters()); 2675 if (!details->cannot_match()) { 2676 on_success()-> GetQuickCheckDetails(details, 2677 compiler, 2678 characters_filled_in, 2679 true); 2680 } 2681 } 2682 2683 2684 void QuickCheckDetails::Clear() { 2685 for (int i = 0; i < characters_; i++) { 2686 positions_[i].mask = 0; 2687 positions_[i].value = 0; 2688 positions_[i].determines_perfectly = false; 2689 } 2690 characters_ = 0; 2691 } 2692 2693 2694 void QuickCheckDetails::Advance(int by, bool ascii) { 2695 ASSERT(by >= 0); 2696 if (by >= characters_) { 2697 Clear(); 2698 return; 2699 } 2700 for (int i = 0; i < characters_ - by; i++) { 2701 positions_[i] = positions_[by + i]; 2702 } 2703 for (int i = characters_ - by; i < characters_; i++) { 2704 positions_[i].mask = 0; 2705 positions_[i].value = 0; 2706 positions_[i].determines_perfectly = false; 2707 } 2708 characters_ -= by; 2709 // We could change mask_ and value_ here but we would never advance unless 2710 // they had already been used in a check and they won't be used again because 2711 // it would gain us nothing. So there's no point. 2712 } 2713 2714 2715 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2716 ASSERT(characters_ == other->characters_); 2717 if (other->cannot_match_) { 2718 return; 2719 } 2720 if (cannot_match_) { 2721 *this = *other; 2722 return; 2723 } 2724 for (int i = from_index; i < characters_; i++) { 2725 QuickCheckDetails::Position* pos = positions(i); 2726 QuickCheckDetails::Position* other_pos = other->positions(i); 2727 if (pos->mask != other_pos->mask || 2728 pos->value != other_pos->value || 2729 !other_pos->determines_perfectly) { 2730 // Our mask-compare operation will be approximate unless we have the 2731 // exact same operation on both sides of the alternation. 2732 pos->determines_perfectly = false; 2733 } 2734 pos->mask &= other_pos->mask; 2735 pos->value &= pos->mask; 2736 other_pos->value &= pos->mask; 2737 uc16 differing_bits = (pos->value ^ other_pos->value); 2738 pos->mask &= ~differing_bits; 2739 pos->value &= pos->mask; 2740 } 2741 } 2742 2743 2744 class VisitMarker { 2745 public: 2746 explicit VisitMarker(NodeInfo* info) : info_(info) { 2747 ASSERT(!info->visited); 2748 info->visited = true; 2749 } 2750 ~VisitMarker() { 2751 info_->visited = false; 2752 } 2753 private: 2754 NodeInfo* info_; 2755 }; 2756 2757 2758 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { 2759 if (info()->replacement_calculated) return replacement(); 2760 if (depth < 0) return this; 2761 ASSERT(!info()->visited); 2762 VisitMarker marker(info()); 2763 return FilterSuccessor(depth - 1, ignore_case); 2764 } 2765 2766 2767 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2768 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); 2769 if (next == NULL) return set_replacement(NULL); 2770 on_success_ = next; 2771 return set_replacement(this); 2772 } 2773 2774 2775 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2776 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2777 // TODO(dcarney): this could be a lot more efficient. 2778 return range.Contains(0x39c) || 2779 range.Contains(0x3bc) || range.Contains(0x178); 2780 } 2781 2782 2783 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2784 for (int i = 0; i < ranges->length(); i++) { 2785 // TODO(dcarney): this could be a lot more efficient. 2786 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2787 } 2788 return false; 2789 } 2790 2791 2792 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { 2793 if (info()->replacement_calculated) return replacement(); 2794 if (depth < 0) return this; 2795 ASSERT(!info()->visited); 2796 VisitMarker marker(info()); 2797 int element_count = elms_->length(); 2798 for (int i = 0; i < element_count; i++) { 2799 TextElement elm = elms_->at(i); 2800 if (elm.text_type() == TextElement::ATOM) { 2801 Vector<const uc16> quarks = elm.atom()->data(); 2802 for (int j = 0; j < quarks.length(); j++) { 2803 uint16_t c = quarks[j]; 2804 if (c <= String::kMaxOneByteCharCode) continue; 2805 if (!ignore_case) return set_replacement(NULL); 2806 // Here, we need to check for characters whose upper and lower cases 2807 // are outside the Latin-1 range. 2808 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2809 // Character is outside Latin-1 completely 2810 if (converted == 0) return set_replacement(NULL); 2811 // Convert quark to Latin-1 in place. 2812 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2813 copy[j] = converted; 2814 } 2815 } else { 2816 ASSERT(elm.text_type() == TextElement::CHAR_CLASS); 2817 RegExpCharacterClass* cc = elm.char_class(); 2818 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2819 if (!CharacterRange::IsCanonical(ranges)) { 2820 CharacterRange::Canonicalize(ranges); 2821 } 2822 // Now they are in order so we only need to look at the first. 2823 int range_count = ranges->length(); 2824 if (cc->is_negated()) { 2825 if (range_count != 0 && 2826 ranges->at(0).from() == 0 && 2827 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2828 // This will be handled in a later filter. 2829 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2830 return set_replacement(NULL); 2831 } 2832 } else { 2833 if (range_count == 0 || 2834 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2835 // This will be handled in a later filter. 2836 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2837 return set_replacement(NULL); 2838 } 2839 } 2840 } 2841 } 2842 return FilterSuccessor(depth - 1, ignore_case); 2843 } 2844 2845 2846 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { 2847 if (info()->replacement_calculated) return replacement(); 2848 if (depth < 0) return this; 2849 if (info()->visited) return this; 2850 { 2851 VisitMarker marker(info()); 2852 2853 RegExpNode* continue_replacement = 2854 continue_node_->FilterASCII(depth - 1, ignore_case); 2855 // If we can't continue after the loop then there is no sense in doing the 2856 // loop. 2857 if (continue_replacement == NULL) return set_replacement(NULL); 2858 } 2859 2860 return ChoiceNode::FilterASCII(depth - 1, ignore_case); 2861 } 2862 2863 2864 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { 2865 if (info()->replacement_calculated) return replacement(); 2866 if (depth < 0) return this; 2867 if (info()->visited) return this; 2868 VisitMarker marker(info()); 2869 int choice_count = alternatives_->length(); 2870 2871 for (int i = 0; i < choice_count; i++) { 2872 GuardedAlternative alternative = alternatives_->at(i); 2873 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2874 set_replacement(this); 2875 return this; 2876 } 2877 } 2878 2879 int surviving = 0; 2880 RegExpNode* survivor = NULL; 2881 for (int i = 0; i < choice_count; i++) { 2882 GuardedAlternative alternative = alternatives_->at(i); 2883 RegExpNode* replacement = 2884 alternative.node()->FilterASCII(depth - 1, ignore_case); 2885 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. 2886 if (replacement != NULL) { 2887 alternatives_->at(i).set_node(replacement); 2888 surviving++; 2889 survivor = replacement; 2890 } 2891 } 2892 if (surviving < 2) return set_replacement(survivor); 2893 2894 set_replacement(this); 2895 if (surviving == choice_count) { 2896 return this; 2897 } 2898 // Only some of the nodes survived the filtering. We need to rebuild the 2899 // alternatives list. 2900 ZoneList<GuardedAlternative>* new_alternatives = 2901 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2902 for (int i = 0; i < choice_count; i++) { 2903 RegExpNode* replacement = 2904 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); 2905 if (replacement != NULL) { 2906 alternatives_->at(i).set_node(replacement); 2907 new_alternatives->Add(alternatives_->at(i), zone()); 2908 } 2909 } 2910 alternatives_ = new_alternatives; 2911 return this; 2912 } 2913 2914 2915 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, 2916 bool ignore_case) { 2917 if (info()->replacement_calculated) return replacement(); 2918 if (depth < 0) return this; 2919 if (info()->visited) return this; 2920 VisitMarker marker(info()); 2921 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2922 // afterwards. 2923 RegExpNode* node = alternatives_->at(1).node(); 2924 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); 2925 if (replacement == NULL) return set_replacement(NULL); 2926 alternatives_->at(1).set_node(replacement); 2927 2928 RegExpNode* neg_node = alternatives_->at(0).node(); 2929 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); 2930 // If the negative lookahead is always going to fail then 2931 // we don't need to check it. 2932 if (neg_replacement == NULL) return set_replacement(replacement); 2933 alternatives_->at(0).set_node(neg_replacement); 2934 return set_replacement(this); 2935 } 2936 2937 2938 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2939 RegExpCompiler* compiler, 2940 int characters_filled_in, 2941 bool not_at_start) { 2942 if (body_can_be_zero_length_ || info()->visited) return; 2943 VisitMarker marker(info()); 2944 return ChoiceNode::GetQuickCheckDetails(details, 2945 compiler, 2946 characters_filled_in, 2947 not_at_start); 2948 } 2949 2950 2951 void LoopChoiceNode::FillInBMInfo(int offset, 2952 int budget, 2953 BoyerMooreLookahead* bm, 2954 bool not_at_start) { 2955 if (body_can_be_zero_length_ || budget <= 0) { 2956 bm->SetRest(offset); 2957 SaveBMInfo(bm, not_at_start, offset); 2958 return; 2959 } 2960 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start); 2961 SaveBMInfo(bm, not_at_start, offset); 2962 } 2963 2964 2965 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2966 RegExpCompiler* compiler, 2967 int characters_filled_in, 2968 bool not_at_start) { 2969 not_at_start = (not_at_start || not_at_start_); 2970 int choice_count = alternatives_->length(); 2971 ASSERT(choice_count > 0); 2972 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2973 compiler, 2974 characters_filled_in, 2975 not_at_start); 2976 for (int i = 1; i < choice_count; i++) { 2977 QuickCheckDetails new_details(details->characters()); 2978 RegExpNode* node = alternatives_->at(i).node(); 2979 node->GetQuickCheckDetails(&new_details, compiler, 2980 characters_filled_in, 2981 not_at_start); 2982 // Here we merge the quick match details of the two branches. 2983 details->Merge(&new_details, characters_filled_in); 2984 } 2985 } 2986 2987 2988 // Check for [0-9A-Z_a-z]. 2989 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2990 Label* word, 2991 Label* non_word, 2992 bool fall_through_on_word) { 2993 if (assembler->CheckSpecialCharacterClass( 2994 fall_through_on_word ? 'w' : 'W', 2995 fall_through_on_word ? non_word : word)) { 2996 // Optimized implementation available. 2997 return; 2998 } 2999 assembler->CheckCharacterGT('z', non_word); 3000 assembler->CheckCharacterLT('0', non_word); 3001 assembler->CheckCharacterGT('a' - 1, word); 3002 assembler->CheckCharacterLT('9' + 1, word); 3003 assembler->CheckCharacterLT('A', non_word); 3004 assembler->CheckCharacterLT('Z' + 1, word); 3005 if (fall_through_on_word) { 3006 assembler->CheckNotCharacter('_', non_word); 3007 } else { 3008 assembler->CheckCharacter('_', word); 3009 } 3010 } 3011 3012 3013 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 3014 // that matches newline or the start of input). 3015 static void EmitHat(RegExpCompiler* compiler, 3016 RegExpNode* on_success, 3017 Trace* trace) { 3018 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3019 // We will be loading the previous character into the current character 3020 // register. 3021 Trace new_trace(*trace); 3022 new_trace.InvalidateCurrentCharacter(); 3023 3024 Label ok; 3025 if (new_trace.cp_offset() == 0) { 3026 // The start of input counts as a newline in this context, so skip to 3027 // ok if we are at the start. 3028 assembler->CheckAtStart(&ok); 3029 } 3030 // We already checked that we are not at the start of input so it must be 3031 // OK to load the previous character. 3032 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3033 new_trace.backtrack(), 3034 false); 3035 if (!assembler->CheckSpecialCharacterClass('n', 3036 new_trace.backtrack())) { 3037 // Newline means \n, \r, 0x2028 or 0x2029. 3038 if (!compiler->ascii()) { 3039 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3040 } 3041 assembler->CheckCharacter('\n', &ok); 3042 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3043 } 3044 assembler->Bind(&ok); 3045 on_success->Emit(compiler, &new_trace); 3046 } 3047 3048 3049 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3050 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3051 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3052 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3053 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3054 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3055 if (lookahead == NULL) { 3056 int eats_at_least = 3057 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3058 kRecursionBudget, 3059 not_at_start)); 3060 if (eats_at_least >= 1) { 3061 BoyerMooreLookahead* bm = 3062 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3063 FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 3064 if (bm->at(0)->is_non_word()) 3065 next_is_word_character = Trace::FALSE_VALUE; 3066 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3067 } 3068 } else { 3069 if (lookahead->at(0)->is_non_word()) 3070 next_is_word_character = Trace::FALSE_VALUE; 3071 if (lookahead->at(0)->is_word()) 3072 next_is_word_character = Trace::TRUE_VALUE; 3073 } 3074 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3075 if (next_is_word_character == Trace::UNKNOWN) { 3076 Label before_non_word; 3077 Label before_word; 3078 if (trace->characters_preloaded() != 1) { 3079 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3080 } 3081 // Fall through on non-word. 3082 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3083 // Next character is not a word character. 3084 assembler->Bind(&before_non_word); 3085 Label ok; 3086 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3087 assembler->GoTo(&ok); 3088 3089 assembler->Bind(&before_word); 3090 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3091 assembler->Bind(&ok); 3092 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3094 } else { 3095 ASSERT(next_is_word_character == Trace::FALSE_VALUE); 3096 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3097 } 3098 } 3099 3100 3101 void AssertionNode::BacktrackIfPrevious( 3102 RegExpCompiler* compiler, 3103 Trace* trace, 3104 AssertionNode::IfPrevious backtrack_if_previous) { 3105 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3106 Trace new_trace(*trace); 3107 new_trace.InvalidateCurrentCharacter(); 3108 3109 Label fall_through, dummy; 3110 3111 Label* non_word = backtrack_if_previous == kIsNonWord ? 3112 new_trace.backtrack() : 3113 &fall_through; 3114 Label* word = backtrack_if_previous == kIsNonWord ? 3115 &fall_through : 3116 new_trace.backtrack(); 3117 3118 if (new_trace.cp_offset() == 0) { 3119 // The start of input counts as a non-word character, so the question is 3120 // decided if we are at the start. 3121 assembler->CheckAtStart(non_word); 3122 } 3123 // We already checked that we are not at the start of input so it must be 3124 // OK to load the previous character. 3125 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3126 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3127 3128 assembler->Bind(&fall_through); 3129 on_success()->Emit(compiler, &new_trace); 3130 } 3131 3132 3133 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3134 RegExpCompiler* compiler, 3135 int filled_in, 3136 bool not_at_start) { 3137 if (assertion_type_ == AT_START && not_at_start) { 3138 details->set_cannot_match(); 3139 return; 3140 } 3141 return on_success()->GetQuickCheckDetails(details, 3142 compiler, 3143 filled_in, 3144 not_at_start); 3145 } 3146 3147 3148 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3149 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3150 switch (assertion_type_) { 3151 case AT_END: { 3152 Label ok; 3153 assembler->CheckPosition(trace->cp_offset(), &ok); 3154 assembler->GoTo(trace->backtrack()); 3155 assembler->Bind(&ok); 3156 break; 3157 } 3158 case AT_START: { 3159 if (trace->at_start() == Trace::FALSE_VALUE) { 3160 assembler->GoTo(trace->backtrack()); 3161 return; 3162 } 3163 if (trace->at_start() == Trace::UNKNOWN) { 3164 assembler->CheckNotAtStart(trace->backtrack()); 3165 Trace at_start_trace = *trace; 3166 at_start_trace.set_at_start(true); 3167 on_success()->Emit(compiler, &at_start_trace); 3168 return; 3169 } 3170 } 3171 break; 3172 case AFTER_NEWLINE: 3173 EmitHat(compiler, on_success(), trace); 3174 return; 3175 case AT_BOUNDARY: 3176 case AT_NON_BOUNDARY: { 3177 EmitBoundaryCheck(compiler, trace); 3178 return; 3179 } 3180 } 3181 on_success()->Emit(compiler, trace); 3182 } 3183 3184 3185 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3186 if (quick_check == NULL) return false; 3187 if (offset >= quick_check->characters()) return false; 3188 return quick_check->positions(offset)->determines_perfectly; 3189 } 3190 3191 3192 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3193 if (index > *checked_up_to) { 3194 *checked_up_to = index; 3195 } 3196 } 3197 3198 3199 // We call this repeatedly to generate code for each pass over the text node. 3200 // The passes are in increasing order of difficulty because we hope one 3201 // of the first passes will fail in which case we are saved the work of the 3202 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3203 // we will check the '%' in the first pass, the case independent 'a' in the 3204 // second pass and the character class in the last pass. 3205 // 3206 // The passes are done from right to left, so for example to test for /bar/ 3207 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3208 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3209 // bounds check most of the time. In the example we only need to check for 3210 // end-of-input when loading the putative 'r'. 3211 // 3212 // A slight complication involves the fact that the first character may already 3213 // be fetched into a register by the previous node. In this case we want to 3214 // do the test for that character first. We do this in separate passes. The 3215 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3216 // pass has been performed then subsequent passes will have true in 3217 // first_element_checked to indicate that that character does not need to be 3218 // checked again. 3219 // 3220 // In addition to all this we are passed a Trace, which can 3221 // contain an AlternativeGeneration object. In this AlternativeGeneration 3222 // object we can see details of any quick check that was already passed in 3223 // order to get to the code we are now generating. The quick check can involve 3224 // loading characters, which means we do not need to recheck the bounds 3225 // up to the limit the quick check already checked. In addition the quick 3226 // check can have involved a mask and compare operation which may simplify 3227 // or obviate the need for further checks at some character positions. 3228 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3229 TextEmitPassType pass, 3230 bool preloaded, 3231 Trace* trace, 3232 bool first_element_checked, 3233 int* checked_up_to) { 3234 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3235 Isolate* isolate = assembler->zone()->isolate(); 3236 bool ascii = compiler->ascii(); 3237 Label* backtrack = trace->backtrack(); 3238 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3239 int element_count = elms_->length(); 3240 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3241 TextElement elm = elms_->at(i); 3242 int cp_offset = trace->cp_offset() + elm.cp_offset(); 3243 if (elm.text_type() == TextElement::ATOM) { 3244 Vector<const uc16> quarks = elm.atom()->data(); 3245 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3246 if (first_element_checked && i == 0 && j == 0) continue; 3247 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3248 EmitCharacterFunction* emit_function = NULL; 3249 switch (pass) { 3250 case NON_ASCII_MATCH: 3251 ASSERT(ascii); 3252 if (quarks[j] > String::kMaxOneByteCharCode) { 3253 assembler->GoTo(backtrack); 3254 return; 3255 } 3256 break; 3257 case NON_LETTER_CHARACTER_MATCH: 3258 emit_function = &EmitAtomNonLetter; 3259 break; 3260 case SIMPLE_CHARACTER_MATCH: 3261 emit_function = &EmitSimpleCharacter; 3262 break; 3263 case CASE_CHARACTER_MATCH: 3264 emit_function = &EmitAtomLetter; 3265 break; 3266 default: 3267 break; 3268 } 3269 if (emit_function != NULL) { 3270 bool bound_checked = emit_function(isolate, 3271 compiler, 3272 quarks[j], 3273 backtrack, 3274 cp_offset + j, 3275 *checked_up_to < cp_offset + j, 3276 preloaded); 3277 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3278 } 3279 } 3280 } else { 3281 ASSERT_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3282 if (pass == CHARACTER_CLASS_MATCH) { 3283 if (first_element_checked && i == 0) continue; 3284 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3285 RegExpCharacterClass* cc = elm.char_class(); 3286 EmitCharClass(assembler, 3287 cc, 3288 ascii, 3289 backtrack, 3290 cp_offset, 3291 *checked_up_to < cp_offset, 3292 preloaded, 3293 zone()); 3294 UpdateBoundsCheck(cp_offset, checked_up_to); 3295 } 3296 } 3297 } 3298 } 3299 3300 3301 int TextNode::Length() { 3302 TextElement elm = elms_->last(); 3303 ASSERT(elm.cp_offset() >= 0); 3304 return elm.cp_offset() + elm.length(); 3305 } 3306 3307 3308 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3309 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3310 if (ignore_case) { 3311 return pass == SIMPLE_CHARACTER_MATCH; 3312 } else { 3313 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3314 } 3315 } 3316 3317 3318 // This generates the code to match a text node. A text node can contain 3319 // straight character sequences (possibly to be matched in a case-independent 3320 // way) and character classes. For efficiency we do not do this in a single 3321 // pass from left to right. Instead we pass over the text node several times, 3322 // emitting code for some character positions every time. See the comment on 3323 // TextEmitPass for details. 3324 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3325 LimitResult limit_result = LimitVersions(compiler, trace); 3326 if (limit_result == DONE) return; 3327 ASSERT(limit_result == CONTINUE); 3328 3329 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3330 compiler->SetRegExpTooBig(); 3331 return; 3332 } 3333 3334 if (compiler->ascii()) { 3335 int dummy = 0; 3336 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); 3337 } 3338 3339 bool first_elt_done = false; 3340 int bound_checked_to = trace->cp_offset() - 1; 3341 bound_checked_to += trace->bound_checked_up_to(); 3342 3343 // If a character is preloaded into the current character register then 3344 // check that now. 3345 if (trace->characters_preloaded() == 1) { 3346 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3347 if (!SkipPass(pass, compiler->ignore_case())) { 3348 TextEmitPass(compiler, 3349 static_cast<TextEmitPassType>(pass), 3350 true, 3351 trace, 3352 false, 3353 &bound_checked_to); 3354 } 3355 } 3356 first_elt_done = true; 3357 } 3358 3359 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3360 if (!SkipPass(pass, compiler->ignore_case())) { 3361 TextEmitPass(compiler, 3362 static_cast<TextEmitPassType>(pass), 3363 false, 3364 trace, 3365 first_elt_done, 3366 &bound_checked_to); 3367 } 3368 } 3369 3370 Trace successor_trace(*trace); 3371 successor_trace.set_at_start(false); 3372 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 3373 RecursionCheck rc(compiler); 3374 on_success()->Emit(compiler, &successor_trace); 3375 } 3376 3377 3378 void Trace::InvalidateCurrentCharacter() { 3379 characters_preloaded_ = 0; 3380 } 3381 3382 3383 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3384 ASSERT(by > 0); 3385 // We don't have an instruction for shifting the current character register 3386 // down or for using a shifted value for anything so lets just forget that 3387 // we preloaded any characters into it. 3388 characters_preloaded_ = 0; 3389 // Adjust the offsets of the quick check performed information. This 3390 // information is used to find out what we already determined about the 3391 // characters by means of mask and compare. 3392 quick_check_performed_.Advance(by, compiler->ascii()); 3393 cp_offset_ += by; 3394 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3395 compiler->SetRegExpTooBig(); 3396 cp_offset_ = 0; 3397 } 3398 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3399 } 3400 3401 3402 void TextNode::MakeCaseIndependent(bool is_ascii) { 3403 int element_count = elms_->length(); 3404 for (int i = 0; i < element_count; i++) { 3405 TextElement elm = elms_->at(i); 3406 if (elm.text_type() == TextElement::CHAR_CLASS) { 3407 RegExpCharacterClass* cc = elm.char_class(); 3408 // None of the standard character classes is different in the case 3409 // independent case and it slows us down if we don't know that. 3410 if (cc->is_standard(zone())) continue; 3411 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3412 int range_count = ranges->length(); 3413 for (int j = 0; j < range_count; j++) { 3414 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone()); 3415 } 3416 } 3417 } 3418 } 3419 3420 3421 int TextNode::GreedyLoopTextLength() { 3422 TextElement elm = elms_->at(elms_->length() - 1); 3423 return elm.cp_offset() + elm.length(); 3424 } 3425 3426 3427 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3428 RegExpCompiler* compiler) { 3429 if (elms_->length() != 1) return NULL; 3430 TextElement elm = elms_->at(0); 3431 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3432 RegExpCharacterClass* node = elm.char_class(); 3433 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3434 if (!CharacterRange::IsCanonical(ranges)) { 3435 CharacterRange::Canonicalize(ranges); 3436 } 3437 if (node->is_negated()) { 3438 return ranges->length() == 0 ? on_success() : NULL; 3439 } 3440 if (ranges->length() != 1) return NULL; 3441 uint32_t max_char; 3442 if (compiler->ascii()) { 3443 max_char = String::kMaxOneByteCharCode; 3444 } else { 3445 max_char = String::kMaxUtf16CodeUnit; 3446 } 3447 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3448 } 3449 3450 3451 // Finds the fixed match length of a sequence of nodes that goes from 3452 // this alternative and back to this choice node. If there are variable 3453 // length nodes or other complications in the way then return a sentinel 3454 // value indicating that a greedy loop cannot be constructed. 3455 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3456 GuardedAlternative* alternative) { 3457 int length = 0; 3458 RegExpNode* node = alternative->node(); 3459 // Later we will generate code for all these text nodes using recursion 3460 // so we have to limit the max number. 3461 int recursion_depth = 0; 3462 while (node != this) { 3463 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3464 return kNodeIsTooComplexForGreedyLoops; 3465 } 3466 int node_length = node->GreedyLoopTextLength(); 3467 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3468 return kNodeIsTooComplexForGreedyLoops; 3469 } 3470 length += node_length; 3471 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3472 node = seq_node->on_success(); 3473 } 3474 return length; 3475 } 3476 3477 3478 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3479 ASSERT_EQ(loop_node_, NULL); 3480 AddAlternative(alt); 3481 loop_node_ = alt.node(); 3482 } 3483 3484 3485 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3486 ASSERT_EQ(continue_node_, NULL); 3487 AddAlternative(alt); 3488 continue_node_ = alt.node(); 3489 } 3490 3491 3492 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3493 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3494 if (trace->stop_node() == this) { 3495 int text_length = 3496 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3497 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); 3498 // Update the counter-based backtracking info on the stack. This is an 3499 // optimization for greedy loops (see below). 3500 ASSERT(trace->cp_offset() == text_length); 3501 macro_assembler->AdvanceCurrentPosition(text_length); 3502 macro_assembler->GoTo(trace->loop_label()); 3503 return; 3504 } 3505 ASSERT(trace->stop_node() == NULL); 3506 if (!trace->is_trivial()) { 3507 trace->Flush(compiler, this); 3508 return; 3509 } 3510 ChoiceNode::Emit(compiler, trace); 3511 } 3512 3513 3514 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3515 int eats_at_least) { 3516 int preload_characters = Min(4, eats_at_least); 3517 if (compiler->macro_assembler()->CanReadUnaligned()) { 3518 bool ascii = compiler->ascii(); 3519 if (ascii) { 3520 if (preload_characters > 4) preload_characters = 4; 3521 // We can't preload 3 characters because there is no machine instruction 3522 // to do that. We can't just load 4 because we could be reading 3523 // beyond the end of the string, which could cause a memory fault. 3524 if (preload_characters == 3) preload_characters = 2; 3525 } else { 3526 if (preload_characters > 2) preload_characters = 2; 3527 } 3528 } else { 3529 if (preload_characters > 1) preload_characters = 1; 3530 } 3531 return preload_characters; 3532 } 3533 3534 3535 // This class is used when generating the alternatives in a choice node. It 3536 // records the way the alternative is being code generated. 3537 class AlternativeGeneration: public Malloced { 3538 public: 3539 AlternativeGeneration() 3540 : possible_success(), 3541 expects_preload(false), 3542 after(), 3543 quick_check_details() { } 3544 Label possible_success; 3545 bool expects_preload; 3546 Label after; 3547 QuickCheckDetails quick_check_details; 3548 }; 3549 3550 3551 // Creates a list of AlternativeGenerations. If the list has a reasonable 3552 // size then it is on the stack, otherwise the excess is on the heap. 3553 class AlternativeGenerationList { 3554 public: 3555 AlternativeGenerationList(int count, Zone* zone) 3556 : alt_gens_(count, zone) { 3557 for (int i = 0; i < count && i < kAFew; i++) { 3558 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3559 } 3560 for (int i = kAFew; i < count; i++) { 3561 alt_gens_.Add(new AlternativeGeneration(), zone); 3562 } 3563 } 3564 ~AlternativeGenerationList() { 3565 for (int i = kAFew; i < alt_gens_.length(); i++) { 3566 delete alt_gens_[i]; 3567 alt_gens_[i] = NULL; 3568 } 3569 } 3570 3571 AlternativeGeneration* at(int i) { 3572 return alt_gens_[i]; 3573 } 3574 3575 private: 3576 static const int kAFew = 10; 3577 ZoneList<AlternativeGeneration*> alt_gens_; 3578 AlternativeGeneration a_few_alt_gens_[kAFew]; 3579 }; 3580 3581 3582 // The '2' variant is has inclusive from and exclusive to. 3583 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3584 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3585 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 3586 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 3587 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 3588 0xFEFF, 0xFF00, 0x10000 }; 3589 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); 3590 3591 static const int kWordRanges[] = { 3592 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 3593 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); 3594 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 3595 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); 3596 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; 3597 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); 3598 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 3599 0x2028, 0x202A, 0x10000 }; 3600 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); 3601 3602 3603 void BoyerMoorePositionInfo::Set(int character) { 3604 SetInterval(Interval(character, character)); 3605 } 3606 3607 3608 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3609 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3610 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3611 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3612 surrogate_ = 3613 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3614 if (interval.to() - interval.from() >= kMapSize - 1) { 3615 if (map_count_ != kMapSize) { 3616 map_count_ = kMapSize; 3617 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3618 } 3619 return; 3620 } 3621 for (int i = interval.from(); i <= interval.to(); i++) { 3622 int mod_character = (i & kMask); 3623 if (!map_->at(mod_character)) { 3624 map_count_++; 3625 map_->at(mod_character) = true; 3626 } 3627 if (map_count_ == kMapSize) return; 3628 } 3629 } 3630 3631 3632 void BoyerMoorePositionInfo::SetAll() { 3633 s_ = w_ = d_ = kLatticeUnknown; 3634 if (map_count_ != kMapSize) { 3635 map_count_ = kMapSize; 3636 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3637 } 3638 } 3639 3640 3641 BoyerMooreLookahead::BoyerMooreLookahead( 3642 int length, RegExpCompiler* compiler, Zone* zone) 3643 : length_(length), 3644 compiler_(compiler) { 3645 if (compiler->ascii()) { 3646 max_char_ = String::kMaxOneByteCharCode; 3647 } else { 3648 max_char_ = String::kMaxUtf16CodeUnit; 3649 } 3650 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3651 for (int i = 0; i < length; i++) { 3652 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3653 } 3654 } 3655 3656 3657 // Find the longest range of lookahead that has the fewest number of different 3658 // characters that can occur at a given position. Since we are optimizing two 3659 // different parameters at once this is a tradeoff. 3660 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3661 int biggest_points = 0; 3662 // If more than 32 characters out of 128 can occur it is unlikely that we can 3663 // be lucky enough to step forwards much of the time. 3664 const int kMaxMax = 32; 3665 for (int max_number_of_chars = 4; 3666 max_number_of_chars < kMaxMax; 3667 max_number_of_chars *= 2) { 3668 biggest_points = 3669 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3670 } 3671 if (biggest_points == 0) return false; 3672 return true; 3673 } 3674 3675 3676 // Find the highest-points range between 0 and length_ where the character 3677 // information is not too vague. 'Too vague' means that there are more than 3678 // max_number_of_chars that can occur at this position. Calculates the number 3679 // of points as the product of width-of-the-range and 3680 // probability-of-finding-one-of-the-characters, where the probability is 3681 // calculated using the frequency distribution of the sample subject string. 3682 int BoyerMooreLookahead::FindBestInterval( 3683 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3684 int biggest_points = old_biggest_points; 3685 static const int kSize = RegExpMacroAssembler::kTableSize; 3686 for (int i = 0; i < length_; ) { 3687 while (i < length_ && Count(i) > max_number_of_chars) i++; 3688 if (i == length_) break; 3689 int remembered_from = i; 3690 bool union_map[kSize]; 3691 for (int j = 0; j < kSize; j++) union_map[j] = false; 3692 while (i < length_ && Count(i) <= max_number_of_chars) { 3693 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3694 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3695 i++; 3696 } 3697 int frequency = 0; 3698 for (int j = 0; j < kSize; j++) { 3699 if (union_map[j]) { 3700 // Add 1 to the frequency to give a small per-character boost for 3701 // the cases where our sampling is not good enough and many 3702 // characters have a frequency of zero. This means the frequency 3703 // can theoretically be up to 2*kSize though we treat it mostly as 3704 // a fraction of kSize. 3705 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3706 } 3707 } 3708 // We use the probability of skipping times the distance we are skipping to 3709 // judge the effectiveness of this. Actually we have a cut-off: By 3710 // dividing by 2 we switch off the skipping if the probability of skipping 3711 // is less than 50%. This is because the multibyte mask-and-compare 3712 // skipping in quickcheck is more likely to do well on this case. 3713 bool in_quickcheck_range = ((i - remembered_from < 4) || 3714 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); 3715 // Called 'probability' but it is only a rough estimate and can actually 3716 // be outside the 0-kSize range. 3717 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3718 int points = (i - remembered_from) * probability; 3719 if (points > biggest_points) { 3720 *from = remembered_from; 3721 *to = i - 1; 3722 biggest_points = points; 3723 } 3724 } 3725 return biggest_points; 3726 } 3727 3728 3729 // Take all the characters that will not prevent a successful match if they 3730 // occur in the subject string in the range between min_lookahead and 3731 // max_lookahead (inclusive) measured from the current position. If the 3732 // character at max_lookahead offset is not one of these characters, then we 3733 // can safely skip forwards by the number of characters in the range. 3734 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3735 int max_lookahead, 3736 Handle<ByteArray> boolean_skip_table) { 3737 const int kSize = RegExpMacroAssembler::kTableSize; 3738 3739 const int kSkipArrayEntry = 0; 3740 const int kDontSkipArrayEntry = 1; 3741 3742 for (int i = 0; i < kSize; i++) { 3743 boolean_skip_table->set(i, kSkipArrayEntry); 3744 } 3745 int skip = max_lookahead + 1 - min_lookahead; 3746 3747 for (int i = max_lookahead; i >= min_lookahead; i--) { 3748 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3749 for (int j = 0; j < kSize; j++) { 3750 if (map->at(j)) { 3751 boolean_skip_table->set(j, kDontSkipArrayEntry); 3752 } 3753 } 3754 } 3755 3756 return skip; 3757 } 3758 3759 3760 // See comment above on the implementation of GetSkipTable. 3761 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3762 const int kSize = RegExpMacroAssembler::kTableSize; 3763 3764 int min_lookahead = 0; 3765 int max_lookahead = 0; 3766 3767 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; 3768 3769 bool found_single_character = false; 3770 int single_character = 0; 3771 for (int i = max_lookahead; i >= min_lookahead; i--) { 3772 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3773 if (map->map_count() > 1 || 3774 (found_single_character && map->map_count() != 0)) { 3775 found_single_character = false; 3776 break; 3777 } 3778 for (int j = 0; j < kSize; j++) { 3779 if (map->at(j)) { 3780 found_single_character = true; 3781 single_character = j; 3782 break; 3783 } 3784 } 3785 } 3786 3787 int lookahead_width = max_lookahead + 1 - min_lookahead; 3788 3789 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3790 // The mask-compare can probably handle this better. 3791 return false; 3792 } 3793 3794 if (found_single_character) { 3795 Label cont, again; 3796 masm->Bind(&again); 3797 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3798 if (max_char_ > kSize) { 3799 masm->CheckCharacterAfterAnd(single_character, 3800 RegExpMacroAssembler::kTableMask, 3801 &cont); 3802 } else { 3803 masm->CheckCharacter(single_character, &cont); 3804 } 3805 masm->AdvanceCurrentPosition(lookahead_width); 3806 masm->GoTo(&again); 3807 masm->Bind(&cont); 3808 return true; 3809 } 3810 3811 Factory* factory = masm->zone()->isolate()->factory(); 3812 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3813 int skip_distance = GetSkipTable( 3814 min_lookahead, max_lookahead, boolean_skip_table); 3815 ASSERT(skip_distance != 0); 3816 3817 Label cont, again; 3818 masm->Bind(&again); 3819 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3820 masm->CheckBitInTable(boolean_skip_table, &cont); 3821 masm->AdvanceCurrentPosition(skip_distance); 3822 masm->GoTo(&again); 3823 masm->Bind(&cont); 3824 3825 return true; 3826 } 3827 3828 3829 /* Code generation for choice nodes. 3830 * 3831 * We generate quick checks that do a mask and compare to eliminate a 3832 * choice. If the quick check succeeds then it jumps to the continuation to 3833 * do slow checks and check subsequent nodes. If it fails (the common case) 3834 * it falls through to the next choice. 3835 * 3836 * Here is the desired flow graph. Nodes directly below each other imply 3837 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3838 * 3 doesn't have a quick check so we have to call the slow check. 3839 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3840 * regexp continuation is generated directly after the Sn node, up to the 3841 * next GoTo if we decide to reuse some already generated code. Some 3842 * nodes expect preload_characters to be preloaded into the current 3843 * character register. R nodes do this preloading. Vertices are marked 3844 * F for failures and S for success (possible success in the case of quick 3845 * nodes). L, V, < and > are used as arrow heads. 3846 * 3847 * ----------> R 3848 * | 3849 * V 3850 * Q1 -----> S1 3851 * | S / 3852 * F| / 3853 * | F/ 3854 * | / 3855 * | R 3856 * | / 3857 * V L 3858 * Q2 -----> S2 3859 * | S / 3860 * F| / 3861 * | F/ 3862 * | / 3863 * | R 3864 * | / 3865 * V L 3866 * S3 3867 * | 3868 * F| 3869 * | 3870 * R 3871 * | 3872 * backtrack V 3873 * <----------Q4 3874 * \ F | 3875 * \ |S 3876 * \ F V 3877 * \-----S4 3878 * 3879 * For greedy loops we reverse our expectation and expect to match rather 3880 * than fail. Therefore we want the loop code to look like this (U is the 3881 * unwind code that steps back in the greedy loop). The following alternatives 3882 * look the same as above. 3883 * _____ 3884 * / \ 3885 * V | 3886 * ----------> S1 | 3887 * /| | 3888 * / |S | 3889 * F/ \_____/ 3890 * / 3891 * |<----------- 3892 * | \ 3893 * V \ 3894 * Q2 ---> S2 \ 3895 * | S / | 3896 * F| / | 3897 * | F/ | 3898 * | / | 3899 * | R | 3900 * | / | 3901 * F VL | 3902 * <------U | 3903 * back |S | 3904 * \______________/ 3905 */ 3906 3907 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3908 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3909 int choice_count = alternatives_->length(); 3910 #ifdef DEBUG 3911 for (int i = 0; i < choice_count - 1; i++) { 3912 GuardedAlternative alternative = alternatives_->at(i); 3913 ZoneList<Guard*>* guards = alternative.guards(); 3914 int guard_count = (guards == NULL) ? 0 : guards->length(); 3915 for (int j = 0; j < guard_count; j++) { 3916 ASSERT(!trace->mentions_reg(guards->at(j)->reg())); 3917 } 3918 } 3919 #endif 3920 3921 LimitResult limit_result = LimitVersions(compiler, trace); 3922 if (limit_result == DONE) return; 3923 ASSERT(limit_result == CONTINUE); 3924 3925 int new_flush_budget = trace->flush_budget() / choice_count; 3926 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3927 trace->Flush(compiler, this); 3928 return; 3929 } 3930 3931 RecursionCheck rc(compiler); 3932 3933 Trace* current_trace = trace; 3934 3935 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3936 bool greedy_loop = false; 3937 Label greedy_loop_label; 3938 Trace counter_backtrack_trace; 3939 counter_backtrack_trace.set_backtrack(&greedy_loop_label); 3940 if (not_at_start()) counter_backtrack_trace.set_at_start(false); 3941 3942 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3943 // Here we have special handling for greedy loops containing only text nodes 3944 // and other simple nodes. These are handled by pushing the current 3945 // position on the stack and then incrementing the current position each 3946 // time around the switch. On backtrack we decrement the current position 3947 // and check it against the pushed value. This avoids pushing backtrack 3948 // information for each iteration of the loop, which could take up a lot of 3949 // space. 3950 greedy_loop = true; 3951 ASSERT(trace->stop_node() == NULL); 3952 macro_assembler->PushCurrentPosition(); 3953 current_trace = &counter_backtrack_trace; 3954 Label greedy_match_failed; 3955 Trace greedy_match_trace; 3956 if (not_at_start()) greedy_match_trace.set_at_start(false); 3957 greedy_match_trace.set_backtrack(&greedy_match_failed); 3958 Label loop_label; 3959 macro_assembler->Bind(&loop_label); 3960 greedy_match_trace.set_stop_node(this); 3961 greedy_match_trace.set_loop_label(&loop_label); 3962 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 3963 macro_assembler->Bind(&greedy_match_failed); 3964 } 3965 3966 Label second_choice; // For use in greedy matches. 3967 macro_assembler->Bind(&second_choice); 3968 3969 int first_normal_choice = greedy_loop ? 1 : 0; 3970 3971 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE; 3972 const int kEatsAtLeastNotYetInitialized = -1; 3973 int eats_at_least = kEatsAtLeastNotYetInitialized; 3974 3975 bool skip_was_emitted = false; 3976 3977 if (!greedy_loop && choice_count == 2) { 3978 GuardedAlternative alt1 = alternatives_->at(1); 3979 if (alt1.guards() == NULL || alt1.guards()->length() == 0) { 3980 RegExpNode* eats_anything_node = alt1.node(); 3981 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == 3982 this) { 3983 // At this point we know that we are at a non-greedy loop that will eat 3984 // any character one at a time. Any non-anchored regexp has such a 3985 // loop prepended to it in order to find where it starts. We look for 3986 // a pattern of the form ...abc... where we can look 6 characters ahead 3987 // and step forwards 3 if the character is not one of abc. Abc need 3988 // not be atoms, they can be any reasonably limited character class or 3989 // small alternation. 3990 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. 3991 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3992 if (lookahead == NULL) { 3993 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 3994 EatsAtLeast(kMaxLookaheadForBoyerMoore, 3995 kRecursionBudget, 3996 not_at_start)); 3997 if (eats_at_least >= 1) { 3998 BoyerMooreLookahead* bm = 3999 new(zone()) BoyerMooreLookahead(eats_at_least, 4000 compiler, 4001 zone()); 4002 GuardedAlternative alt0 = alternatives_->at(0); 4003 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 4004 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); 4005 } 4006 } else { 4007 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); 4008 } 4009 } 4010 } 4011 } 4012 4013 if (eats_at_least == kEatsAtLeastNotYetInitialized) { 4014 // Save some time by looking at most one machine word ahead. 4015 eats_at_least = 4016 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start); 4017 } 4018 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); 4019 4020 bool preload_is_current = !skip_was_emitted && 4021 (current_trace->characters_preloaded() == preload_characters); 4022 bool preload_has_checked_bounds = preload_is_current; 4023 4024 AlternativeGenerationList alt_gens(choice_count, zone()); 4025 4026 // For now we just call all choices one after the other. The idea ultimately 4027 // is to use the Dispatch table to try only the relevant ones. 4028 for (int i = first_normal_choice; i < choice_count; i++) { 4029 GuardedAlternative alternative = alternatives_->at(i); 4030 AlternativeGeneration* alt_gen = alt_gens.at(i); 4031 alt_gen->quick_check_details.set_characters(preload_characters); 4032 ZoneList<Guard*>* guards = alternative.guards(); 4033 int guard_count = (guards == NULL) ? 0 : guards->length(); 4034 Trace new_trace(*current_trace); 4035 new_trace.set_characters_preloaded(preload_is_current ? 4036 preload_characters : 4037 0); 4038 if (preload_has_checked_bounds) { 4039 new_trace.set_bound_checked_up_to(preload_characters); 4040 } 4041 new_trace.quick_check_performed()->Clear(); 4042 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4043 alt_gen->expects_preload = preload_is_current; 4044 bool generate_full_check_inline = false; 4045 if (FLAG_regexp_optimization && 4046 try_to_emit_quick_check_for_alternative(i) && 4047 alternative.node()->EmitQuickCheck(compiler, 4048 &new_trace, 4049 preload_has_checked_bounds, 4050 &alt_gen->possible_success, 4051 &alt_gen->quick_check_details, 4052 i < choice_count - 1)) { 4053 // Quick check was generated for this choice. 4054 preload_is_current = true; 4055 preload_has_checked_bounds = true; 4056 // On the last choice in the ChoiceNode we generated the quick 4057 // check to fall through on possible success. So now we need to 4058 // generate the full check inline. 4059 if (i == choice_count - 1) { 4060 macro_assembler->Bind(&alt_gen->possible_success); 4061 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4062 new_trace.set_characters_preloaded(preload_characters); 4063 new_trace.set_bound_checked_up_to(preload_characters); 4064 generate_full_check_inline = true; 4065 } 4066 } else if (alt_gen->quick_check_details.cannot_match()) { 4067 if (i == choice_count - 1 && !greedy_loop) { 4068 macro_assembler->GoTo(trace->backtrack()); 4069 } 4070 continue; 4071 } else { 4072 // No quick check was generated. Put the full code here. 4073 // If this is not the first choice then there could be slow checks from 4074 // previous cases that go here when they fail. There's no reason to 4075 // insist that they preload characters since the slow check we are about 4076 // to generate probably can't use it. 4077 if (i != first_normal_choice) { 4078 alt_gen->expects_preload = false; 4079 new_trace.InvalidateCurrentCharacter(); 4080 } 4081 if (i < choice_count - 1) { 4082 new_trace.set_backtrack(&alt_gen->after); 4083 } 4084 generate_full_check_inline = true; 4085 } 4086 if (generate_full_check_inline) { 4087 if (new_trace.actions() != NULL) { 4088 new_trace.set_flush_budget(new_flush_budget); 4089 } 4090 for (int j = 0; j < guard_count; j++) { 4091 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4092 } 4093 alternative.node()->Emit(compiler, &new_trace); 4094 preload_is_current = false; 4095 } 4096 macro_assembler->Bind(&alt_gen->after); 4097 } 4098 if (greedy_loop) { 4099 macro_assembler->Bind(&greedy_loop_label); 4100 // If we have unwound to the bottom then backtrack. 4101 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4102 // Otherwise try the second priority at an earlier position. 4103 macro_assembler->AdvanceCurrentPosition(-text_length); 4104 macro_assembler->GoTo(&second_choice); 4105 } 4106 4107 // At this point we need to generate slow checks for the alternatives where 4108 // the quick check was inlined. We can recognize these because the associated 4109 // label was bound. 4110 for (int i = first_normal_choice; i < choice_count - 1; i++) { 4111 AlternativeGeneration* alt_gen = alt_gens.at(i); 4112 Trace new_trace(*current_trace); 4113 // If there are actions to be flushed we have to limit how many times 4114 // they are flushed. Take the budget of the parent trace and distribute 4115 // it fairly amongst the children. 4116 if (new_trace.actions() != NULL) { 4117 new_trace.set_flush_budget(new_flush_budget); 4118 } 4119 EmitOutOfLineContinuation(compiler, 4120 &new_trace, 4121 alternatives_->at(i), 4122 alt_gen, 4123 preload_characters, 4124 alt_gens.at(i + 1)->expects_preload); 4125 } 4126 } 4127 4128 4129 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4130 Trace* trace, 4131 GuardedAlternative alternative, 4132 AlternativeGeneration* alt_gen, 4133 int preload_characters, 4134 bool next_expects_preload) { 4135 if (!alt_gen->possible_success.is_linked()) return; 4136 4137 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4138 macro_assembler->Bind(&alt_gen->possible_success); 4139 Trace out_of_line_trace(*trace); 4140 out_of_line_trace.set_characters_preloaded(preload_characters); 4141 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4142 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4143 ZoneList<Guard*>* guards = alternative.guards(); 4144 int guard_count = (guards == NULL) ? 0 : guards->length(); 4145 if (next_expects_preload) { 4146 Label reload_current_char; 4147 out_of_line_trace.set_backtrack(&reload_current_char); 4148 for (int j = 0; j < guard_count; j++) { 4149 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4150 } 4151 alternative.node()->Emit(compiler, &out_of_line_trace); 4152 macro_assembler->Bind(&reload_current_char); 4153 // Reload the current character, since the next quick check expects that. 4154 // We don't need to check bounds here because we only get into this 4155 // code through a quick check which already did the checked load. 4156 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4157 NULL, 4158 false, 4159 preload_characters); 4160 macro_assembler->GoTo(&(alt_gen->after)); 4161 } else { 4162 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4163 for (int j = 0; j < guard_count; j++) { 4164 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4165 } 4166 alternative.node()->Emit(compiler, &out_of_line_trace); 4167 } 4168 } 4169 4170 4171 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4172 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4173 LimitResult limit_result = LimitVersions(compiler, trace); 4174 if (limit_result == DONE) return; 4175 ASSERT(limit_result == CONTINUE); 4176 4177 RecursionCheck rc(compiler); 4178 4179 switch (action_type_) { 4180 case STORE_POSITION: { 4181 Trace::DeferredCapture 4182 new_capture(data_.u_position_register.reg, 4183 data_.u_position_register.is_capture, 4184 trace); 4185 Trace new_trace = *trace; 4186 new_trace.add_action(&new_capture); 4187 on_success()->Emit(compiler, &new_trace); 4188 break; 4189 } 4190 case INCREMENT_REGISTER: { 4191 Trace::DeferredIncrementRegister 4192 new_increment(data_.u_increment_register.reg); 4193 Trace new_trace = *trace; 4194 new_trace.add_action(&new_increment); 4195 on_success()->Emit(compiler, &new_trace); 4196 break; 4197 } 4198 case SET_REGISTER: { 4199 Trace::DeferredSetRegister 4200 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4201 Trace new_trace = *trace; 4202 new_trace.add_action(&new_set); 4203 on_success()->Emit(compiler, &new_trace); 4204 break; 4205 } 4206 case CLEAR_CAPTURES: { 4207 Trace::DeferredClearCaptures 4208 new_capture(Interval(data_.u_clear_captures.range_from, 4209 data_.u_clear_captures.range_to)); 4210 Trace new_trace = *trace; 4211 new_trace.add_action(&new_capture); 4212 on_success()->Emit(compiler, &new_trace); 4213 break; 4214 } 4215 case BEGIN_SUBMATCH: 4216 if (!trace->is_trivial()) { 4217 trace->Flush(compiler, this); 4218 } else { 4219 assembler->WriteCurrentPositionToRegister( 4220 data_.u_submatch.current_position_register, 0); 4221 assembler->WriteStackPointerToRegister( 4222 data_.u_submatch.stack_pointer_register); 4223 on_success()->Emit(compiler, trace); 4224 } 4225 break; 4226 case EMPTY_MATCH_CHECK: { 4227 int start_pos_reg = data_.u_empty_match_check.start_register; 4228 int stored_pos = 0; 4229 int rep_reg = data_.u_empty_match_check.repetition_register; 4230 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4231 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4232 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4233 // If we know we haven't advanced and there is no minimum we 4234 // can just backtrack immediately. 4235 assembler->GoTo(trace->backtrack()); 4236 } else if (know_dist && stored_pos < trace->cp_offset()) { 4237 // If we know we've advanced we can generate the continuation 4238 // immediately. 4239 on_success()->Emit(compiler, trace); 4240 } else if (!trace->is_trivial()) { 4241 trace->Flush(compiler, this); 4242 } else { 4243 Label skip_empty_check; 4244 // If we have a minimum number of repetitions we check the current 4245 // number first and skip the empty check if it's not enough. 4246 if (has_minimum) { 4247 int limit = data_.u_empty_match_check.repetition_limit; 4248 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4249 } 4250 // If the match is empty we bail out, otherwise we fall through 4251 // to the on-success continuation. 4252 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4253 trace->backtrack()); 4254 assembler->Bind(&skip_empty_check); 4255 on_success()->Emit(compiler, trace); 4256 } 4257 break; 4258 } 4259 case POSITIVE_SUBMATCH_SUCCESS: { 4260 if (!trace->is_trivial()) { 4261 trace->Flush(compiler, this); 4262 return; 4263 } 4264 assembler->ReadCurrentPositionFromRegister( 4265 data_.u_submatch.current_position_register); 4266 assembler->ReadStackPointerFromRegister( 4267 data_.u_submatch.stack_pointer_register); 4268 int clear_register_count = data_.u_submatch.clear_register_count; 4269 if (clear_register_count == 0) { 4270 on_success()->Emit(compiler, trace); 4271 return; 4272 } 4273 int clear_registers_from = data_.u_submatch.clear_register_from; 4274 Label clear_registers_backtrack; 4275 Trace new_trace = *trace; 4276 new_trace.set_backtrack(&clear_registers_backtrack); 4277 on_success()->Emit(compiler, &new_trace); 4278 4279 assembler->Bind(&clear_registers_backtrack); 4280 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4281 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4282 4283 ASSERT(trace->backtrack() == NULL); 4284 assembler->Backtrack(); 4285 return; 4286 } 4287 default: 4288 UNREACHABLE(); 4289 } 4290 } 4291 4292 4293 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4294 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4295 if (!trace->is_trivial()) { 4296 trace->Flush(compiler, this); 4297 return; 4298 } 4299 4300 LimitResult limit_result = LimitVersions(compiler, trace); 4301 if (limit_result == DONE) return; 4302 ASSERT(limit_result == CONTINUE); 4303 4304 RecursionCheck rc(compiler); 4305 4306 ASSERT_EQ(start_reg_ + 1, end_reg_); 4307 if (compiler->ignore_case()) { 4308 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 4309 trace->backtrack()); 4310 } else { 4311 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 4312 } 4313 on_success()->Emit(compiler, trace); 4314 } 4315 4316 4317 // ------------------------------------------------------------------- 4318 // Dot/dotty output 4319 4320 4321 #ifdef DEBUG 4322 4323 4324 class DotPrinter: public NodeVisitor { 4325 public: 4326 explicit DotPrinter(bool ignore_case) 4327 : ignore_case_(ignore_case), 4328 stream_(&alloc_) { } 4329 void PrintNode(const char* label, RegExpNode* node); 4330 void Visit(RegExpNode* node); 4331 void PrintAttributes(RegExpNode* from); 4332 StringStream* stream() { return &stream_; } 4333 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4334 #define DECLARE_VISIT(Type) \ 4335 virtual void Visit##Type(Type##Node* that); 4336 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4337 #undef DECLARE_VISIT 4338 private: 4339 bool ignore_case_; 4340 HeapStringAllocator alloc_; 4341 StringStream stream_; 4342 }; 4343 4344 4345 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4346 stream()->Add("digraph G {\n graph [label=\""); 4347 for (int i = 0; label[i]; i++) { 4348 switch (label[i]) { 4349 case '\\': 4350 stream()->Add("\\\\"); 4351 break; 4352 case '"': 4353 stream()->Add("\""); 4354 break; 4355 default: 4356 stream()->Put(label[i]); 4357 break; 4358 } 4359 } 4360 stream()->Add("\"];\n"); 4361 Visit(node); 4362 stream()->Add("}\n"); 4363 printf("%s", stream()->ToCString().get()); 4364 } 4365 4366 4367 void DotPrinter::Visit(RegExpNode* node) { 4368 if (node->info()->visited) return; 4369 node->info()->visited = true; 4370 node->Accept(this); 4371 } 4372 4373 4374 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4375 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); 4376 Visit(on_failure); 4377 } 4378 4379 4380 class TableEntryBodyPrinter { 4381 public: 4382 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) 4383 : stream_(stream), choice_(choice) { } 4384 void Call(uc16 from, DispatchTable::Entry entry) { 4385 OutSet* out_set = entry.out_set(); 4386 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4387 if (out_set->Get(i)) { 4388 stream()->Add(" n%p:s%io%i -> n%p;\n", 4389 choice(), 4390 from, 4391 i, 4392 choice()->alternatives()->at(i).node()); 4393 } 4394 } 4395 } 4396 private: 4397 StringStream* stream() { return stream_; } 4398 ChoiceNode* choice() { return choice_; } 4399 StringStream* stream_; 4400 ChoiceNode* choice_; 4401 }; 4402 4403 4404 class TableEntryHeaderPrinter { 4405 public: 4406 explicit TableEntryHeaderPrinter(StringStream* stream) 4407 : first_(true), stream_(stream) { } 4408 void Call(uc16 from, DispatchTable::Entry entry) { 4409 if (first_) { 4410 first_ = false; 4411 } else { 4412 stream()->Add("|"); 4413 } 4414 stream()->Add("{\\%k-\\%k|{", from, entry.to()); 4415 OutSet* out_set = entry.out_set(); 4416 int priority = 0; 4417 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4418 if (out_set->Get(i)) { 4419 if (priority > 0) stream()->Add("|"); 4420 stream()->Add("<s%io%i> %i", from, i, priority); 4421 priority++; 4422 } 4423 } 4424 stream()->Add("}}"); 4425 } 4426 4427 private: 4428 bool first_; 4429 StringStream* stream() { return stream_; } 4430 StringStream* stream_; 4431 }; 4432 4433 4434 class AttributePrinter { 4435 public: 4436 explicit AttributePrinter(DotPrinter* out) 4437 : out_(out), first_(true) { } 4438 void PrintSeparator() { 4439 if (first_) { 4440 first_ = false; 4441 } else { 4442 out_->stream()->Add("|"); 4443 } 4444 } 4445 void PrintBit(const char* name, bool value) { 4446 if (!value) return; 4447 PrintSeparator(); 4448 out_->stream()->Add("{%s}", name); 4449 } 4450 void PrintPositive(const char* name, int value) { 4451 if (value < 0) return; 4452 PrintSeparator(); 4453 out_->stream()->Add("{%s|%x}", name, value); 4454 } 4455 private: 4456 DotPrinter* out_; 4457 bool first_; 4458 }; 4459 4460 4461 void DotPrinter::PrintAttributes(RegExpNode* that) { 4462 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, " 4463 "margin=0.1, fontsize=10, label=\"{", 4464 that); 4465 AttributePrinter printer(this); 4466 NodeInfo* info = that->info(); 4467 printer.PrintBit("NI", info->follows_newline_interest); 4468 printer.PrintBit("WI", info->follows_word_interest); 4469 printer.PrintBit("SI", info->follows_start_interest); 4470 Label* label = that->label(); 4471 if (label->is_bound()) 4472 printer.PrintPositive("@", label->pos()); 4473 stream()->Add("}\"];\n"); 4474 stream()->Add(" a%p -> n%p [style=dashed, color=grey, " 4475 "arrowhead=none];\n", that, that); 4476 } 4477 4478 4479 static const bool kPrintDispatchTable = false; 4480 void DotPrinter::VisitChoice(ChoiceNode* that) { 4481 if (kPrintDispatchTable) { 4482 stream()->Add(" n%p [shape=Mrecord, label=\"", that); 4483 TableEntryHeaderPrinter header_printer(stream()); 4484 that->GetTable(ignore_case_)->ForEach(&header_printer); 4485 stream()->Add("\"]\n", that); 4486 PrintAttributes(that); 4487 TableEntryBodyPrinter body_printer(stream(), that); 4488 that->GetTable(ignore_case_)->ForEach(&body_printer); 4489 } else { 4490 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that); 4491 for (int i = 0; i < that->alternatives()->length(); i++) { 4492 GuardedAlternative alt = that->alternatives()->at(i); 4493 stream()->Add(" n%p -> n%p;\n", that, alt.node()); 4494 } 4495 } 4496 for (int i = 0; i < that->alternatives()->length(); i++) { 4497 GuardedAlternative alt = that->alternatives()->at(i); 4498 alt.node()->Accept(this); 4499 } 4500 } 4501 4502 4503 void DotPrinter::VisitText(TextNode* that) { 4504 Zone* zone = that->zone(); 4505 stream()->Add(" n%p [label=\"", that); 4506 for (int i = 0; i < that->elements()->length(); i++) { 4507 if (i > 0) stream()->Add(" "); 4508 TextElement elm = that->elements()->at(i); 4509 switch (elm.text_type()) { 4510 case TextElement::ATOM: { 4511 stream()->Add("'%w'", elm.atom()->data()); 4512 break; 4513 } 4514 case TextElement::CHAR_CLASS: { 4515 RegExpCharacterClass* node = elm.char_class(); 4516 stream()->Add("["); 4517 if (node->is_negated()) 4518 stream()->Add("^"); 4519 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4520 CharacterRange range = node->ranges(zone)->at(j); 4521 stream()->Add("%k-%k", range.from(), range.to()); 4522 } 4523 stream()->Add("]"); 4524 break; 4525 } 4526 default: 4527 UNREACHABLE(); 4528 } 4529 } 4530 stream()->Add("\", shape=box, peripheries=2];\n"); 4531 PrintAttributes(that); 4532 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 4533 Visit(that->on_success()); 4534 } 4535 4536 4537 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4538 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", 4539 that, 4540 that->start_register(), 4541 that->end_register()); 4542 PrintAttributes(that); 4543 stream()->Add(" n%p -> n%p;\n", that, that->on_success()); 4544 Visit(that->on_success()); 4545 } 4546 4547 4548 void DotPrinter::VisitEnd(EndNode* that) { 4549 stream()->Add(" n%p [style=bold, shape=point];\n", that); 4550 PrintAttributes(that); 4551 } 4552 4553 4554 void DotPrinter::VisitAssertion(AssertionNode* that) { 4555 stream()->Add(" n%p [", that); 4556 switch (that->assertion_type()) { 4557 case AssertionNode::AT_END: 4558 stream()->Add("label=\"$\", shape=septagon"); 4559 break; 4560 case AssertionNode::AT_START: 4561 stream()->Add("label=\"^\", shape=septagon"); 4562 break; 4563 case AssertionNode::AT_BOUNDARY: 4564 stream()->Add("label=\"\\b\", shape=septagon"); 4565 break; 4566 case AssertionNode::AT_NON_BOUNDARY: 4567 stream()->Add("label=\"\\B\", shape=septagon"); 4568 break; 4569 case AssertionNode::AFTER_NEWLINE: 4570 stream()->Add("label=\"(?<=\\n)\", shape=septagon"); 4571 break; 4572 } 4573 stream()->Add("];\n"); 4574 PrintAttributes(that); 4575 RegExpNode* successor = that->on_success(); 4576 stream()->Add(" n%p -> n%p;\n", that, successor); 4577 Visit(successor); 4578 } 4579 4580 4581 void DotPrinter::VisitAction(ActionNode* that) { 4582 stream()->Add(" n%p [", that); 4583 switch (that->action_type_) { 4584 case ActionNode::SET_REGISTER: 4585 stream()->Add("label=\"$%i:=%i\", shape=octagon", 4586 that->data_.u_store_register.reg, 4587 that->data_.u_store_register.value); 4588 break; 4589 case ActionNode::INCREMENT_REGISTER: 4590 stream()->Add("label=\"$%i++\", shape=octagon", 4591 that->data_.u_increment_register.reg); 4592 break; 4593 case ActionNode::STORE_POSITION: 4594 stream()->Add("label=\"$%i:=$pos\", shape=octagon", 4595 that->data_.u_position_register.reg); 4596 break; 4597 case ActionNode::BEGIN_SUBMATCH: 4598 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon", 4599 that->data_.u_submatch.current_position_register); 4600 break; 4601 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4602 stream()->Add("label=\"escape\", shape=septagon"); 4603 break; 4604 case ActionNode::EMPTY_MATCH_CHECK: 4605 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon", 4606 that->data_.u_empty_match_check.start_register, 4607 that->data_.u_empty_match_check.repetition_register, 4608 that->data_.u_empty_match_check.repetition_limit); 4609 break; 4610 case ActionNode::CLEAR_CAPTURES: { 4611 stream()->Add("label=\"clear $%i to $%i\", shape=septagon", 4612 that->data_.u_clear_captures.range_from, 4613 that->data_.u_clear_captures.range_to); 4614 break; 4615 } 4616 } 4617 stream()->Add("];\n"); 4618 PrintAttributes(that); 4619 RegExpNode* successor = that->on_success(); 4620 stream()->Add(" n%p -> n%p;\n", that, successor); 4621 Visit(successor); 4622 } 4623 4624 4625 class DispatchTableDumper { 4626 public: 4627 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } 4628 void Call(uc16 key, DispatchTable::Entry entry); 4629 StringStream* stream() { return stream_; } 4630 private: 4631 StringStream* stream_; 4632 }; 4633 4634 4635 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4636 stream()->Add("[%k-%k]: {", key, entry.to()); 4637 OutSet* set = entry.out_set(); 4638 bool first = true; 4639 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4640 if (set->Get(i)) { 4641 if (first) { 4642 first = false; 4643 } else { 4644 stream()->Add(", "); 4645 } 4646 stream()->Add("%i", i); 4647 } 4648 } 4649 stream()->Add("}\n"); 4650 } 4651 4652 4653 void DispatchTable::Dump() { 4654 HeapStringAllocator alloc; 4655 StringStream stream(&alloc); 4656 DispatchTableDumper dumper(&stream); 4657 tree()->ForEach(&dumper); 4658 OS::PrintError("%s", stream.ToCString().get()); 4659 } 4660 4661 4662 void RegExpEngine::DotPrint(const char* label, 4663 RegExpNode* node, 4664 bool ignore_case) { 4665 DotPrinter printer(ignore_case); 4666 printer.PrintNode(label, node); 4667 } 4668 4669 4670 #endif // DEBUG 4671 4672 4673 // ------------------------------------------------------------------- 4674 // Tree to graph conversion 4675 4676 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4677 RegExpNode* on_success) { 4678 ZoneList<TextElement>* elms = 4679 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4680 elms->Add(TextElement::Atom(this), compiler->zone()); 4681 return new(compiler->zone()) TextNode(elms, on_success); 4682 } 4683 4684 4685 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4686 RegExpNode* on_success) { 4687 return new(compiler->zone()) TextNode(elements(), on_success); 4688 } 4689 4690 4691 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4692 const int* special_class, 4693 int length) { 4694 length--; // Remove final 0x10000. 4695 ASSERT(special_class[length] == 0x10000); 4696 ASSERT(ranges->length() != 0); 4697 ASSERT(length != 0); 4698 ASSERT(special_class[0] != 0); 4699 if (ranges->length() != (length >> 1) + 1) { 4700 return false; 4701 } 4702 CharacterRange range = ranges->at(0); 4703 if (range.from() != 0) { 4704 return false; 4705 } 4706 for (int i = 0; i < length; i += 2) { 4707 if (special_class[i] != (range.to() + 1)) { 4708 return false; 4709 } 4710 range = ranges->at((i >> 1) + 1); 4711 if (special_class[i+1] != range.from()) { 4712 return false; 4713 } 4714 } 4715 if (range.to() != 0xffff) { 4716 return false; 4717 } 4718 return true; 4719 } 4720 4721 4722 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4723 const int* special_class, 4724 int length) { 4725 length--; // Remove final 0x10000. 4726 ASSERT(special_class[length] == 0x10000); 4727 if (ranges->length() * 2 != length) { 4728 return false; 4729 } 4730 for (int i = 0; i < length; i += 2) { 4731 CharacterRange range = ranges->at(i >> 1); 4732 if (range.from() != special_class[i] || 4733 range.to() != special_class[i + 1] - 1) { 4734 return false; 4735 } 4736 } 4737 return true; 4738 } 4739 4740 4741 bool RegExpCharacterClass::is_standard(Zone* zone) { 4742 // TODO(lrn): Remove need for this function, by not throwing away information 4743 // along the way. 4744 if (is_negated_) { 4745 return false; 4746 } 4747 if (set_.is_standard()) { 4748 return true; 4749 } 4750 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4751 set_.set_standard_set_type('s'); 4752 return true; 4753 } 4754 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4755 set_.set_standard_set_type('S'); 4756 return true; 4757 } 4758 if (CompareInverseRanges(set_.ranges(zone), 4759 kLineTerminatorRanges, 4760 kLineTerminatorRangeCount)) { 4761 set_.set_standard_set_type('.'); 4762 return true; 4763 } 4764 if (CompareRanges(set_.ranges(zone), 4765 kLineTerminatorRanges, 4766 kLineTerminatorRangeCount)) { 4767 set_.set_standard_set_type('n'); 4768 return true; 4769 } 4770 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4771 set_.set_standard_set_type('w'); 4772 return true; 4773 } 4774 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4775 set_.set_standard_set_type('W'); 4776 return true; 4777 } 4778 return false; 4779 } 4780 4781 4782 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4783 RegExpNode* on_success) { 4784 return new(compiler->zone()) TextNode(this, on_success); 4785 } 4786 4787 4788 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4789 RegExpNode* on_success) { 4790 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4791 int length = alternatives->length(); 4792 ChoiceNode* result = 4793 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 4794 for (int i = 0; i < length; i++) { 4795 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 4796 on_success)); 4797 result->AddAlternative(alternative); 4798 } 4799 return result; 4800 } 4801 4802 4803 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 4804 RegExpNode* on_success) { 4805 return ToNode(min(), 4806 max(), 4807 is_greedy(), 4808 body(), 4809 compiler, 4810 on_success); 4811 } 4812 4813 4814 // Scoped object to keep track of how much we unroll quantifier loops in the 4815 // regexp graph generator. 4816 class RegExpExpansionLimiter { 4817 public: 4818 static const int kMaxExpansionFactor = 6; 4819 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 4820 : compiler_(compiler), 4821 saved_expansion_factor_(compiler->current_expansion_factor()), 4822 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 4823 ASSERT(factor > 0); 4824 if (ok_to_expand_) { 4825 if (factor > kMaxExpansionFactor) { 4826 // Avoid integer overflow of the current expansion factor. 4827 ok_to_expand_ = false; 4828 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 4829 } else { 4830 int new_factor = saved_expansion_factor_ * factor; 4831 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 4832 compiler->set_current_expansion_factor(new_factor); 4833 } 4834 } 4835 } 4836 4837 ~RegExpExpansionLimiter() { 4838 compiler_->set_current_expansion_factor(saved_expansion_factor_); 4839 } 4840 4841 bool ok_to_expand() { return ok_to_expand_; } 4842 4843 private: 4844 RegExpCompiler* compiler_; 4845 int saved_expansion_factor_; 4846 bool ok_to_expand_; 4847 4848 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 4849 }; 4850 4851 4852 RegExpNode* RegExpQuantifier::ToNode(int min, 4853 int max, 4854 bool is_greedy, 4855 RegExpTree* body, 4856 RegExpCompiler* compiler, 4857 RegExpNode* on_success, 4858 bool not_at_start) { 4859 // x{f, t} becomes this: 4860 // 4861 // (r++)<-. 4862 // | ` 4863 // | (x) 4864 // v ^ 4865 // (r=0)-->(?)---/ [if r < t] 4866 // | 4867 // [if r >= f] \----> ... 4868 // 4869 4870 // 15.10.2.5 RepeatMatcher algorithm. 4871 // The parser has already eliminated the case where max is 0. In the case 4872 // where max_match is zero the parser has removed the quantifier if min was 4873 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 4874 4875 // If we know that we cannot match zero length then things are a little 4876 // simpler since we don't need to make the special zero length match check 4877 // from step 2.1. If the min and max are small we can unroll a little in 4878 // this case. 4879 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 4880 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 4881 if (max == 0) return on_success; // This can happen due to recursion. 4882 bool body_can_be_empty = (body->min_match() == 0); 4883 int body_start_reg = RegExpCompiler::kNoRegister; 4884 Interval capture_registers = body->CaptureRegisters(); 4885 bool needs_capture_clearing = !capture_registers.is_empty(); 4886 Zone* zone = compiler->zone(); 4887 4888 if (body_can_be_empty) { 4889 body_start_reg = compiler->AllocateRegister(); 4890 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 4891 // Only unroll if there are no captures and the body can't be 4892 // empty. 4893 { 4894 RegExpExpansionLimiter limiter( 4895 compiler, min + ((max != min) ? 1 : 0)); 4896 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 4897 int new_max = (max == kInfinity) ? max : max - min; 4898 // Recurse once to get the loop or optional matches after the fixed 4899 // ones. 4900 RegExpNode* answer = ToNode( 4901 0, new_max, is_greedy, body, compiler, on_success, true); 4902 // Unroll the forced matches from 0 to min. This can cause chains of 4903 // TextNodes (which the parser does not generate). These should be 4904 // combined if it turns out they hinder good code generation. 4905 for (int i = 0; i < min; i++) { 4906 answer = body->ToNode(compiler, answer); 4907 } 4908 return answer; 4909 } 4910 } 4911 if (max <= kMaxUnrolledMaxMatches && min == 0) { 4912 ASSERT(max > 0); // Due to the 'if' above. 4913 RegExpExpansionLimiter limiter(compiler, max); 4914 if (limiter.ok_to_expand()) { 4915 // Unroll the optional matches up to max. 4916 RegExpNode* answer = on_success; 4917 for (int i = 0; i < max; i++) { 4918 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 4919 if (is_greedy) { 4920 alternation->AddAlternative( 4921 GuardedAlternative(body->ToNode(compiler, answer))); 4922 alternation->AddAlternative(GuardedAlternative(on_success)); 4923 } else { 4924 alternation->AddAlternative(GuardedAlternative(on_success)); 4925 alternation->AddAlternative( 4926 GuardedAlternative(body->ToNode(compiler, answer))); 4927 } 4928 answer = alternation; 4929 if (not_at_start) alternation->set_not_at_start(); 4930 } 4931 return answer; 4932 } 4933 } 4934 } 4935 bool has_min = min > 0; 4936 bool has_max = max < RegExpTree::kInfinity; 4937 bool needs_counter = has_min || has_max; 4938 int reg_ctr = needs_counter 4939 ? compiler->AllocateRegister() 4940 : RegExpCompiler::kNoRegister; 4941 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0, 4942 zone); 4943 if (not_at_start) center->set_not_at_start(); 4944 RegExpNode* loop_return = needs_counter 4945 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 4946 : static_cast<RegExpNode*>(center); 4947 if (body_can_be_empty) { 4948 // If the body can be empty we need to check if it was and then 4949 // backtrack. 4950 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 4951 reg_ctr, 4952 min, 4953 loop_return); 4954 } 4955 RegExpNode* body_node = body->ToNode(compiler, loop_return); 4956 if (body_can_be_empty) { 4957 // If the body can be empty we need to store the start position 4958 // so we can bail out if it was empty. 4959 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 4960 } 4961 if (needs_capture_clearing) { 4962 // Before entering the body of this loop we need to clear captures. 4963 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 4964 } 4965 GuardedAlternative body_alt(body_node); 4966 if (has_max) { 4967 Guard* body_guard = 4968 new(zone) Guard(reg_ctr, Guard::LT, max); 4969 body_alt.AddGuard(body_guard, zone); 4970 } 4971 GuardedAlternative rest_alt(on_success); 4972 if (has_min) { 4973 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 4974 rest_alt.AddGuard(rest_guard, zone); 4975 } 4976 if (is_greedy) { 4977 center->AddLoopAlternative(body_alt); 4978 center->AddContinueAlternative(rest_alt); 4979 } else { 4980 center->AddContinueAlternative(rest_alt); 4981 center->AddLoopAlternative(body_alt); 4982 } 4983 if (needs_counter) { 4984 return ActionNode::SetRegister(reg_ctr, 0, center); 4985 } else { 4986 return center; 4987 } 4988 } 4989 4990 4991 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 4992 RegExpNode* on_success) { 4993 NodeInfo info; 4994 Zone* zone = compiler->zone(); 4995 4996 switch (assertion_type()) { 4997 case START_OF_LINE: 4998 return AssertionNode::AfterNewline(on_success); 4999 case START_OF_INPUT: 5000 return AssertionNode::AtStart(on_success); 5001 case BOUNDARY: 5002 return AssertionNode::AtBoundary(on_success); 5003 case NON_BOUNDARY: 5004 return AssertionNode::AtNonBoundary(on_success); 5005 case END_OF_INPUT: 5006 return AssertionNode::AtEnd(on_success); 5007 case END_OF_LINE: { 5008 // Compile $ in multiline regexps as an alternation with a positive 5009 // lookahead in one side and an end-of-input on the other side. 5010 // We need two registers for the lookahead. 5011 int stack_pointer_register = compiler->AllocateRegister(); 5012 int position_register = compiler->AllocateRegister(); 5013 // The ChoiceNode to distinguish between a newline and end-of-input. 5014 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5015 // Create a newline atom. 5016 ZoneList<CharacterRange>* newline_ranges = 5017 new(zone) ZoneList<CharacterRange>(3, zone); 5018 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5019 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n'); 5020 TextNode* newline_matcher = new(zone) TextNode( 5021 newline_atom, 5022 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5023 position_register, 5024 0, // No captures inside. 5025 -1, // Ignored if no captures. 5026 on_success)); 5027 // Create an end-of-input matcher. 5028 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5029 stack_pointer_register, 5030 position_register, 5031 newline_matcher); 5032 // Add the two alternatives to the ChoiceNode. 5033 GuardedAlternative eol_alternative(end_of_line); 5034 result->AddAlternative(eol_alternative); 5035 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5036 result->AddAlternative(end_alternative); 5037 return result; 5038 } 5039 default: 5040 UNREACHABLE(); 5041 } 5042 return on_success; 5043 } 5044 5045 5046 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5047 RegExpNode* on_success) { 5048 return new(compiler->zone()) 5049 BackReferenceNode(RegExpCapture::StartRegister(index()), 5050 RegExpCapture::EndRegister(index()), 5051 on_success); 5052 } 5053 5054 5055 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5056 RegExpNode* on_success) { 5057 return on_success; 5058 } 5059 5060 5061 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 5062 RegExpNode* on_success) { 5063 int stack_pointer_register = compiler->AllocateRegister(); 5064 int position_register = compiler->AllocateRegister(); 5065 5066 const int registers_per_capture = 2; 5067 const int register_of_first_capture = 2; 5068 int register_count = capture_count_ * registers_per_capture; 5069 int register_start = 5070 register_of_first_capture + capture_from_ * registers_per_capture; 5071 5072 RegExpNode* success; 5073 if (is_positive()) { 5074 RegExpNode* node = ActionNode::BeginSubmatch( 5075 stack_pointer_register, 5076 position_register, 5077 body()->ToNode( 5078 compiler, 5079 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5080 position_register, 5081 register_count, 5082 register_start, 5083 on_success))); 5084 return node; 5085 } else { 5086 // We use a ChoiceNode for a negative lookahead because it has most of 5087 // the characteristics we need. It has the body of the lookahead as its 5088 // first alternative and the expression after the lookahead of the second 5089 // alternative. If the first alternative succeeds then the 5090 // NegativeSubmatchSuccess will unwind the stack including everything the 5091 // choice node set up and backtrack. If the first alternative fails then 5092 // the second alternative is tried, which is exactly the desired result 5093 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 5094 // ChoiceNode that knows to ignore the first exit when calculating quick 5095 // checks. 5096 Zone* zone = compiler->zone(); 5097 5098 GuardedAlternative body_alt( 5099 body()->ToNode( 5100 compiler, 5101 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register, 5102 position_register, 5103 register_count, 5104 register_start, 5105 zone))); 5106 ChoiceNode* choice_node = 5107 new(zone) NegativeLookaheadChoiceNode(body_alt, 5108 GuardedAlternative(on_success), 5109 zone); 5110 return ActionNode::BeginSubmatch(stack_pointer_register, 5111 position_register, 5112 choice_node); 5113 } 5114 } 5115 5116 5117 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5118 RegExpNode* on_success) { 5119 return ToNode(body(), index(), compiler, on_success); 5120 } 5121 5122 5123 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5124 int index, 5125 RegExpCompiler* compiler, 5126 RegExpNode* on_success) { 5127 int start_reg = RegExpCapture::StartRegister(index); 5128 int end_reg = RegExpCapture::EndRegister(index); 5129 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5130 RegExpNode* body_node = body->ToNode(compiler, store_end); 5131 return ActionNode::StorePosition(start_reg, true, body_node); 5132 } 5133 5134 5135 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5136 RegExpNode* on_success) { 5137 ZoneList<RegExpTree*>* children = nodes(); 5138 RegExpNode* current = on_success; 5139 for (int i = children->length() - 1; i >= 0; i--) { 5140 current = children->at(i)->ToNode(compiler, current); 5141 } 5142 return current; 5143 } 5144 5145 5146 static void AddClass(const int* elmv, 5147 int elmc, 5148 ZoneList<CharacterRange>* ranges, 5149 Zone* zone) { 5150 elmc--; 5151 ASSERT(elmv[elmc] == 0x10000); 5152 for (int i = 0; i < elmc; i += 2) { 5153 ASSERT(elmv[i] < elmv[i + 1]); 5154 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone); 5155 } 5156 } 5157 5158 5159 static void AddClassNegated(const int *elmv, 5160 int elmc, 5161 ZoneList<CharacterRange>* ranges, 5162 Zone* zone) { 5163 elmc--; 5164 ASSERT(elmv[elmc] == 0x10000); 5165 ASSERT(elmv[0] != 0x0000); 5166 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 5167 uc16 last = 0x0000; 5168 for (int i = 0; i < elmc; i += 2) { 5169 ASSERT(last <= elmv[i] - 1); 5170 ASSERT(elmv[i] < elmv[i + 1]); 5171 ranges->Add(CharacterRange(last, elmv[i] - 1), zone); 5172 last = elmv[i + 1]; 5173 } 5174 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone); 5175 } 5176 5177 5178 void CharacterRange::AddClassEscape(uc16 type, 5179 ZoneList<CharacterRange>* ranges, 5180 Zone* zone) { 5181 switch (type) { 5182 case 's': 5183 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5184 break; 5185 case 'S': 5186 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5187 break; 5188 case 'w': 5189 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5190 break; 5191 case 'W': 5192 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5193 break; 5194 case 'd': 5195 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5196 break; 5197 case 'D': 5198 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5199 break; 5200 case '.': 5201 AddClassNegated(kLineTerminatorRanges, 5202 kLineTerminatorRangeCount, 5203 ranges, 5204 zone); 5205 break; 5206 // This is not a character range as defined by the spec but a 5207 // convenient shorthand for a character class that matches any 5208 // character. 5209 case '*': 5210 ranges->Add(CharacterRange::Everything(), zone); 5211 break; 5212 // This is the set of characters matched by the $ and ^ symbols 5213 // in multiline mode. 5214 case 'n': 5215 AddClass(kLineTerminatorRanges, 5216 kLineTerminatorRangeCount, 5217 ranges, 5218 zone); 5219 break; 5220 default: 5221 UNREACHABLE(); 5222 } 5223 } 5224 5225 5226 Vector<const int> CharacterRange::GetWordBounds() { 5227 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5228 } 5229 5230 5231 class CharacterRangeSplitter { 5232 public: 5233 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 5234 ZoneList<CharacterRange>** excluded, 5235 Zone* zone) 5236 : included_(included), 5237 excluded_(excluded), 5238 zone_(zone) { } 5239 void Call(uc16 from, DispatchTable::Entry entry); 5240 5241 static const int kInBase = 0; 5242 static const int kInOverlay = 1; 5243 5244 private: 5245 ZoneList<CharacterRange>** included_; 5246 ZoneList<CharacterRange>** excluded_; 5247 Zone* zone_; 5248 }; 5249 5250 5251 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 5252 if (!entry.out_set()->Get(kInBase)) return; 5253 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 5254 ? included_ 5255 : excluded_; 5256 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_); 5257 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_); 5258 } 5259 5260 5261 void CharacterRange::Split(ZoneList<CharacterRange>* base, 5262 Vector<const int> overlay, 5263 ZoneList<CharacterRange>** included, 5264 ZoneList<CharacterRange>** excluded, 5265 Zone* zone) { 5266 ASSERT_EQ(NULL, *included); 5267 ASSERT_EQ(NULL, *excluded); 5268 DispatchTable table(zone); 5269 for (int i = 0; i < base->length(); i++) 5270 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone); 5271 for (int i = 0; i < overlay.length(); i += 2) { 5272 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), 5273 CharacterRangeSplitter::kInOverlay, zone); 5274 } 5275 CharacterRangeSplitter callback(included, excluded, zone); 5276 table.ForEach(&callback); 5277 } 5278 5279 5280 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 5281 bool is_ascii, 5282 Zone* zone) { 5283 Isolate* isolate = zone->isolate(); 5284 uc16 bottom = from(); 5285 uc16 top = to(); 5286 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { 5287 if (bottom > String::kMaxOneByteCharCode) return; 5288 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5289 } 5290 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5291 if (top == bottom) { 5292 // If this is a singleton we just expand the one character. 5293 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5294 for (int i = 0; i < length; i++) { 5295 uc32 chr = chars[i]; 5296 if (chr != bottom) { 5297 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5298 } 5299 } 5300 } else { 5301 // If this is a range we expand the characters block by block, 5302 // expanding contiguous subranges (blocks) one at a time. 5303 // The approach is as follows. For a given start character we 5304 // look up the remainder of the block that contains it (represented 5305 // by the end point), for instance we find 'z' if the character 5306 // is 'c'. A block is characterized by the property 5307 // that all characters uncanonicalize in the same way, except that 5308 // each entry in the result is incremented by the distance from the first 5309 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 5310 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 5311 // Once we've found the end point we look up its uncanonicalization 5312 // and produce a range for each element. For instance for [c-f] 5313 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 5314 // add a range if it is not already contained in the input, so [c-f] 5315 // will be skipped but [C-F] will be added. If this range is not 5316 // completely contained in a block we do this for all the blocks 5317 // covered by the range (handling characters that is not in a block 5318 // as a "singleton block"). 5319 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5320 int pos = bottom; 5321 while (pos <= top) { 5322 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 5323 uc16 block_end; 5324 if (length == 0) { 5325 block_end = pos; 5326 } else { 5327 ASSERT_EQ(1, length); 5328 block_end = range[0]; 5329 } 5330 int end = (block_end > top) ? top : block_end; 5331 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 5332 for (int i = 0; i < length; i++) { 5333 uc32 c = range[i]; 5334 uc16 range_from = c - (block_end - pos); 5335 uc16 range_to = c - (block_end - end); 5336 if (!(bottom <= range_from && range_to <= top)) { 5337 ranges->Add(CharacterRange(range_from, range_to), zone); 5338 } 5339 } 5340 pos = end + 1; 5341 } 5342 } 5343 } 5344 5345 5346 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5347 ASSERT_NOT_NULL(ranges); 5348 int n = ranges->length(); 5349 if (n <= 1) return true; 5350 int max = ranges->at(0).to(); 5351 for (int i = 1; i < n; i++) { 5352 CharacterRange next_range = ranges->at(i); 5353 if (next_range.from() <= max + 1) return false; 5354 max = next_range.to(); 5355 } 5356 return true; 5357 } 5358 5359 5360 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5361 if (ranges_ == NULL) { 5362 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5363 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5364 } 5365 return ranges_; 5366 } 5367 5368 5369 // Move a number of elements in a zonelist to another position 5370 // in the same list. Handles overlapping source and target areas. 5371 static void MoveRanges(ZoneList<CharacterRange>* list, 5372 int from, 5373 int to, 5374 int count) { 5375 // Ranges are potentially overlapping. 5376 if (from < to) { 5377 for (int i = count - 1; i >= 0; i--) { 5378 list->at(to + i) = list->at(from + i); 5379 } 5380 } else { 5381 for (int i = 0; i < count; i++) { 5382 list->at(to + i) = list->at(from + i); 5383 } 5384 } 5385 } 5386 5387 5388 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5389 int count, 5390 CharacterRange insert) { 5391 // Inserts a range into list[0..count[, which must be sorted 5392 // by from value and non-overlapping and non-adjacent, using at most 5393 // list[0..count] for the result. Returns the number of resulting 5394 // canonicalized ranges. Inserting a range may collapse existing ranges into 5395 // fewer ranges, so the return value can be anything in the range 1..count+1. 5396 uc16 from = insert.from(); 5397 uc16 to = insert.to(); 5398 int start_pos = 0; 5399 int end_pos = count; 5400 for (int i = count - 1; i >= 0; i--) { 5401 CharacterRange current = list->at(i); 5402 if (current.from() > to + 1) { 5403 end_pos = i; 5404 } else if (current.to() + 1 < from) { 5405 start_pos = i + 1; 5406 break; 5407 } 5408 } 5409 5410 // Inserted range overlaps, or is adjacent to, ranges at positions 5411 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 5412 // not affected by the insertion. 5413 // If start_pos == end_pos, the range must be inserted before start_pos. 5414 // if start_pos < end_pos, the entire range from start_pos to end_pos 5415 // must be merged with the insert range. 5416 5417 if (start_pos == end_pos) { 5418 // Insert between existing ranges at position start_pos. 5419 if (start_pos < count) { 5420 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 5421 } 5422 list->at(start_pos) = insert; 5423 return count + 1; 5424 } 5425 if (start_pos + 1 == end_pos) { 5426 // Replace single existing range at position start_pos. 5427 CharacterRange to_replace = list->at(start_pos); 5428 int new_from = Min(to_replace.from(), from); 5429 int new_to = Max(to_replace.to(), to); 5430 list->at(start_pos) = CharacterRange(new_from, new_to); 5431 return count; 5432 } 5433 // Replace a number of existing ranges from start_pos to end_pos - 1. 5434 // Move the remaining ranges down. 5435 5436 int new_from = Min(list->at(start_pos).from(), from); 5437 int new_to = Max(list->at(end_pos - 1).to(), to); 5438 if (end_pos < count) { 5439 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 5440 } 5441 list->at(start_pos) = CharacterRange(new_from, new_to); 5442 return count - (end_pos - start_pos) + 1; 5443 } 5444 5445 5446 void CharacterSet::Canonicalize() { 5447 // Special/default classes are always considered canonical. The result 5448 // of calling ranges() will be sorted. 5449 if (ranges_ == NULL) return; 5450 CharacterRange::Canonicalize(ranges_); 5451 } 5452 5453 5454 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 5455 if (character_ranges->length() <= 1) return; 5456 // Check whether ranges are already canonical (increasing, non-overlapping, 5457 // non-adjacent). 5458 int n = character_ranges->length(); 5459 int max = character_ranges->at(0).to(); 5460 int i = 1; 5461 while (i < n) { 5462 CharacterRange current = character_ranges->at(i); 5463 if (current.from() <= max + 1) { 5464 break; 5465 } 5466 max = current.to(); 5467 i++; 5468 } 5469 // Canonical until the i'th range. If that's all of them, we are done. 5470 if (i == n) return; 5471 5472 // The ranges at index i and forward are not canonicalized. Make them so by 5473 // doing the equivalent of insertion sort (inserting each into the previous 5474 // list, in order). 5475 // Notice that inserting a range can reduce the number of ranges in the 5476 // result due to combining of adjacent and overlapping ranges. 5477 int read = i; // Range to insert. 5478 int num_canonical = i; // Length of canonicalized part of list. 5479 do { 5480 num_canonical = InsertRangeInCanonicalList(character_ranges, 5481 num_canonical, 5482 character_ranges->at(read)); 5483 read++; 5484 } while (read < n); 5485 character_ranges->Rewind(num_canonical); 5486 5487 ASSERT(CharacterRange::IsCanonical(character_ranges)); 5488 } 5489 5490 5491 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5492 ZoneList<CharacterRange>* negated_ranges, 5493 Zone* zone) { 5494 ASSERT(CharacterRange::IsCanonical(ranges)); 5495 ASSERT_EQ(0, negated_ranges->length()); 5496 int range_count = ranges->length(); 5497 uc16 from = 0; 5498 int i = 0; 5499 if (range_count > 0 && ranges->at(0).from() == 0) { 5500 from = ranges->at(0).to(); 5501 i = 1; 5502 } 5503 while (i < range_count) { 5504 CharacterRange range = ranges->at(i); 5505 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone); 5506 from = range.to(); 5507 i++; 5508 } 5509 if (from < String::kMaxUtf16CodeUnit) { 5510 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit), 5511 zone); 5512 } 5513 } 5514 5515 5516 // ------------------------------------------------------------------- 5517 // Splay tree 5518 5519 5520 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 5521 if (Get(value)) 5522 return this; 5523 if (successors(zone) != NULL) { 5524 for (int i = 0; i < successors(zone)->length(); i++) { 5525 OutSet* successor = successors(zone)->at(i); 5526 if (successor->Get(value)) 5527 return successor; 5528 } 5529 } else { 5530 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 5531 } 5532 OutSet* result = new(zone) OutSet(first_, remaining_); 5533 result->Set(value, zone); 5534 successors(zone)->Add(result, zone); 5535 return result; 5536 } 5537 5538 5539 void OutSet::Set(unsigned value, Zone *zone) { 5540 if (value < kFirstLimit) { 5541 first_ |= (1 << value); 5542 } else { 5543 if (remaining_ == NULL) 5544 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 5545 if (remaining_->is_empty() || !remaining_->Contains(value)) 5546 remaining_->Add(value, zone); 5547 } 5548 } 5549 5550 5551 bool OutSet::Get(unsigned value) const { 5552 if (value < kFirstLimit) { 5553 return (first_ & (1 << value)) != 0; 5554 } else if (remaining_ == NULL) { 5555 return false; 5556 } else { 5557 return remaining_->Contains(value); 5558 } 5559 } 5560 5561 5562 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 5563 5564 5565 void DispatchTable::AddRange(CharacterRange full_range, int value, 5566 Zone* zone) { 5567 CharacterRange current = full_range; 5568 if (tree()->is_empty()) { 5569 // If this is the first range we just insert into the table. 5570 ZoneSplayTree<Config>::Locator loc; 5571 ASSERT_RESULT(tree()->Insert(current.from(), &loc)); 5572 loc.set_value(Entry(current.from(), current.to(), 5573 empty()->Extend(value, zone))); 5574 return; 5575 } 5576 // First see if there is a range to the left of this one that 5577 // overlaps. 5578 ZoneSplayTree<Config>::Locator loc; 5579 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 5580 Entry* entry = &loc.value(); 5581 // If we've found a range that overlaps with this one, and it 5582 // starts strictly to the left of this one, we have to fix it 5583 // because the following code only handles ranges that start on 5584 // or after the start point of the range we're adding. 5585 if (entry->from() < current.from() && entry->to() >= current.from()) { 5586 // Snap the overlapping range in half around the start point of 5587 // the range we're adding. 5588 CharacterRange left(entry->from(), current.from() - 1); 5589 CharacterRange right(current.from(), entry->to()); 5590 // The left part of the overlapping range doesn't overlap. 5591 // Truncate the whole entry to be just the left part. 5592 entry->set_to(left.to()); 5593 // The right part is the one that overlaps. We add this part 5594 // to the map and let the next step deal with merging it with 5595 // the range we're adding. 5596 ZoneSplayTree<Config>::Locator loc; 5597 ASSERT_RESULT(tree()->Insert(right.from(), &loc)); 5598 loc.set_value(Entry(right.from(), 5599 right.to(), 5600 entry->out_set())); 5601 } 5602 } 5603 while (current.is_valid()) { 5604 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 5605 (loc.value().from() <= current.to()) && 5606 (loc.value().to() >= current.from())) { 5607 Entry* entry = &loc.value(); 5608 // We have overlap. If there is space between the start point of 5609 // the range we're adding and where the overlapping range starts 5610 // then we have to add a range covering just that space. 5611 if (current.from() < entry->from()) { 5612 ZoneSplayTree<Config>::Locator ins; 5613 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 5614 ins.set_value(Entry(current.from(), 5615 entry->from() - 1, 5616 empty()->Extend(value, zone))); 5617 current.set_from(entry->from()); 5618 } 5619 ASSERT_EQ(current.from(), entry->from()); 5620 // If the overlapping range extends beyond the one we want to add 5621 // we have to snap the right part off and add it separately. 5622 if (entry->to() > current.to()) { 5623 ZoneSplayTree<Config>::Locator ins; 5624 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); 5625 ins.set_value(Entry(current.to() + 1, 5626 entry->to(), 5627 entry->out_set())); 5628 entry->set_to(current.to()); 5629 } 5630 ASSERT(entry->to() <= current.to()); 5631 // The overlapping range is now completely contained by the range 5632 // we're adding so we can just update it and move the start point 5633 // of the range we're adding just past it. 5634 entry->AddValue(value, zone); 5635 // Bail out if the last interval ended at 0xFFFF since otherwise 5636 // adding 1 will wrap around to 0. 5637 if (entry->to() == String::kMaxUtf16CodeUnit) 5638 break; 5639 ASSERT(entry->to() + 1 > current.from()); 5640 current.set_from(entry->to() + 1); 5641 } else { 5642 // There is no overlap so we can just add the range 5643 ZoneSplayTree<Config>::Locator ins; 5644 ASSERT_RESULT(tree()->Insert(current.from(), &ins)); 5645 ins.set_value(Entry(current.from(), 5646 current.to(), 5647 empty()->Extend(value, zone))); 5648 break; 5649 } 5650 } 5651 } 5652 5653 5654 OutSet* DispatchTable::Get(uc16 value) { 5655 ZoneSplayTree<Config>::Locator loc; 5656 if (!tree()->FindGreatestLessThan(value, &loc)) 5657 return empty(); 5658 Entry* entry = &loc.value(); 5659 if (value <= entry->to()) 5660 return entry->out_set(); 5661 else 5662 return empty(); 5663 } 5664 5665 5666 // ------------------------------------------------------------------- 5667 // Analysis 5668 5669 5670 void Analysis::EnsureAnalyzed(RegExpNode* that) { 5671 StackLimitCheck check(that->zone()->isolate()); 5672 if (check.HasOverflowed()) { 5673 fail("Stack overflow"); 5674 return; 5675 } 5676 if (that->info()->been_analyzed || that->info()->being_analyzed) 5677 return; 5678 that->info()->being_analyzed = true; 5679 that->Accept(this); 5680 that->info()->being_analyzed = false; 5681 that->info()->been_analyzed = true; 5682 } 5683 5684 5685 void Analysis::VisitEnd(EndNode* that) { 5686 // nothing to do 5687 } 5688 5689 5690 void TextNode::CalculateOffsets() { 5691 int element_count = elements()->length(); 5692 // Set up the offsets of the elements relative to the start. This is a fixed 5693 // quantity since a TextNode can only contain fixed-width things. 5694 int cp_offset = 0; 5695 for (int i = 0; i < element_count; i++) { 5696 TextElement& elm = elements()->at(i); 5697 elm.set_cp_offset(cp_offset); 5698 cp_offset += elm.length(); 5699 } 5700 } 5701 5702 5703 void Analysis::VisitText(TextNode* that) { 5704 if (ignore_case_) { 5705 that->MakeCaseIndependent(is_ascii_); 5706 } 5707 EnsureAnalyzed(that->on_success()); 5708 if (!has_failed()) { 5709 that->CalculateOffsets(); 5710 } 5711 } 5712 5713 5714 void Analysis::VisitAction(ActionNode* that) { 5715 RegExpNode* target = that->on_success(); 5716 EnsureAnalyzed(target); 5717 if (!has_failed()) { 5718 // If the next node is interested in what it follows then this node 5719 // has to be interested too so it can pass the information on. 5720 that->info()->AddFromFollowing(target->info()); 5721 } 5722 } 5723 5724 5725 void Analysis::VisitChoice(ChoiceNode* that) { 5726 NodeInfo* info = that->info(); 5727 for (int i = 0; i < that->alternatives()->length(); i++) { 5728 RegExpNode* node = that->alternatives()->at(i).node(); 5729 EnsureAnalyzed(node); 5730 if (has_failed()) return; 5731 // Anything the following nodes need to know has to be known by 5732 // this node also, so it can pass it on. 5733 info->AddFromFollowing(node->info()); 5734 } 5735 } 5736 5737 5738 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 5739 NodeInfo* info = that->info(); 5740 for (int i = 0; i < that->alternatives()->length(); i++) { 5741 RegExpNode* node = that->alternatives()->at(i).node(); 5742 if (node != that->loop_node()) { 5743 EnsureAnalyzed(node); 5744 if (has_failed()) return; 5745 info->AddFromFollowing(node->info()); 5746 } 5747 } 5748 // Check the loop last since it may need the value of this node 5749 // to get a correct result. 5750 EnsureAnalyzed(that->loop_node()); 5751 if (!has_failed()) { 5752 info->AddFromFollowing(that->loop_node()->info()); 5753 } 5754 } 5755 5756 5757 void Analysis::VisitBackReference(BackReferenceNode* that) { 5758 EnsureAnalyzed(that->on_success()); 5759 } 5760 5761 5762 void Analysis::VisitAssertion(AssertionNode* that) { 5763 EnsureAnalyzed(that->on_success()); 5764 } 5765 5766 5767 void BackReferenceNode::FillInBMInfo(int offset, 5768 int budget, 5769 BoyerMooreLookahead* bm, 5770 bool not_at_start) { 5771 // Working out the set of characters that a backreference can match is too 5772 // hard, so we just say that any character can match. 5773 bm->SetRest(offset); 5774 SaveBMInfo(bm, not_at_start, offset); 5775 } 5776 5777 5778 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 5779 RegExpMacroAssembler::kTableSize); 5780 5781 5782 void ChoiceNode::FillInBMInfo(int offset, 5783 int budget, 5784 BoyerMooreLookahead* bm, 5785 bool not_at_start) { 5786 ZoneList<GuardedAlternative>* alts = alternatives(); 5787 budget = (budget - 1) / alts->length(); 5788 for (int i = 0; i < alts->length(); i++) { 5789 GuardedAlternative& alt = alts->at(i); 5790 if (alt.guards() != NULL && alt.guards()->length() != 0) { 5791 bm->SetRest(offset); // Give up trying to fill in info. 5792 SaveBMInfo(bm, not_at_start, offset); 5793 return; 5794 } 5795 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start); 5796 } 5797 SaveBMInfo(bm, not_at_start, offset); 5798 } 5799 5800 5801 void TextNode::FillInBMInfo(int initial_offset, 5802 int budget, 5803 BoyerMooreLookahead* bm, 5804 bool not_at_start) { 5805 if (initial_offset >= bm->length()) return; 5806 int offset = initial_offset; 5807 int max_char = bm->max_char(); 5808 for (int i = 0; i < elements()->length(); i++) { 5809 if (offset >= bm->length()) { 5810 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5811 return; 5812 } 5813 TextElement text = elements()->at(i); 5814 if (text.text_type() == TextElement::ATOM) { 5815 RegExpAtom* atom = text.atom(); 5816 for (int j = 0; j < atom->length(); j++, offset++) { 5817 if (offset >= bm->length()) { 5818 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5819 return; 5820 } 5821 uc16 character = atom->data()[j]; 5822 if (bm->compiler()->ignore_case()) { 5823 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5824 int length = GetCaseIndependentLetters( 5825 Isolate::Current(), 5826 character, 5827 bm->max_char() == String::kMaxOneByteCharCode, 5828 chars); 5829 for (int j = 0; j < length; j++) { 5830 bm->Set(offset, chars[j]); 5831 } 5832 } else { 5833 if (character <= max_char) bm->Set(offset, character); 5834 } 5835 } 5836 } else { 5837 ASSERT_EQ(TextElement::CHAR_CLASS, text.text_type()); 5838 RegExpCharacterClass* char_class = text.char_class(); 5839 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 5840 if (char_class->is_negated()) { 5841 bm->SetAll(offset); 5842 } else { 5843 for (int k = 0; k < ranges->length(); k++) { 5844 CharacterRange& range = ranges->at(k); 5845 if (range.from() > max_char) continue; 5846 int to = Min(max_char, static_cast<int>(range.to())); 5847 bm->SetInterval(offset, Interval(range.from(), to)); 5848 } 5849 } 5850 offset++; 5851 } 5852 } 5853 if (offset >= bm->length()) { 5854 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5855 return; 5856 } 5857 on_success()->FillInBMInfo(offset, 5858 budget - 1, 5859 bm, 5860 true); // Not at start after a text node. 5861 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5862 } 5863 5864 5865 // ------------------------------------------------------------------- 5866 // Dispatch table construction 5867 5868 5869 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5870 AddRange(CharacterRange::Everything()); 5871 } 5872 5873 5874 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5875 node->set_being_calculated(true); 5876 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5877 for (int i = 0; i < alternatives->length(); i++) { 5878 set_choice_index(i); 5879 alternatives->at(i).node()->Accept(this); 5880 } 5881 node->set_being_calculated(false); 5882 } 5883 5884 5885 class AddDispatchRange { 5886 public: 5887 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5888 : constructor_(constructor) { } 5889 void Call(uc32 from, DispatchTable::Entry entry); 5890 private: 5891 DispatchTableConstructor* constructor_; 5892 }; 5893 5894 5895 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5896 CharacterRange range(from, entry.to()); 5897 constructor_->AddRange(range); 5898 } 5899 5900 5901 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5902 if (node->being_calculated()) 5903 return; 5904 DispatchTable* table = node->GetTable(ignore_case_); 5905 AddDispatchRange adder(this); 5906 table->ForEach(&adder); 5907 } 5908 5909 5910 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5911 // TODO(160): Find the node that we refer back to and propagate its start 5912 // set back to here. For now we just accept anything. 5913 AddRange(CharacterRange::Everything()); 5914 } 5915 5916 5917 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5918 RegExpNode* target = that->on_success(); 5919 target->Accept(this); 5920 } 5921 5922 5923 static int CompareRangeByFrom(const CharacterRange* a, 5924 const CharacterRange* b) { 5925 return Compare<uc16>(a->from(), b->from()); 5926 } 5927 5928 5929 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5930 ranges->Sort(CompareRangeByFrom); 5931 uc16 last = 0; 5932 for (int i = 0; i < ranges->length(); i++) { 5933 CharacterRange range = ranges->at(i); 5934 if (last < range.from()) 5935 AddRange(CharacterRange(last, range.from() - 1)); 5936 if (range.to() >= last) { 5937 if (range.to() == String::kMaxUtf16CodeUnit) { 5938 return; 5939 } else { 5940 last = range.to() + 1; 5941 } 5942 } 5943 } 5944 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit)); 5945 } 5946 5947 5948 void DispatchTableConstructor::VisitText(TextNode* that) { 5949 TextElement elm = that->elements()->at(0); 5950 switch (elm.text_type()) { 5951 case TextElement::ATOM: { 5952 uc16 c = elm.atom()->data()[0]; 5953 AddRange(CharacterRange(c, c)); 5954 break; 5955 } 5956 case TextElement::CHAR_CLASS: { 5957 RegExpCharacterClass* tree = elm.char_class(); 5958 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 5959 if (tree->is_negated()) { 5960 AddInverse(ranges); 5961 } else { 5962 for (int i = 0; i < ranges->length(); i++) 5963 AddRange(ranges->at(i)); 5964 } 5965 break; 5966 } 5967 default: { 5968 UNIMPLEMENTED(); 5969 } 5970 } 5971 } 5972 5973 5974 void DispatchTableConstructor::VisitAction(ActionNode* that) { 5975 RegExpNode* target = that->on_success(); 5976 target->Accept(this); 5977 } 5978 5979 5980 RegExpEngine::CompilationResult RegExpEngine::Compile( 5981 RegExpCompileData* data, 5982 bool ignore_case, 5983 bool is_global, 5984 bool is_multiline, 5985 Handle<String> pattern, 5986 Handle<String> sample_subject, 5987 bool is_ascii, 5988 Zone* zone) { 5989 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 5990 return IrregexpRegExpTooBig(zone->isolate()); 5991 } 5992 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone); 5993 5994 // Sample some characters from the middle of the string. 5995 static const int kSampleSize = 128; 5996 5997 sample_subject = String::Flatten(sample_subject); 5998 int chars_sampled = 0; 5999 int half_way = (sample_subject->length() - kSampleSize) / 2; 6000 for (int i = Max(0, half_way); 6001 i < sample_subject->length() && chars_sampled < kSampleSize; 6002 i++, chars_sampled++) { 6003 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6004 } 6005 6006 // Wrap the body of the regexp in capture #0. 6007 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6008 0, 6009 &compiler, 6010 compiler.accept()); 6011 RegExpNode* node = captured_body; 6012 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6013 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6014 int max_length = data->tree->max_match(); 6015 if (!is_start_anchored) { 6016 // Add a .*? at the beginning, outside the body capture, unless 6017 // this expression is anchored at the beginning. 6018 RegExpNode* loop_node = 6019 RegExpQuantifier::ToNode(0, 6020 RegExpTree::kInfinity, 6021 false, 6022 new(zone) RegExpCharacterClass('*'), 6023 &compiler, 6024 captured_body, 6025 data->contains_anchor); 6026 6027 if (data->contains_anchor) { 6028 // Unroll loop once, to take care of the case that might start 6029 // at the start of input. 6030 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6031 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6032 first_step_node->AddAlternative(GuardedAlternative( 6033 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); 6034 node = first_step_node; 6035 } else { 6036 node = loop_node; 6037 } 6038 } 6039 if (is_ascii) { 6040 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); 6041 // Do it again to propagate the new nodes to places where they were not 6042 // put because they had not been calculated yet. 6043 if (node != NULL) { 6044 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); 6045 } 6046 } 6047 6048 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6049 data->node = node; 6050 Analysis analysis(ignore_case, is_ascii); 6051 analysis.EnsureAnalyzed(node); 6052 if (analysis.has_failed()) { 6053 const char* error_message = analysis.error_message(); 6054 return CompilationResult(zone->isolate(), error_message); 6055 } 6056 6057 // Create the correct assembler for the architecture. 6058 #ifndef V8_INTERPRETED_REGEXP 6059 // Native regexp implementation. 6060 6061 NativeRegExpMacroAssembler::Mode mode = 6062 is_ascii ? NativeRegExpMacroAssembler::ASCII 6063 : NativeRegExpMacroAssembler::UC16; 6064 6065 #if V8_TARGET_ARCH_IA32 6066 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, 6067 zone); 6068 #elif V8_TARGET_ARCH_X64 6069 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, 6070 zone); 6071 #elif V8_TARGET_ARCH_ARM 6072 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, 6073 zone); 6074 #elif V8_TARGET_ARCH_ARM64 6075 RegExpMacroAssemblerARM64 macro_assembler(mode, (data->capture_count + 1) * 2, 6076 zone); 6077 #elif V8_TARGET_ARCH_MIPS 6078 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2, 6079 zone); 6080 #elif V8_TARGET_ARCH_X87 6081 RegExpMacroAssemblerX87 macro_assembler(mode, (data->capture_count + 1) * 2, 6082 zone); 6083 #else 6084 #error "Unsupported architecture" 6085 #endif 6086 6087 #else // V8_INTERPRETED_REGEXP 6088 // Interpreted regexp implementation. 6089 EmbeddedVector<byte, 1024> codes; 6090 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone); 6091 #endif // V8_INTERPRETED_REGEXP 6092 6093 // Inserted here, instead of in Assembler, because it depends on information 6094 // in the AST that isn't replicated in the Node structure. 6095 static const int kMaxBacksearchLimit = 1024; 6096 if (is_end_anchored && 6097 !is_start_anchored && 6098 max_length < kMaxBacksearchLimit) { 6099 macro_assembler.SetCurrentPositionFromEnd(max_length); 6100 } 6101 6102 if (is_global) { 6103 macro_assembler.set_global_mode( 6104 (data->tree->min_match() > 0) 6105 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK 6106 : RegExpMacroAssembler::GLOBAL); 6107 } 6108 6109 return compiler.Assemble(¯o_assembler, 6110 node, 6111 data->capture_count, 6112 pattern); 6113 } 6114 6115 6116 }} // namespace v8::internal 6117