1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/v8.h" 6 7 #include "src/ast.h" 8 #include "src/base/platform/platform.h" 9 #include "src/compilation-cache.h" 10 #include "src/compiler.h" 11 #include "src/execution.h" 12 #include "src/factory.h" 13 #include "src/jsregexp-inl.h" 14 #include "src/jsregexp.h" 15 #include "src/ostreams.h" 16 #include "src/parser.h" 17 #include "src/regexp-macro-assembler.h" 18 #include "src/regexp-macro-assembler-irregexp.h" 19 #include "src/regexp-macro-assembler-tracer.h" 20 #include "src/regexp-stack.h" 21 #include "src/runtime.h" 22 #include "src/string-search.h" 23 24 #ifndef V8_INTERPRETED_REGEXP 25 #if V8_TARGET_ARCH_IA32 26 #include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT 27 #elif V8_TARGET_ARCH_X64 28 #include "src/x64/regexp-macro-assembler-x64.h" // NOLINT 29 #elif V8_TARGET_ARCH_ARM64 30 #include "src/arm64/regexp-macro-assembler-arm64.h" // NOLINT 31 #elif V8_TARGET_ARCH_ARM 32 #include "src/arm/regexp-macro-assembler-arm.h" // NOLINT 33 #elif V8_TARGET_ARCH_MIPS 34 #include "src/mips/regexp-macro-assembler-mips.h" // NOLINT 35 #elif V8_TARGET_ARCH_MIPS64 36 #include "src/mips64/regexp-macro-assembler-mips64.h" // NOLINT 37 #elif V8_TARGET_ARCH_X87 38 #include "src/x87/regexp-macro-assembler-x87.h" // NOLINT 39 #else 40 #error Unsupported target architecture. 41 #endif 42 #endif 43 44 #include "src/interpreter-irregexp.h" 45 46 47 namespace v8 { 48 namespace internal { 49 50 MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral( 51 Handle<JSFunction> constructor, 52 Handle<String> pattern, 53 Handle<String> flags) { 54 // Call the construct code with 2 arguments. 55 Handle<Object> argv[] = { pattern, flags }; 56 return Execution::New(constructor, arraysize(argv), argv); 57 } 58 59 60 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) { 61 int flags = JSRegExp::NONE; 62 for (int i = 0; i < str->length(); i++) { 63 switch (str->Get(i)) { 64 case 'i': 65 flags |= JSRegExp::IGNORE_CASE; 66 break; 67 case 'g': 68 flags |= JSRegExp::GLOBAL; 69 break; 70 case 'm': 71 flags |= JSRegExp::MULTILINE; 72 break; 73 case 'y': 74 if (FLAG_harmony_regexps) flags |= JSRegExp::STICKY; 75 break; 76 } 77 } 78 return JSRegExp::Flags(flags); 79 } 80 81 82 MUST_USE_RESULT 83 static inline MaybeHandle<Object> ThrowRegExpException( 84 Handle<JSRegExp> re, 85 Handle<String> pattern, 86 Handle<String> error_text, 87 const char* message) { 88 Isolate* isolate = re->GetIsolate(); 89 Factory* factory = isolate->factory(); 90 Handle<FixedArray> elements = factory->NewFixedArray(2); 91 elements->set(0, *pattern); 92 elements->set(1, *error_text); 93 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 94 Handle<Object> regexp_err; 95 THROW_NEW_ERROR(isolate, NewSyntaxError(message, array), Object); 96 } 97 98 99 ContainedInLattice AddRange(ContainedInLattice containment, 100 const int* ranges, 101 int ranges_length, 102 Interval new_range) { 103 DCHECK((ranges_length & 1) == 1); 104 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); 105 if (containment == kLatticeUnknown) return containment; 106 bool inside = false; 107 int last = 0; 108 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 109 // Consider the range from last to ranges[i]. 110 // We haven't got to the new range yet. 111 if (ranges[i] <= new_range.from()) continue; 112 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 113 // inclusive, but the values in ranges are not. 114 if (last <= new_range.from() && new_range.to() < ranges[i]) { 115 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 116 } 117 return kLatticeUnknown; 118 } 119 return containment; 120 } 121 122 123 // More makes code generation slower, less makes V8 benchmark score lower. 124 const int kMaxLookaheadForBoyerMoore = 8; 125 // In a 3-character pattern you can maximally step forwards 3 characters 126 // at a time, which is not always enough to pay for the extra logic. 127 const int kPatternTooShortForBoyerMoore = 2; 128 129 130 // Identifies the sort of regexps where the regexp engine is faster 131 // than the code used for atom matches. 132 static bool HasFewDifferentCharacters(Handle<String> pattern) { 133 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 134 if (length <= kPatternTooShortForBoyerMoore) return false; 135 const int kMod = 128; 136 bool character_found[kMod]; 137 int different = 0; 138 memset(&character_found[0], 0, sizeof(character_found)); 139 for (int i = 0; i < length; i++) { 140 int ch = (pattern->Get(i) & (kMod - 1)); 141 if (!character_found[ch]) { 142 character_found[ch] = true; 143 different++; 144 // We declare a regexp low-alphabet if it has at least 3 times as many 145 // characters as it has different characters. 146 if (different * 3 > length) return false; 147 } 148 } 149 return true; 150 } 151 152 153 // Generic RegExp methods. Dispatches to implementation specific methods. 154 155 156 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 157 Handle<String> pattern, 158 Handle<String> flag_str) { 159 Isolate* isolate = re->GetIsolate(); 160 Zone zone(isolate); 161 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 162 CompilationCache* compilation_cache = isolate->compilation_cache(); 163 MaybeHandle<FixedArray> maybe_cached = 164 compilation_cache->LookupRegExp(pattern, flags); 165 Handle<FixedArray> cached; 166 bool in_cache = maybe_cached.ToHandle(&cached); 167 LOG(isolate, RegExpCompileEvent(re, in_cache)); 168 169 Handle<Object> result; 170 if (in_cache) { 171 re->set_data(*cached); 172 return re; 173 } 174 pattern = String::Flatten(pattern); 175 PostponeInterruptsScope postpone(isolate); 176 RegExpCompileData parse_result; 177 FlatStringReader reader(isolate, pattern); 178 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 179 &parse_result, &zone)) { 180 // Throw an exception if we fail to parse the pattern. 181 return ThrowRegExpException(re, 182 pattern, 183 parse_result.error, 184 "malformed_regexp"); 185 } 186 187 bool has_been_compiled = false; 188 189 if (parse_result.simple && 190 !flags.is_ignore_case() && 191 !flags.is_sticky() && 192 !HasFewDifferentCharacters(pattern)) { 193 // Parse-tree is a single atom that is equal to the pattern. 194 AtomCompile(re, pattern, flags, pattern); 195 has_been_compiled = true; 196 } else if (parse_result.tree->IsAtom() && 197 !flags.is_ignore_case() && 198 !flags.is_sticky() && 199 parse_result.capture_count == 0) { 200 RegExpAtom* atom = parse_result.tree->AsAtom(); 201 Vector<const uc16> atom_pattern = atom->data(); 202 Handle<String> atom_string; 203 ASSIGN_RETURN_ON_EXCEPTION( 204 isolate, atom_string, 205 isolate->factory()->NewStringFromTwoByte(atom_pattern), 206 Object); 207 if (!HasFewDifferentCharacters(atom_string)) { 208 AtomCompile(re, pattern, flags, atom_string); 209 has_been_compiled = true; 210 } 211 } 212 if (!has_been_compiled) { 213 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 214 } 215 DCHECK(re->data()->IsFixedArray()); 216 // Compilation succeeded so the data is set on the regexp 217 // and we can store it in the cache. 218 Handle<FixedArray> data(FixedArray::cast(re->data())); 219 compilation_cache->PutRegExp(pattern, flags, data); 220 221 return re; 222 } 223 224 225 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 226 Handle<String> subject, 227 int index, 228 Handle<JSArray> last_match_info) { 229 switch (regexp->TypeTag()) { 230 case JSRegExp::ATOM: 231 return AtomExec(regexp, subject, index, last_match_info); 232 case JSRegExp::IRREGEXP: { 233 return IrregexpExec(regexp, subject, index, last_match_info); 234 } 235 default: 236 UNREACHABLE(); 237 return MaybeHandle<Object>(); 238 } 239 } 240 241 242 // RegExp Atom implementation: Simple string search using indexOf. 243 244 245 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 246 Handle<String> pattern, 247 JSRegExp::Flags flags, 248 Handle<String> match_pattern) { 249 re->GetIsolate()->factory()->SetRegExpAtomData(re, 250 JSRegExp::ATOM, 251 pattern, 252 flags, 253 match_pattern); 254 } 255 256 257 static void SetAtomLastCapture(FixedArray* array, 258 String* subject, 259 int from, 260 int to) { 261 SealHandleScope shs(array->GetIsolate()); 262 RegExpImpl::SetLastCaptureCount(array, 2); 263 RegExpImpl::SetLastSubject(array, subject); 264 RegExpImpl::SetLastInput(array, subject); 265 RegExpImpl::SetCapture(array, 0, from); 266 RegExpImpl::SetCapture(array, 1, to); 267 } 268 269 270 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 271 Handle<String> subject, 272 int index, 273 int32_t* output, 274 int output_size) { 275 Isolate* isolate = regexp->GetIsolate(); 276 277 DCHECK(0 <= index); 278 DCHECK(index <= subject->length()); 279 280 subject = String::Flatten(subject); 281 DisallowHeapAllocation no_gc; // ensure vectors stay valid 282 283 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 284 int needle_len = needle->length(); 285 DCHECK(needle->IsFlat()); 286 DCHECK_LT(0, needle_len); 287 288 if (index + needle_len > subject->length()) { 289 return RegExpImpl::RE_FAILURE; 290 } 291 292 for (int i = 0; i < output_size; i += 2) { 293 String::FlatContent needle_content = needle->GetFlatContent(); 294 String::FlatContent subject_content = subject->GetFlatContent(); 295 DCHECK(needle_content.IsFlat()); 296 DCHECK(subject_content.IsFlat()); 297 // dispatch on type of strings 298 index = 299 (needle_content.IsOneByte() 300 ? (subject_content.IsOneByte() 301 ? SearchString(isolate, subject_content.ToOneByteVector(), 302 needle_content.ToOneByteVector(), index) 303 : SearchString(isolate, subject_content.ToUC16Vector(), 304 needle_content.ToOneByteVector(), index)) 305 : (subject_content.IsOneByte() 306 ? SearchString(isolate, subject_content.ToOneByteVector(), 307 needle_content.ToUC16Vector(), index) 308 : SearchString(isolate, subject_content.ToUC16Vector(), 309 needle_content.ToUC16Vector(), index))); 310 if (index == -1) { 311 return i / 2; // Return number of matches. 312 } else { 313 output[i] = index; 314 output[i+1] = index + needle_len; 315 index += needle_len; 316 } 317 } 318 return output_size / 2; 319 } 320 321 322 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 323 Handle<String> subject, 324 int index, 325 Handle<JSArray> last_match_info) { 326 Isolate* isolate = re->GetIsolate(); 327 328 static const int kNumRegisters = 2; 329 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 330 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 331 332 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 333 334 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 335 336 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS); 337 SealHandleScope shs(isolate); 338 FixedArray* array = FixedArray::cast(last_match_info->elements()); 339 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); 340 return last_match_info; 341 } 342 343 344 // Irregexp implementation. 345 346 // Ensures that the regexp object contains a compiled version of the 347 // source for either one-byte or two-byte subject strings. 348 // If the compiled version doesn't already exist, it is compiled 349 // from the source pattern. 350 // If compilation fails, an exception is thrown and this function 351 // returns false. 352 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, 353 Handle<String> sample_subject, 354 bool is_one_byte) { 355 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); 356 #ifdef V8_INTERPRETED_REGEXP 357 if (compiled_code->IsByteArray()) return true; 358 #else // V8_INTERPRETED_REGEXP (RegExp native code) 359 if (compiled_code->IsCode()) return true; 360 #endif 361 // We could potentially have marked this as flushable, but have kept 362 // a saved version if we did not flush it yet. 363 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 364 if (saved_code->IsCode()) { 365 // Reinstate the code in the original place. 366 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); 367 DCHECK(compiled_code->IsSmi()); 368 return true; 369 } 370 return CompileIrregexp(re, sample_subject, is_one_byte); 371 } 372 373 374 static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 375 Handle<String> error_message, 376 Isolate* isolate) { 377 Factory* factory = isolate->factory(); 378 Handle<FixedArray> elements = factory->NewFixedArray(2); 379 elements->set(0, re->Pattern()); 380 elements->set(1, *error_message); 381 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 382 Handle<Object> error; 383 MaybeHandle<Object> maybe_error = 384 factory->NewSyntaxError("malformed_regexp", array); 385 if (maybe_error.ToHandle(&error)) isolate->Throw(*error); 386 } 387 388 389 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 390 Handle<String> sample_subject, 391 bool is_one_byte) { 392 // Compile the RegExp. 393 Isolate* isolate = re->GetIsolate(); 394 Zone zone(isolate); 395 PostponeInterruptsScope postpone(isolate); 396 // If we had a compilation error the last time this is saved at the 397 // saved code index. 398 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); 399 // When arriving here entry can only be a smi, either representing an 400 // uncompiled regexp, a previous compilation error, or code that has 401 // been flushed. 402 DCHECK(entry->IsSmi()); 403 int entry_value = Smi::cast(entry)->value(); 404 DCHECK(entry_value == JSRegExp::kUninitializedValue || 405 entry_value == JSRegExp::kCompilationErrorValue || 406 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 407 408 if (entry_value == JSRegExp::kCompilationErrorValue) { 409 // A previous compilation failed and threw an error which we store in 410 // the saved code index (we store the error message, not the actual 411 // error). Recreate the error object and throw it. 412 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 413 DCHECK(error_string->IsString()); 414 Handle<String> error_message(String::cast(error_string)); 415 CreateRegExpErrorObjectAndThrow(re, error_message, isolate); 416 return false; 417 } 418 419 JSRegExp::Flags flags = re->GetFlags(); 420 421 Handle<String> pattern(re->Pattern()); 422 pattern = String::Flatten(pattern); 423 RegExpCompileData compile_data; 424 FlatStringReader reader(isolate, pattern); 425 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), 426 &compile_data, 427 &zone)) { 428 // Throw an exception if we fail to parse the pattern. 429 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 430 USE(ThrowRegExpException(re, 431 pattern, 432 compile_data.error, 433 "malformed_regexp")); 434 return false; 435 } 436 RegExpEngine::CompilationResult result = RegExpEngine::Compile( 437 &compile_data, flags.is_ignore_case(), flags.is_global(), 438 flags.is_multiline(), flags.is_sticky(), pattern, sample_subject, 439 is_one_byte, &zone); 440 if (result.error_message != NULL) { 441 // Unable to compile regexp. 442 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 443 CStrVector(result.error_message)).ToHandleChecked(); 444 CreateRegExpErrorObjectAndThrow(re, error_message, isolate); 445 return false; 446 } 447 448 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 449 data->set(JSRegExp::code_index(is_one_byte), result.code); 450 int register_max = IrregexpMaxRegisterCount(*data); 451 if (result.num_registers > register_max) { 452 SetIrregexpMaxRegisterCount(*data, result.num_registers); 453 } 454 455 return true; 456 } 457 458 459 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 460 return Smi::cast( 461 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 462 } 463 464 465 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 466 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 467 } 468 469 470 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 471 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 472 } 473 474 475 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 476 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 477 } 478 479 480 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { 481 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); 482 } 483 484 485 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { 486 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); 487 } 488 489 490 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 491 Handle<String> pattern, 492 JSRegExp::Flags flags, 493 int capture_count) { 494 // Initialize compiled code entries to null. 495 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 496 JSRegExp::IRREGEXP, 497 pattern, 498 flags, 499 capture_count); 500 } 501 502 503 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 504 Handle<String> subject) { 505 subject = String::Flatten(subject); 506 507 // Check representation of the underlying storage. 508 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 509 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; 510 511 #ifdef V8_INTERPRETED_REGEXP 512 // Byte-code regexp needs space allocated for all its registers. 513 // The result captures are copied to the start of the registers array 514 // if the match succeeds. This way those registers are not clobbered 515 // when we set the last match info from last successful match. 516 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 517 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 518 #else // V8_INTERPRETED_REGEXP 519 // Native regexp only needs room to output captures. Registers are handled 520 // internally. 521 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 522 #endif // V8_INTERPRETED_REGEXP 523 } 524 525 526 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 527 Handle<String> subject, 528 int index, 529 int32_t* output, 530 int output_size) { 531 Isolate* isolate = regexp->GetIsolate(); 532 533 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 534 535 DCHECK(index >= 0); 536 DCHECK(index <= subject->length()); 537 DCHECK(subject->IsFlat()); 538 539 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 540 541 #ifndef V8_INTERPRETED_REGEXP 542 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 543 do { 544 EnsureCompiledIrregexp(regexp, subject, is_one_byte); 545 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); 546 // The stack is used to allocate registers for the compiled regexp code. 547 // This means that in case of failure, the output registers array is left 548 // untouched and contains the capture results from the previous successful 549 // match. We can use that to set the last match info lazily. 550 NativeRegExpMacroAssembler::Result res = 551 NativeRegExpMacroAssembler::Match(code, 552 subject, 553 output, 554 output_size, 555 index, 556 isolate); 557 if (res != NativeRegExpMacroAssembler::RETRY) { 558 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 559 isolate->has_pending_exception()); 560 STATIC_ASSERT( 561 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 562 STATIC_ASSERT( 563 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 564 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 565 == RE_EXCEPTION); 566 return static_cast<IrregexpResult>(res); 567 } 568 // If result is RETRY, the string has changed representation, and we 569 // must restart from scratch. 570 // In this case, it means we must make sure we are prepared to handle 571 // the, potentially, different subject (the string can switch between 572 // being internal and external, and even between being Latin1 and UC16, 573 // but the characters are always the same). 574 IrregexpPrepare(regexp, subject); 575 is_one_byte = subject->IsOneByteRepresentationUnderneath(); 576 } while (true); 577 UNREACHABLE(); 578 return RE_EXCEPTION; 579 #else // V8_INTERPRETED_REGEXP 580 581 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); 582 // We must have done EnsureCompiledIrregexp, so we can get the number of 583 // registers. 584 int number_of_capture_registers = 585 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 586 int32_t* raw_output = &output[number_of_capture_registers]; 587 // We do not touch the actual capture result registers until we know there 588 // has been a match so that we can use those capture results to set the 589 // last match info. 590 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 591 raw_output[i] = -1; 592 } 593 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), 594 isolate); 595 596 IrregexpResult result = IrregexpInterpreter::Match(isolate, 597 byte_codes, 598 subject, 599 raw_output, 600 index); 601 if (result == RE_SUCCESS) { 602 // Copy capture results to the start of the registers array. 603 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 604 } 605 if (result == RE_EXCEPTION) { 606 DCHECK(!isolate->has_pending_exception()); 607 isolate->StackOverflow(); 608 } 609 return result; 610 #endif // V8_INTERPRETED_REGEXP 611 } 612 613 614 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, 615 Handle<String> subject, 616 int previous_index, 617 Handle<JSArray> last_match_info) { 618 Isolate* isolate = regexp->GetIsolate(); 619 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 620 621 // Prepare space for the return values. 622 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 623 if (FLAG_trace_regexp_bytecodes) { 624 String* pattern = regexp->Pattern(); 625 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 626 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 627 } 628 #endif 629 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 630 if (required_registers < 0) { 631 // Compiling failed with an exception. 632 DCHECK(isolate->has_pending_exception()); 633 return MaybeHandle<Object>(); 634 } 635 636 int32_t* output_registers = NULL; 637 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 638 output_registers = NewArray<int32_t>(required_registers); 639 } 640 SmartArrayPointer<int32_t> auto_release(output_registers); 641 if (output_registers == NULL) { 642 output_registers = isolate->jsregexp_static_offsets_vector(); 643 } 644 645 int res = RegExpImpl::IrregexpExecRaw( 646 regexp, subject, previous_index, output_registers, required_registers); 647 if (res == RE_SUCCESS) { 648 int capture_count = 649 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 650 return SetLastMatchInfo( 651 last_match_info, subject, capture_count, output_registers); 652 } 653 if (res == RE_EXCEPTION) { 654 DCHECK(isolate->has_pending_exception()); 655 return MaybeHandle<Object>(); 656 } 657 DCHECK(res == RE_FAILURE); 658 return isolate->factory()->null_value(); 659 } 660 661 662 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, 663 Handle<String> subject, 664 int capture_count, 665 int32_t* match) { 666 DCHECK(last_match_info->HasFastObjectElements()); 667 int capture_register_count = (capture_count + 1) * 2; 668 JSArray::EnsureSize(last_match_info, 669 capture_register_count + kLastMatchOverhead); 670 DisallowHeapAllocation no_allocation; 671 FixedArray* array = FixedArray::cast(last_match_info->elements()); 672 if (match != NULL) { 673 for (int i = 0; i < capture_register_count; i += 2) { 674 SetCapture(array, i, match[i]); 675 SetCapture(array, i + 1, match[i + 1]); 676 } 677 } 678 SetLastCaptureCount(array, capture_register_count); 679 SetLastSubject(array, *subject); 680 SetLastInput(array, *subject); 681 return last_match_info; 682 } 683 684 685 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 686 Handle<String> subject, 687 bool is_global, 688 Isolate* isolate) 689 : register_array_(NULL), 690 register_array_size_(0), 691 regexp_(regexp), 692 subject_(subject) { 693 #ifdef V8_INTERPRETED_REGEXP 694 bool interpreted = true; 695 #else 696 bool interpreted = false; 697 #endif // V8_INTERPRETED_REGEXP 698 699 if (regexp_->TypeTag() == JSRegExp::ATOM) { 700 static const int kAtomRegistersPerMatch = 2; 701 registers_per_match_ = kAtomRegistersPerMatch; 702 // There is no distinction between interpreted and native for atom regexps. 703 interpreted = false; 704 } else { 705 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 706 if (registers_per_match_ < 0) { 707 num_matches_ = -1; // Signal exception. 708 return; 709 } 710 } 711 712 if (is_global && !interpreted) { 713 register_array_size_ = 714 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 715 max_matches_ = register_array_size_ / registers_per_match_; 716 } else { 717 // Global loop in interpreted regexp is not implemented. We choose 718 // the size of the offsets vector so that it can only store one match. 719 register_array_size_ = registers_per_match_; 720 max_matches_ = 1; 721 } 722 723 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 724 register_array_ = NewArray<int32_t>(register_array_size_); 725 } else { 726 register_array_ = isolate->jsregexp_static_offsets_vector(); 727 } 728 729 // Set state so that fetching the results the first time triggers a call 730 // to the compiled regexp. 731 current_match_index_ = max_matches_ - 1; 732 num_matches_ = max_matches_; 733 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture. 734 DCHECK_GE(register_array_size_, registers_per_match_); 735 int32_t* last_match = 736 ®ister_array_[current_match_index_ * registers_per_match_]; 737 last_match[0] = -1; 738 last_match[1] = 0; 739 } 740 741 742 // ------------------------------------------------------------------- 743 // Implementation of the Irregexp regular expression engine. 744 // 745 // The Irregexp regular expression engine is intended to be a complete 746 // implementation of ECMAScript regular expressions. It generates either 747 // bytecodes or native code. 748 749 // The Irregexp regexp engine is structured in three steps. 750 // 1) The parser generates an abstract syntax tree. See ast.cc. 751 // 2) From the AST a node network is created. The nodes are all 752 // subclasses of RegExpNode. The nodes represent states when 753 // executing a regular expression. Several optimizations are 754 // performed on the node network. 755 // 3) From the nodes we generate either byte codes or native code 756 // that can actually execute the regular expression (perform 757 // the search). The code generation step is described in more 758 // detail below. 759 760 // Code generation. 761 // 762 // The nodes are divided into four main categories. 763 // * Choice nodes 764 // These represent places where the regular expression can 765 // match in more than one way. For example on entry to an 766 // alternation (foo|bar) or a repetition (*, +, ? or {}). 767 // * Action nodes 768 // These represent places where some action should be 769 // performed. Examples include recording the current position 770 // in the input string to a register (in order to implement 771 // captures) or other actions on register for example in order 772 // to implement the counters needed for {} repetitions. 773 // * Matching nodes 774 // These attempt to match some element part of the input string. 775 // Examples of elements include character classes, plain strings 776 // or back references. 777 // * End nodes 778 // These are used to implement the actions required on finding 779 // a successful match or failing to find a match. 780 // 781 // The code generated (whether as byte codes or native code) maintains 782 // some state as it runs. This consists of the following elements: 783 // 784 // * The capture registers. Used for string captures. 785 // * Other registers. Used for counters etc. 786 // * The current position. 787 // * The stack of backtracking information. Used when a matching node 788 // fails to find a match and needs to try an alternative. 789 // 790 // Conceptual regular expression execution model: 791 // 792 // There is a simple conceptual model of regular expression execution 793 // which will be presented first. The actual code generated is a more 794 // efficient simulation of the simple conceptual model: 795 // 796 // * Choice nodes are implemented as follows: 797 // For each choice except the last { 798 // push current position 799 // push backtrack code location 800 // <generate code to test for choice> 801 // backtrack code location: 802 // pop current position 803 // } 804 // <generate code to test for last choice> 805 // 806 // * Actions nodes are generated as follows 807 // <push affected registers on backtrack stack> 808 // <generate code to perform action> 809 // push backtrack code location 810 // <generate code to test for following nodes> 811 // backtrack code location: 812 // <pop affected registers to restore their state> 813 // <pop backtrack location from stack and go to it> 814 // 815 // * Matching nodes are generated as follows: 816 // if input string matches at current position 817 // update current position 818 // <generate code to test for following nodes> 819 // else 820 // <pop backtrack location from stack and go to it> 821 // 822 // Thus it can be seen that the current position is saved and restored 823 // by the choice nodes, whereas the registers are saved and restored by 824 // by the action nodes that manipulate them. 825 // 826 // The other interesting aspect of this model is that nodes are generated 827 // at the point where they are needed by a recursive call to Emit(). If 828 // the node has already been code generated then the Emit() call will 829 // generate a jump to the previously generated code instead. In order to 830 // limit recursion it is possible for the Emit() function to put the node 831 // on a work list for later generation and instead generate a jump. The 832 // destination of the jump is resolved later when the code is generated. 833 // 834 // Actual regular expression code generation. 835 // 836 // Code generation is actually more complicated than the above. In order 837 // to improve the efficiency of the generated code some optimizations are 838 // performed 839 // 840 // * Choice nodes have 1-character lookahead. 841 // A choice node looks at the following character and eliminates some of 842 // the choices immediately based on that character. This is not yet 843 // implemented. 844 // * Simple greedy loops store reduced backtracking information. 845 // A quantifier like /.*foo/m will greedily match the whole input. It will 846 // then need to backtrack to a point where it can match "foo". The naive 847 // implementation of this would push each character position onto the 848 // backtracking stack, then pop them off one by one. This would use space 849 // proportional to the length of the input string. However since the "." 850 // can only match in one way and always has a constant length (in this case 851 // of 1) it suffices to store the current position on the top of the stack 852 // once. Matching now becomes merely incrementing the current position and 853 // backtracking becomes decrementing the current position and checking the 854 // result against the stored current position. This is faster and saves 855 // space. 856 // * The current state is virtualized. 857 // This is used to defer expensive operations until it is clear that they 858 // are needed and to generate code for a node more than once, allowing 859 // specialized an efficient versions of the code to be created. This is 860 // explained in the section below. 861 // 862 // Execution state virtualization. 863 // 864 // Instead of emitting code, nodes that manipulate the state can record their 865 // manipulation in an object called the Trace. The Trace object can record a 866 // current position offset, an optional backtrack code location on the top of 867 // the virtualized backtrack stack and some register changes. When a node is 868 // to be emitted it can flush the Trace or update it. Flushing the Trace 869 // will emit code to bring the actual state into line with the virtual state. 870 // Avoiding flushing the state can postpone some work (e.g. updates of capture 871 // registers). Postponing work can save time when executing the regular 872 // expression since it may be found that the work never has to be done as a 873 // failure to match can occur. In addition it is much faster to jump to a 874 // known backtrack code location than it is to pop an unknown backtrack 875 // location from the stack and jump there. 876 // 877 // The virtual state found in the Trace affects code generation. For example 878 // the virtual state contains the difference between the actual current 879 // position and the virtual current position, and matching code needs to use 880 // this offset to attempt a match in the correct location of the input 881 // string. Therefore code generated for a non-trivial trace is specialized 882 // to that trace. The code generator therefore has the ability to generate 883 // code for each node several times. In order to limit the size of the 884 // generated code there is an arbitrary limit on how many specialized sets of 885 // code may be generated for a given node. If the limit is reached, the 886 // trace is flushed and a generic version of the code for a node is emitted. 887 // This is subsequently used for that node. The code emitted for non-generic 888 // trace is not recorded in the node and so it cannot currently be reused in 889 // the event that code generation is requested for an identical trace. 890 891 892 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 893 UNREACHABLE(); 894 } 895 896 897 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 898 text->AddElement(TextElement::Atom(this), zone); 899 } 900 901 902 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 903 text->AddElement(TextElement::CharClass(this), zone); 904 } 905 906 907 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 908 for (int i = 0; i < elements()->length(); i++) 909 text->AddElement(elements()->at(i), zone); 910 } 911 912 913 TextElement TextElement::Atom(RegExpAtom* atom) { 914 return TextElement(ATOM, atom); 915 } 916 917 918 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 919 return TextElement(CHAR_CLASS, char_class); 920 } 921 922 923 int TextElement::length() const { 924 switch (text_type()) { 925 case ATOM: 926 return atom()->length(); 927 928 case CHAR_CLASS: 929 return 1; 930 } 931 UNREACHABLE(); 932 return 0; 933 } 934 935 936 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 937 if (table_ == NULL) { 938 table_ = new(zone()) DispatchTable(zone()); 939 DispatchTableConstructor cons(table_, ignore_case, zone()); 940 cons.BuildTable(this); 941 } 942 return table_; 943 } 944 945 946 class FrequencyCollator { 947 public: 948 FrequencyCollator() : total_samples_(0) { 949 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 950 frequencies_[i] = CharacterFrequency(i); 951 } 952 } 953 954 void CountCharacter(int character) { 955 int index = (character & RegExpMacroAssembler::kTableMask); 956 frequencies_[index].Increment(); 957 total_samples_++; 958 } 959 960 // Does not measure in percent, but rather per-128 (the table size from the 961 // regexp macro assembler). 962 int Frequency(int in_character) { 963 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 964 if (total_samples_ < 1) return 1; // Division by zero. 965 int freq_in_per128 = 966 (frequencies_[in_character].counter() * 128) / total_samples_; 967 return freq_in_per128; 968 } 969 970 private: 971 class CharacterFrequency { 972 public: 973 CharacterFrequency() : counter_(0), character_(-1) { } 974 explicit CharacterFrequency(int character) 975 : counter_(0), character_(character) { } 976 977 void Increment() { counter_++; } 978 int counter() { return counter_; } 979 int character() { return character_; } 980 981 private: 982 int counter_; 983 int character_; 984 }; 985 986 987 private: 988 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 989 int total_samples_; 990 }; 991 992 993 class RegExpCompiler { 994 public: 995 RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte, 996 Zone* zone); 997 998 int AllocateRegister() { 999 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 1000 reg_exp_too_big_ = true; 1001 return next_register_; 1002 } 1003 return next_register_++; 1004 } 1005 1006 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 1007 RegExpNode* start, 1008 int capture_count, 1009 Handle<String> pattern); 1010 1011 inline void AddWork(RegExpNode* node) { work_list_->Add(node); } 1012 1013 static const int kImplementationOffset = 0; 1014 static const int kNumberOfRegistersOffset = 0; 1015 static const int kCodeOffset = 1; 1016 1017 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1018 EndNode* accept() { return accept_; } 1019 1020 static const int kMaxRecursion = 100; 1021 inline int recursion_depth() { return recursion_depth_; } 1022 inline void IncrementRecursionDepth() { recursion_depth_++; } 1023 inline void DecrementRecursionDepth() { recursion_depth_--; } 1024 1025 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1026 1027 inline bool ignore_case() { return ignore_case_; } 1028 inline bool one_byte() { return one_byte_; } 1029 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1030 1031 int current_expansion_factor() { return current_expansion_factor_; } 1032 void set_current_expansion_factor(int value) { 1033 current_expansion_factor_ = value; 1034 } 1035 1036 Zone* zone() const { return zone_; } 1037 1038 static const int kNoRegister = -1; 1039 1040 private: 1041 EndNode* accept_; 1042 int next_register_; 1043 List<RegExpNode*>* work_list_; 1044 int recursion_depth_; 1045 RegExpMacroAssembler* macro_assembler_; 1046 bool ignore_case_; 1047 bool one_byte_; 1048 bool reg_exp_too_big_; 1049 int current_expansion_factor_; 1050 FrequencyCollator frequency_collator_; 1051 Zone* zone_; 1052 }; 1053 1054 1055 class RecursionCheck { 1056 public: 1057 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1058 compiler->IncrementRecursionDepth(); 1059 } 1060 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1061 private: 1062 RegExpCompiler* compiler_; 1063 }; 1064 1065 1066 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1067 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1068 } 1069 1070 1071 // Attempts to compile the regexp using an Irregexp code generator. Returns 1072 // a fixed array or a null handle depending on whether it succeeded. 1073 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, 1074 bool one_byte, Zone* zone) 1075 : next_register_(2 * (capture_count + 1)), 1076 work_list_(NULL), 1077 recursion_depth_(0), 1078 ignore_case_(ignore_case), 1079 one_byte_(one_byte), 1080 reg_exp_too_big_(false), 1081 current_expansion_factor_(1), 1082 frequency_collator_(), 1083 zone_(zone) { 1084 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1085 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1086 } 1087 1088 1089 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1090 RegExpMacroAssembler* macro_assembler, 1091 RegExpNode* start, 1092 int capture_count, 1093 Handle<String> pattern) { 1094 Heap* heap = pattern->GetHeap(); 1095 1096 bool use_slow_safe_regexp_compiler = false; 1097 if (heap->total_regexp_code_generated() > 1098 RegExpImpl::kRegWxpCompiledLimit && 1099 heap->isolate()->memory_allocator()->SizeExecutable() > 1100 RegExpImpl::kRegExpExecutableMemoryLimit) { 1101 use_slow_safe_regexp_compiler = true; 1102 } 1103 1104 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler); 1105 1106 #ifdef DEBUG 1107 if (FLAG_trace_regexp_assembler) 1108 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler); 1109 else 1110 #endif 1111 macro_assembler_ = macro_assembler; 1112 1113 List <RegExpNode*> work_list(0); 1114 work_list_ = &work_list; 1115 Label fail; 1116 macro_assembler_->PushBacktrack(&fail); 1117 Trace new_trace; 1118 start->Emit(this, &new_trace); 1119 macro_assembler_->Bind(&fail); 1120 macro_assembler_->Fail(); 1121 while (!work_list.is_empty()) { 1122 work_list.RemoveLast()->Emit(this, &new_trace); 1123 } 1124 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate()); 1125 1126 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1127 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1128 work_list_ = NULL; 1129 #ifdef DEBUG 1130 if (FLAG_print_code) { 1131 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer()); 1132 OFStream os(trace_scope.file()); 1133 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os); 1134 } 1135 if (FLAG_trace_regexp_assembler) { 1136 delete macro_assembler_; 1137 } 1138 #endif 1139 return RegExpEngine::CompilationResult(*code, next_register_); 1140 } 1141 1142 1143 bool Trace::DeferredAction::Mentions(int that) { 1144 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1145 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1146 return range.Contains(that); 1147 } else { 1148 return reg() == that; 1149 } 1150 } 1151 1152 1153 bool Trace::mentions_reg(int reg) { 1154 for (DeferredAction* action = actions_; 1155 action != NULL; 1156 action = action->next()) { 1157 if (action->Mentions(reg)) 1158 return true; 1159 } 1160 return false; 1161 } 1162 1163 1164 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1165 DCHECK_EQ(0, *cp_offset); 1166 for (DeferredAction* action = actions_; 1167 action != NULL; 1168 action = action->next()) { 1169 if (action->Mentions(reg)) { 1170 if (action->action_type() == ActionNode::STORE_POSITION) { 1171 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1172 return true; 1173 } else { 1174 return false; 1175 } 1176 } 1177 } 1178 return false; 1179 } 1180 1181 1182 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1183 Zone* zone) { 1184 int max_register = RegExpCompiler::kNoRegister; 1185 for (DeferredAction* action = actions_; 1186 action != NULL; 1187 action = action->next()) { 1188 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1189 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1190 for (int i = range.from(); i <= range.to(); i++) 1191 affected_registers->Set(i, zone); 1192 if (range.to() > max_register) max_register = range.to(); 1193 } else { 1194 affected_registers->Set(action->reg(), zone); 1195 if (action->reg() > max_register) max_register = action->reg(); 1196 } 1197 } 1198 return max_register; 1199 } 1200 1201 1202 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1203 int max_register, 1204 const OutSet& registers_to_pop, 1205 const OutSet& registers_to_clear) { 1206 for (int reg = max_register; reg >= 0; reg--) { 1207 if (registers_to_pop.Get(reg)) { 1208 assembler->PopRegister(reg); 1209 } else if (registers_to_clear.Get(reg)) { 1210 int clear_to = reg; 1211 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1212 reg--; 1213 } 1214 assembler->ClearRegisters(reg, clear_to); 1215 } 1216 } 1217 } 1218 1219 1220 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1221 int max_register, 1222 const OutSet& affected_registers, 1223 OutSet* registers_to_pop, 1224 OutSet* registers_to_clear, 1225 Zone* zone) { 1226 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1227 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1228 1229 // Count pushes performed to force a stack limit check occasionally. 1230 int pushes = 0; 1231 1232 for (int reg = 0; reg <= max_register; reg++) { 1233 if (!affected_registers.Get(reg)) { 1234 continue; 1235 } 1236 1237 // The chronologically first deferred action in the trace 1238 // is used to infer the action needed to restore a register 1239 // to its previous state (or not, if it's safe to ignore it). 1240 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1241 DeferredActionUndoType undo_action = IGNORE; 1242 1243 int value = 0; 1244 bool absolute = false; 1245 bool clear = false; 1246 int store_position = -1; 1247 // This is a little tricky because we are scanning the actions in reverse 1248 // historical order (newest first). 1249 for (DeferredAction* action = actions_; 1250 action != NULL; 1251 action = action->next()) { 1252 if (action->Mentions(reg)) { 1253 switch (action->action_type()) { 1254 case ActionNode::SET_REGISTER: { 1255 Trace::DeferredSetRegister* psr = 1256 static_cast<Trace::DeferredSetRegister*>(action); 1257 if (!absolute) { 1258 value += psr->value(); 1259 absolute = true; 1260 } 1261 // SET_REGISTER is currently only used for newly introduced loop 1262 // counters. They can have a significant previous value if they 1263 // occour in a loop. TODO(lrn): Propagate this information, so 1264 // we can set undo_action to IGNORE if we know there is no value to 1265 // restore. 1266 undo_action = RESTORE; 1267 DCHECK_EQ(store_position, -1); 1268 DCHECK(!clear); 1269 break; 1270 } 1271 case ActionNode::INCREMENT_REGISTER: 1272 if (!absolute) { 1273 value++; 1274 } 1275 DCHECK_EQ(store_position, -1); 1276 DCHECK(!clear); 1277 undo_action = RESTORE; 1278 break; 1279 case ActionNode::STORE_POSITION: { 1280 Trace::DeferredCapture* pc = 1281 static_cast<Trace::DeferredCapture*>(action); 1282 if (!clear && store_position == -1) { 1283 store_position = pc->cp_offset(); 1284 } 1285 1286 // For captures we know that stores and clears alternate. 1287 // Other register, are never cleared, and if the occur 1288 // inside a loop, they might be assigned more than once. 1289 if (reg <= 1) { 1290 // Registers zero and one, aka "capture zero", is 1291 // always set correctly if we succeed. There is no 1292 // need to undo a setting on backtrack, because we 1293 // will set it again or fail. 1294 undo_action = IGNORE; 1295 } else { 1296 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1297 } 1298 DCHECK(!absolute); 1299 DCHECK_EQ(value, 0); 1300 break; 1301 } 1302 case ActionNode::CLEAR_CAPTURES: { 1303 // Since we're scanning in reverse order, if we've already 1304 // set the position we have to ignore historically earlier 1305 // clearing operations. 1306 if (store_position == -1) { 1307 clear = true; 1308 } 1309 undo_action = RESTORE; 1310 DCHECK(!absolute); 1311 DCHECK_EQ(value, 0); 1312 break; 1313 } 1314 default: 1315 UNREACHABLE(); 1316 break; 1317 } 1318 } 1319 } 1320 // Prepare for the undo-action (e.g., push if it's going to be popped). 1321 if (undo_action == RESTORE) { 1322 pushes++; 1323 RegExpMacroAssembler::StackCheckFlag stack_check = 1324 RegExpMacroAssembler::kNoStackLimitCheck; 1325 if (pushes == push_limit) { 1326 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1327 pushes = 0; 1328 } 1329 1330 assembler->PushRegister(reg, stack_check); 1331 registers_to_pop->Set(reg, zone); 1332 } else if (undo_action == CLEAR) { 1333 registers_to_clear->Set(reg, zone); 1334 } 1335 // Perform the chronologically last action (or accumulated increment) 1336 // for the register. 1337 if (store_position != -1) { 1338 assembler->WriteCurrentPositionToRegister(reg, store_position); 1339 } else if (clear) { 1340 assembler->ClearRegisters(reg, reg); 1341 } else if (absolute) { 1342 assembler->SetRegister(reg, value); 1343 } else if (value != 0) { 1344 assembler->AdvanceRegister(reg, value); 1345 } 1346 } 1347 } 1348 1349 1350 // This is called as we come into a loop choice node and some other tricky 1351 // nodes. It normalizes the state of the code generator to ensure we can 1352 // generate generic code. 1353 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1354 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1355 1356 DCHECK(!is_trivial()); 1357 1358 if (actions_ == NULL && backtrack() == NULL) { 1359 // Here we just have some deferred cp advances to fix and we are back to 1360 // a normal situation. We may also have to forget some information gained 1361 // through a quick check that was already performed. 1362 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1363 // Create a new trivial state and generate the node with that. 1364 Trace new_state; 1365 successor->Emit(compiler, &new_state); 1366 return; 1367 } 1368 1369 // Generate deferred actions here along with code to undo them again. 1370 OutSet affected_registers; 1371 1372 if (backtrack() != NULL) { 1373 // Here we have a concrete backtrack location. These are set up by choice 1374 // nodes and so they indicate that we have a deferred save of the current 1375 // position which we may need to emit here. 1376 assembler->PushCurrentPosition(); 1377 } 1378 1379 int max_register = FindAffectedRegisters(&affected_registers, 1380 compiler->zone()); 1381 OutSet registers_to_pop; 1382 OutSet registers_to_clear; 1383 PerformDeferredActions(assembler, 1384 max_register, 1385 affected_registers, 1386 ®isters_to_pop, 1387 ®isters_to_clear, 1388 compiler->zone()); 1389 if (cp_offset_ != 0) { 1390 assembler->AdvanceCurrentPosition(cp_offset_); 1391 } 1392 1393 // Create a new trivial state and generate the node with that. 1394 Label undo; 1395 assembler->PushBacktrack(&undo); 1396 Trace new_state; 1397 successor->Emit(compiler, &new_state); 1398 1399 // On backtrack we need to restore state. 1400 assembler->Bind(&undo); 1401 RestoreAffectedRegisters(assembler, 1402 max_register, 1403 registers_to_pop, 1404 registers_to_clear); 1405 if (backtrack() == NULL) { 1406 assembler->Backtrack(); 1407 } else { 1408 assembler->PopCurrentPosition(); 1409 assembler->GoTo(backtrack()); 1410 } 1411 } 1412 1413 1414 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1415 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1416 1417 // Omit flushing the trace. We discard the entire stack frame anyway. 1418 1419 if (!label()->is_bound()) { 1420 // We are completely independent of the trace, since we ignore it, 1421 // so this code can be used as the generic version. 1422 assembler->Bind(label()); 1423 } 1424 1425 // Throw away everything on the backtrack stack since the start 1426 // of the negative submatch and restore the character position. 1427 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1428 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1429 if (clear_capture_count_ > 0) { 1430 // Clear any captures that might have been performed during the success 1431 // of the body of the negative look-ahead. 1432 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1433 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1434 } 1435 // Now that we have unwound the stack we find at the top of the stack the 1436 // backtrack that the BeginSubmatch node got. 1437 assembler->Backtrack(); 1438 } 1439 1440 1441 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1442 if (!trace->is_trivial()) { 1443 trace->Flush(compiler, this); 1444 return; 1445 } 1446 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1447 if (!label()->is_bound()) { 1448 assembler->Bind(label()); 1449 } 1450 switch (action_) { 1451 case ACCEPT: 1452 assembler->Succeed(); 1453 return; 1454 case BACKTRACK: 1455 assembler->GoTo(trace->backtrack()); 1456 return; 1457 case NEGATIVE_SUBMATCH_SUCCESS: 1458 // This case is handled in a different virtual method. 1459 UNREACHABLE(); 1460 } 1461 UNIMPLEMENTED(); 1462 } 1463 1464 1465 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1466 if (guards_ == NULL) 1467 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1468 guards_->Add(guard, zone); 1469 } 1470 1471 1472 ActionNode* ActionNode::SetRegister(int reg, 1473 int val, 1474 RegExpNode* on_success) { 1475 ActionNode* result = 1476 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1477 result->data_.u_store_register.reg = reg; 1478 result->data_.u_store_register.value = val; 1479 return result; 1480 } 1481 1482 1483 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1484 ActionNode* result = 1485 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1486 result->data_.u_increment_register.reg = reg; 1487 return result; 1488 } 1489 1490 1491 ActionNode* ActionNode::StorePosition(int reg, 1492 bool is_capture, 1493 RegExpNode* on_success) { 1494 ActionNode* result = 1495 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1496 result->data_.u_position_register.reg = reg; 1497 result->data_.u_position_register.is_capture = is_capture; 1498 return result; 1499 } 1500 1501 1502 ActionNode* ActionNode::ClearCaptures(Interval range, 1503 RegExpNode* on_success) { 1504 ActionNode* result = 1505 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1506 result->data_.u_clear_captures.range_from = range.from(); 1507 result->data_.u_clear_captures.range_to = range.to(); 1508 return result; 1509 } 1510 1511 1512 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1513 int position_reg, 1514 RegExpNode* on_success) { 1515 ActionNode* result = 1516 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1517 result->data_.u_submatch.stack_pointer_register = stack_reg; 1518 result->data_.u_submatch.current_position_register = position_reg; 1519 return result; 1520 } 1521 1522 1523 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1524 int position_reg, 1525 int clear_register_count, 1526 int clear_register_from, 1527 RegExpNode* on_success) { 1528 ActionNode* result = 1529 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1530 result->data_.u_submatch.stack_pointer_register = stack_reg; 1531 result->data_.u_submatch.current_position_register = position_reg; 1532 result->data_.u_submatch.clear_register_count = clear_register_count; 1533 result->data_.u_submatch.clear_register_from = clear_register_from; 1534 return result; 1535 } 1536 1537 1538 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1539 int repetition_register, 1540 int repetition_limit, 1541 RegExpNode* on_success) { 1542 ActionNode* result = 1543 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1544 result->data_.u_empty_match_check.start_register = start_register; 1545 result->data_.u_empty_match_check.repetition_register = repetition_register; 1546 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1547 return result; 1548 } 1549 1550 1551 #define DEFINE_ACCEPT(Type) \ 1552 void Type##Node::Accept(NodeVisitor* visitor) { \ 1553 visitor->Visit##Type(this); \ 1554 } 1555 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1556 #undef DEFINE_ACCEPT 1557 1558 1559 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1560 visitor->VisitLoopChoice(this); 1561 } 1562 1563 1564 // ------------------------------------------------------------------- 1565 // Emit code. 1566 1567 1568 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1569 Guard* guard, 1570 Trace* trace) { 1571 switch (guard->op()) { 1572 case Guard::LT: 1573 DCHECK(!trace->mentions_reg(guard->reg())); 1574 macro_assembler->IfRegisterGE(guard->reg(), 1575 guard->value(), 1576 trace->backtrack()); 1577 break; 1578 case Guard::GEQ: 1579 DCHECK(!trace->mentions_reg(guard->reg())); 1580 macro_assembler->IfRegisterLT(guard->reg(), 1581 guard->value(), 1582 trace->backtrack()); 1583 break; 1584 } 1585 } 1586 1587 1588 // Returns the number of characters in the equivalence class, omitting those 1589 // that cannot occur in the source string because it is ASCII. 1590 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1591 bool one_byte_subject, 1592 unibrow::uchar* letters) { 1593 int length = 1594 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1595 // Unibrow returns 0 or 1 for characters where case independence is 1596 // trivial. 1597 if (length == 0) { 1598 letters[0] = character; 1599 length = 1; 1600 } 1601 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) { 1602 return length; 1603 } 1604 1605 // The standard requires that non-ASCII characters cannot have ASCII 1606 // character codes in their equivalence class. 1607 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore, 1608 // is it? For example, \u00C5 is equivalent to \u212B. 1609 return 0; 1610 } 1611 1612 1613 static inline bool EmitSimpleCharacter(Isolate* isolate, 1614 RegExpCompiler* compiler, 1615 uc16 c, 1616 Label* on_failure, 1617 int cp_offset, 1618 bool check, 1619 bool preloaded) { 1620 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1621 bool bound_checked = false; 1622 if (!preloaded) { 1623 assembler->LoadCurrentCharacter( 1624 cp_offset, 1625 on_failure, 1626 check); 1627 bound_checked = true; 1628 } 1629 assembler->CheckNotCharacter(c, on_failure); 1630 return bound_checked; 1631 } 1632 1633 1634 // Only emits non-letters (things that don't have case). Only used for case 1635 // independent matches. 1636 static inline bool EmitAtomNonLetter(Isolate* isolate, 1637 RegExpCompiler* compiler, 1638 uc16 c, 1639 Label* on_failure, 1640 int cp_offset, 1641 bool check, 1642 bool preloaded) { 1643 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1644 bool one_byte = compiler->one_byte(); 1645 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1646 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1647 if (length < 1) { 1648 // This can't match. Must be an one-byte subject and a non-one-byte 1649 // character. We do not need to do anything since the one-byte pass 1650 // already handled this. 1651 return false; // Bounds not checked. 1652 } 1653 bool checked = false; 1654 // We handle the length > 1 case in a later pass. 1655 if (length == 1) { 1656 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1657 // Can't match - see above. 1658 return false; // Bounds not checked. 1659 } 1660 if (!preloaded) { 1661 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1662 checked = check; 1663 } 1664 macro_assembler->CheckNotCharacter(c, on_failure); 1665 } 1666 return checked; 1667 } 1668 1669 1670 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1671 bool one_byte, uc16 c1, uc16 c2, 1672 Label* on_failure) { 1673 uc16 char_mask; 1674 if (one_byte) { 1675 char_mask = String::kMaxOneByteCharCode; 1676 } else { 1677 char_mask = String::kMaxUtf16CodeUnit; 1678 } 1679 uc16 exor = c1 ^ c2; 1680 // Check whether exor has only one bit set. 1681 if (((exor - 1) & exor) == 0) { 1682 // If c1 and c2 differ only by one bit. 1683 // Ecma262UnCanonicalize always gives the highest number last. 1684 DCHECK(c2 > c1); 1685 uc16 mask = char_mask ^ exor; 1686 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1687 return true; 1688 } 1689 DCHECK(c2 > c1); 1690 uc16 diff = c2 - c1; 1691 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1692 // If the characters differ by 2^n but don't differ by one bit then 1693 // subtract the difference from the found character, then do the or 1694 // trick. We avoid the theoretical case where negative numbers are 1695 // involved in order to simplify code generation. 1696 uc16 mask = char_mask ^ diff; 1697 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1698 diff, 1699 mask, 1700 on_failure); 1701 return true; 1702 } 1703 return false; 1704 } 1705 1706 1707 typedef bool EmitCharacterFunction(Isolate* isolate, 1708 RegExpCompiler* compiler, 1709 uc16 c, 1710 Label* on_failure, 1711 int cp_offset, 1712 bool check, 1713 bool preloaded); 1714 1715 // Only emits letters (things that have case). Only used for case independent 1716 // matches. 1717 static inline bool EmitAtomLetter(Isolate* isolate, 1718 RegExpCompiler* compiler, 1719 uc16 c, 1720 Label* on_failure, 1721 int cp_offset, 1722 bool check, 1723 bool preloaded) { 1724 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1725 bool one_byte = compiler->one_byte(); 1726 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1727 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1728 if (length <= 1) return false; 1729 // We may not need to check against the end of the input string 1730 // if this character lies before a character that matched. 1731 if (!preloaded) { 1732 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1733 } 1734 Label ok; 1735 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1736 switch (length) { 1737 case 2: { 1738 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1739 chars[1], on_failure)) { 1740 } else { 1741 macro_assembler->CheckCharacter(chars[0], &ok); 1742 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1743 macro_assembler->Bind(&ok); 1744 } 1745 break; 1746 } 1747 case 4: 1748 macro_assembler->CheckCharacter(chars[3], &ok); 1749 // Fall through! 1750 case 3: 1751 macro_assembler->CheckCharacter(chars[0], &ok); 1752 macro_assembler->CheckCharacter(chars[1], &ok); 1753 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1754 macro_assembler->Bind(&ok); 1755 break; 1756 default: 1757 UNREACHABLE(); 1758 break; 1759 } 1760 return true; 1761 } 1762 1763 1764 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1765 int border, 1766 Label* fall_through, 1767 Label* above_or_equal, 1768 Label* below) { 1769 if (below != fall_through) { 1770 masm->CheckCharacterLT(border, below); 1771 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1772 } else { 1773 masm->CheckCharacterGT(border - 1, above_or_equal); 1774 } 1775 } 1776 1777 1778 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1779 int first, 1780 int last, 1781 Label* fall_through, 1782 Label* in_range, 1783 Label* out_of_range) { 1784 if (in_range == fall_through) { 1785 if (first == last) { 1786 masm->CheckNotCharacter(first, out_of_range); 1787 } else { 1788 masm->CheckCharacterNotInRange(first, last, out_of_range); 1789 } 1790 } else { 1791 if (first == last) { 1792 masm->CheckCharacter(first, in_range); 1793 } else { 1794 masm->CheckCharacterInRange(first, last, in_range); 1795 } 1796 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1797 } 1798 } 1799 1800 1801 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1802 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1803 static void EmitUseLookupTable( 1804 RegExpMacroAssembler* masm, 1805 ZoneList<int>* ranges, 1806 int start_index, 1807 int end_index, 1808 int min_char, 1809 Label* fall_through, 1810 Label* even_label, 1811 Label* odd_label) { 1812 static const int kSize = RegExpMacroAssembler::kTableSize; 1813 static const int kMask = RegExpMacroAssembler::kTableMask; 1814 1815 int base = (min_char & ~kMask); 1816 USE(base); 1817 1818 // Assert that everything is on one kTableSize page. 1819 for (int i = start_index; i <= end_index; i++) { 1820 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1821 } 1822 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1823 1824 char templ[kSize]; 1825 Label* on_bit_set; 1826 Label* on_bit_clear; 1827 int bit; 1828 if (even_label == fall_through) { 1829 on_bit_set = odd_label; 1830 on_bit_clear = even_label; 1831 bit = 1; 1832 } else { 1833 on_bit_set = even_label; 1834 on_bit_clear = odd_label; 1835 bit = 0; 1836 } 1837 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1838 templ[i] = bit; 1839 } 1840 int j = 0; 1841 bit ^= 1; 1842 for (int i = start_index; i < end_index; i++) { 1843 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1844 templ[j] = bit; 1845 } 1846 bit ^= 1; 1847 } 1848 for (int i = j; i < kSize; i++) { 1849 templ[i] = bit; 1850 } 1851 Factory* factory = masm->zone()->isolate()->factory(); 1852 // TODO(erikcorry): Cache these. 1853 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1854 for (int i = 0; i < kSize; i++) { 1855 ba->set(i, templ[i]); 1856 } 1857 masm->CheckBitInTable(ba, on_bit_set); 1858 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1859 } 1860 1861 1862 static void CutOutRange(RegExpMacroAssembler* masm, 1863 ZoneList<int>* ranges, 1864 int start_index, 1865 int end_index, 1866 int cut_index, 1867 Label* even_label, 1868 Label* odd_label) { 1869 bool odd = (((cut_index - start_index) & 1) == 1); 1870 Label* in_range_label = odd ? odd_label : even_label; 1871 Label dummy; 1872 EmitDoubleBoundaryTest(masm, 1873 ranges->at(cut_index), 1874 ranges->at(cut_index + 1) - 1, 1875 &dummy, 1876 in_range_label, 1877 &dummy); 1878 DCHECK(!dummy.is_linked()); 1879 // Cut out the single range by rewriting the array. This creates a new 1880 // range that is a merger of the two ranges on either side of the one we 1881 // are cutting out. The oddity of the labels is preserved. 1882 for (int j = cut_index; j > start_index; j--) { 1883 ranges->at(j) = ranges->at(j - 1); 1884 } 1885 for (int j = cut_index + 1; j < end_index; j++) { 1886 ranges->at(j) = ranges->at(j + 1); 1887 } 1888 } 1889 1890 1891 // Unicode case. Split the search space into kSize spaces that are handled 1892 // with recursion. 1893 static void SplitSearchSpace(ZoneList<int>* ranges, 1894 int start_index, 1895 int end_index, 1896 int* new_start_index, 1897 int* new_end_index, 1898 int* border) { 1899 static const int kSize = RegExpMacroAssembler::kTableSize; 1900 static const int kMask = RegExpMacroAssembler::kTableMask; 1901 1902 int first = ranges->at(start_index); 1903 int last = ranges->at(end_index) - 1; 1904 1905 *new_start_index = start_index; 1906 *border = (ranges->at(start_index) & ~kMask) + kSize; 1907 while (*new_start_index < end_index) { 1908 if (ranges->at(*new_start_index) > *border) break; 1909 (*new_start_index)++; 1910 } 1911 // new_start_index is the index of the first edge that is beyond the 1912 // current kSize space. 1913 1914 // For very large search spaces we do a binary chop search of the non-Latin1 1915 // space instead of just going to the end of the current kSize space. The 1916 // heuristics are complicated a little by the fact that any 128-character 1917 // encoding space can be quickly tested with a table lookup, so we don't 1918 // wish to do binary chop search at a smaller granularity than that. A 1919 // 128-character space can take up a lot of space in the ranges array if, 1920 // for example, we only want to match every second character (eg. the lower 1921 // case characters on some Unicode pages). 1922 int binary_chop_index = (end_index + start_index) / 2; 1923 // The first test ensures that we get to the code that handles the Latin1 1924 // range with a single not-taken branch, speeding up this important 1925 // character range (even non-Latin1 charset-based text has spaces and 1926 // punctuation). 1927 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1928 end_index - start_index > (*new_start_index - start_index) * 2 && 1929 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1930 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1931 int scan_forward_for_section_border = binary_chop_index;; 1932 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1933 1934 while (scan_forward_for_section_border < end_index) { 1935 if (ranges->at(scan_forward_for_section_border) > new_border) { 1936 *new_start_index = scan_forward_for_section_border; 1937 *border = new_border; 1938 break; 1939 } 1940 scan_forward_for_section_border++; 1941 } 1942 } 1943 1944 DCHECK(*new_start_index > start_index); 1945 *new_end_index = *new_start_index - 1; 1946 if (ranges->at(*new_end_index) == *border) { 1947 (*new_end_index)--; 1948 } 1949 if (*border >= ranges->at(end_index)) { 1950 *border = ranges->at(end_index); 1951 *new_start_index = end_index; // Won't be used. 1952 *new_end_index = end_index - 1; 1953 } 1954 } 1955 1956 1957 // Gets a series of segment boundaries representing a character class. If the 1958 // character is in the range between an even and an odd boundary (counting from 1959 // start_index) then go to even_label, otherwise go to odd_label. We already 1960 // know that the character is in the range of min_char to max_char inclusive. 1961 // Either label can be NULL indicating backtracking. Either label can also be 1962 // equal to the fall_through label. 1963 static void GenerateBranches(RegExpMacroAssembler* masm, 1964 ZoneList<int>* ranges, 1965 int start_index, 1966 int end_index, 1967 uc16 min_char, 1968 uc16 max_char, 1969 Label* fall_through, 1970 Label* even_label, 1971 Label* odd_label) { 1972 int first = ranges->at(start_index); 1973 int last = ranges->at(end_index) - 1; 1974 1975 DCHECK_LT(min_char, first); 1976 1977 // Just need to test if the character is before or on-or-after 1978 // a particular character. 1979 if (start_index == end_index) { 1980 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1981 return; 1982 } 1983 1984 // Another almost trivial case: There is one interval in the middle that is 1985 // different from the end intervals. 1986 if (start_index + 1 == end_index) { 1987 EmitDoubleBoundaryTest( 1988 masm, first, last, fall_through, even_label, odd_label); 1989 return; 1990 } 1991 1992 // It's not worth using table lookup if there are very few intervals in the 1993 // character class. 1994 if (end_index - start_index <= 6) { 1995 // It is faster to test for individual characters, so we look for those 1996 // first, then try arbitrary ranges in the second round. 1997 static int kNoCutIndex = -1; 1998 int cut = kNoCutIndex; 1999 for (int i = start_index; i < end_index; i++) { 2000 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2001 cut = i; 2002 break; 2003 } 2004 } 2005 if (cut == kNoCutIndex) cut = start_index; 2006 CutOutRange( 2007 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2008 DCHECK_GE(end_index - start_index, 2); 2009 GenerateBranches(masm, 2010 ranges, 2011 start_index + 1, 2012 end_index - 1, 2013 min_char, 2014 max_char, 2015 fall_through, 2016 even_label, 2017 odd_label); 2018 return; 2019 } 2020 2021 // If there are a lot of intervals in the regexp, then we will use tables to 2022 // determine whether the character is inside or outside the character class. 2023 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2024 2025 if ((max_char >> kBits) == (min_char >> kBits)) { 2026 EmitUseLookupTable(masm, 2027 ranges, 2028 start_index, 2029 end_index, 2030 min_char, 2031 fall_through, 2032 even_label, 2033 odd_label); 2034 return; 2035 } 2036 2037 if ((min_char >> kBits) != (first >> kBits)) { 2038 masm->CheckCharacterLT(first, odd_label); 2039 GenerateBranches(masm, 2040 ranges, 2041 start_index + 1, 2042 end_index, 2043 first, 2044 max_char, 2045 fall_through, 2046 odd_label, 2047 even_label); 2048 return; 2049 } 2050 2051 int new_start_index = 0; 2052 int new_end_index = 0; 2053 int border = 0; 2054 2055 SplitSearchSpace(ranges, 2056 start_index, 2057 end_index, 2058 &new_start_index, 2059 &new_end_index, 2060 &border); 2061 2062 Label handle_rest; 2063 Label* above = &handle_rest; 2064 if (border == last + 1) { 2065 // We didn't find any section that started after the limit, so everything 2066 // above the border is one of the terminal labels. 2067 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2068 DCHECK(new_end_index == end_index - 1); 2069 } 2070 2071 DCHECK_LE(start_index, new_end_index); 2072 DCHECK_LE(new_start_index, end_index); 2073 DCHECK_LT(start_index, new_start_index); 2074 DCHECK_LT(new_end_index, end_index); 2075 DCHECK(new_end_index + 1 == new_start_index || 2076 (new_end_index + 2 == new_start_index && 2077 border == ranges->at(new_end_index + 1))); 2078 DCHECK_LT(min_char, border - 1); 2079 DCHECK_LT(border, max_char); 2080 DCHECK_LT(ranges->at(new_end_index), border); 2081 DCHECK(border < ranges->at(new_start_index) || 2082 (border == ranges->at(new_start_index) && 2083 new_start_index == end_index && 2084 new_end_index == end_index - 1 && 2085 border == last + 1)); 2086 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2087 2088 masm->CheckCharacterGT(border - 1, above); 2089 Label dummy; 2090 GenerateBranches(masm, 2091 ranges, 2092 start_index, 2093 new_end_index, 2094 min_char, 2095 border - 1, 2096 &dummy, 2097 even_label, 2098 odd_label); 2099 if (handle_rest.is_linked()) { 2100 masm->Bind(&handle_rest); 2101 bool flip = (new_start_index & 1) != (start_index & 1); 2102 GenerateBranches(masm, 2103 ranges, 2104 new_start_index, 2105 end_index, 2106 border, 2107 max_char, 2108 &dummy, 2109 flip ? odd_label : even_label, 2110 flip ? even_label : odd_label); 2111 } 2112 } 2113 2114 2115 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2116 RegExpCharacterClass* cc, bool one_byte, 2117 Label* on_failure, int cp_offset, bool check_offset, 2118 bool preloaded, Zone* zone) { 2119 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2120 if (!CharacterRange::IsCanonical(ranges)) { 2121 CharacterRange::Canonicalize(ranges); 2122 } 2123 2124 int max_char; 2125 if (one_byte) { 2126 max_char = String::kMaxOneByteCharCode; 2127 } else { 2128 max_char = String::kMaxUtf16CodeUnit; 2129 } 2130 2131 int range_count = ranges->length(); 2132 2133 int last_valid_range = range_count - 1; 2134 while (last_valid_range >= 0) { 2135 CharacterRange& range = ranges->at(last_valid_range); 2136 if (range.from() <= max_char) { 2137 break; 2138 } 2139 last_valid_range--; 2140 } 2141 2142 if (last_valid_range < 0) { 2143 if (!cc->is_negated()) { 2144 macro_assembler->GoTo(on_failure); 2145 } 2146 if (check_offset) { 2147 macro_assembler->CheckPosition(cp_offset, on_failure); 2148 } 2149 return; 2150 } 2151 2152 if (last_valid_range == 0 && 2153 ranges->at(0).IsEverything(max_char)) { 2154 if (cc->is_negated()) { 2155 macro_assembler->GoTo(on_failure); 2156 } else { 2157 // This is a common case hit by non-anchored expressions. 2158 if (check_offset) { 2159 macro_assembler->CheckPosition(cp_offset, on_failure); 2160 } 2161 } 2162 return; 2163 } 2164 if (last_valid_range == 0 && 2165 !cc->is_negated() && 2166 ranges->at(0).IsEverything(max_char)) { 2167 // This is a common case hit by non-anchored expressions. 2168 if (check_offset) { 2169 macro_assembler->CheckPosition(cp_offset, on_failure); 2170 } 2171 return; 2172 } 2173 2174 if (!preloaded) { 2175 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2176 } 2177 2178 if (cc->is_standard(zone) && 2179 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2180 on_failure)) { 2181 return; 2182 } 2183 2184 2185 // A new list with ascending entries. Each entry is a code unit 2186 // where there is a boundary between code units that are part of 2187 // the class and code units that are not. Normally we insert an 2188 // entry at zero which goes to the failure label, but if there 2189 // was already one there we fall through for success on that entry. 2190 // Subsequent entries have alternating meaning (success/failure). 2191 ZoneList<int>* range_boundaries = 2192 new(zone) ZoneList<int>(last_valid_range, zone); 2193 2194 bool zeroth_entry_is_failure = !cc->is_negated(); 2195 2196 for (int i = 0; i <= last_valid_range; i++) { 2197 CharacterRange& range = ranges->at(i); 2198 if (range.from() == 0) { 2199 DCHECK_EQ(i, 0); 2200 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2201 } else { 2202 range_boundaries->Add(range.from(), zone); 2203 } 2204 range_boundaries->Add(range.to() + 1, zone); 2205 } 2206 int end_index = range_boundaries->length() - 1; 2207 if (range_boundaries->at(end_index) > max_char) { 2208 end_index--; 2209 } 2210 2211 Label fall_through; 2212 GenerateBranches(macro_assembler, 2213 range_boundaries, 2214 0, // start_index. 2215 end_index, 2216 0, // min_char. 2217 max_char, 2218 &fall_through, 2219 zeroth_entry_is_failure ? &fall_through : on_failure, 2220 zeroth_entry_is_failure ? on_failure : &fall_through); 2221 macro_assembler->Bind(&fall_through); 2222 } 2223 2224 2225 RegExpNode::~RegExpNode() { 2226 } 2227 2228 2229 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2230 Trace* trace) { 2231 // If we are generating a greedy loop then don't stop and don't reuse code. 2232 if (trace->stop_node() != NULL) { 2233 return CONTINUE; 2234 } 2235 2236 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2237 if (trace->is_trivial()) { 2238 if (label_.is_bound()) { 2239 // We are being asked to generate a generic version, but that's already 2240 // been done so just go to it. 2241 macro_assembler->GoTo(&label_); 2242 return DONE; 2243 } 2244 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) { 2245 // To avoid too deep recursion we push the node to the work queue and just 2246 // generate a goto here. 2247 compiler->AddWork(this); 2248 macro_assembler->GoTo(&label_); 2249 return DONE; 2250 } 2251 // Generate generic version of the node and bind the label for later use. 2252 macro_assembler->Bind(&label_); 2253 return CONTINUE; 2254 } 2255 2256 // We are being asked to make a non-generic version. Keep track of how many 2257 // non-generic versions we generate so as not to overdo it. 2258 trace_count_++; 2259 if (FLAG_regexp_optimization && 2260 trace_count_ < kMaxCopiesCodeGenerated && 2261 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) { 2262 return CONTINUE; 2263 } 2264 2265 // If we get here code has been generated for this node too many times or 2266 // recursion is too deep. Time to switch to a generic version. The code for 2267 // generic versions above can handle deep recursion properly. 2268 trace->Flush(compiler, this); 2269 return DONE; 2270 } 2271 2272 2273 int ActionNode::EatsAtLeast(int still_to_find, 2274 int budget, 2275 bool not_at_start) { 2276 if (budget <= 0) return 0; 2277 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2278 return on_success()->EatsAtLeast(still_to_find, 2279 budget - 1, 2280 not_at_start); 2281 } 2282 2283 2284 void ActionNode::FillInBMInfo(int offset, 2285 int budget, 2286 BoyerMooreLookahead* bm, 2287 bool not_at_start) { 2288 if (action_type_ == BEGIN_SUBMATCH) { 2289 bm->SetRest(offset); 2290 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2291 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2292 } 2293 SaveBMInfo(bm, not_at_start, offset); 2294 } 2295 2296 2297 int AssertionNode::EatsAtLeast(int still_to_find, 2298 int budget, 2299 bool not_at_start) { 2300 if (budget <= 0) return 0; 2301 // If we know we are not at the start and we are asked "how many characters 2302 // will you match if you succeed?" then we can answer anything since false 2303 // implies false. So lets just return the max answer (still_to_find) since 2304 // that won't prevent us from preloading a lot of characters for the other 2305 // branches in the node graph. 2306 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2307 return on_success()->EatsAtLeast(still_to_find, 2308 budget - 1, 2309 not_at_start); 2310 } 2311 2312 2313 void AssertionNode::FillInBMInfo(int offset, 2314 int budget, 2315 BoyerMooreLookahead* bm, 2316 bool not_at_start) { 2317 // Match the behaviour of EatsAtLeast on this node. 2318 if (assertion_type() == AT_START && not_at_start) return; 2319 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start); 2320 SaveBMInfo(bm, not_at_start, offset); 2321 } 2322 2323 2324 int BackReferenceNode::EatsAtLeast(int still_to_find, 2325 int budget, 2326 bool not_at_start) { 2327 if (budget <= 0) return 0; 2328 return on_success()->EatsAtLeast(still_to_find, 2329 budget - 1, 2330 not_at_start); 2331 } 2332 2333 2334 int TextNode::EatsAtLeast(int still_to_find, 2335 int budget, 2336 bool not_at_start) { 2337 int answer = Length(); 2338 if (answer >= still_to_find) return answer; 2339 if (budget <= 0) return answer; 2340 // We are not at start after this node so we set the last argument to 'true'. 2341 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2342 budget - 1, 2343 true); 2344 } 2345 2346 2347 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find, 2348 int budget, 2349 bool not_at_start) { 2350 if (budget <= 0) return 0; 2351 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2352 // afterwards. 2353 RegExpNode* node = alternatives_->at(1).node(); 2354 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2355 } 2356 2357 2358 void NegativeLookaheadChoiceNode::GetQuickCheckDetails( 2359 QuickCheckDetails* details, 2360 RegExpCompiler* compiler, 2361 int filled_in, 2362 bool not_at_start) { 2363 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2364 // afterwards. 2365 RegExpNode* node = alternatives_->at(1).node(); 2366 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2367 } 2368 2369 2370 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2371 int budget, 2372 RegExpNode* ignore_this_node, 2373 bool not_at_start) { 2374 if (budget <= 0) return 0; 2375 int min = 100; 2376 int choice_count = alternatives_->length(); 2377 budget = (budget - 1) / choice_count; 2378 for (int i = 0; i < choice_count; i++) { 2379 RegExpNode* node = alternatives_->at(i).node(); 2380 if (node == ignore_this_node) continue; 2381 int node_eats_at_least = 2382 node->EatsAtLeast(still_to_find, budget, not_at_start); 2383 if (node_eats_at_least < min) min = node_eats_at_least; 2384 if (min == 0) return 0; 2385 } 2386 return min; 2387 } 2388 2389 2390 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2391 int budget, 2392 bool not_at_start) { 2393 return EatsAtLeastHelper(still_to_find, 2394 budget - 1, 2395 loop_node_, 2396 not_at_start); 2397 } 2398 2399 2400 int ChoiceNode::EatsAtLeast(int still_to_find, 2401 int budget, 2402 bool not_at_start) { 2403 return EatsAtLeastHelper(still_to_find, 2404 budget, 2405 NULL, 2406 not_at_start); 2407 } 2408 2409 2410 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2411 static inline uint32_t SmearBitsRight(uint32_t v) { 2412 v |= v >> 1; 2413 v |= v >> 2; 2414 v |= v >> 4; 2415 v |= v >> 8; 2416 v |= v >> 16; 2417 return v; 2418 } 2419 2420 2421 bool QuickCheckDetails::Rationalize(bool asc) { 2422 bool found_useful_op = false; 2423 uint32_t char_mask; 2424 if (asc) { 2425 char_mask = String::kMaxOneByteCharCode; 2426 } else { 2427 char_mask = String::kMaxUtf16CodeUnit; 2428 } 2429 mask_ = 0; 2430 value_ = 0; 2431 int char_shift = 0; 2432 for (int i = 0; i < characters_; i++) { 2433 Position* pos = &positions_[i]; 2434 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2435 found_useful_op = true; 2436 } 2437 mask_ |= (pos->mask & char_mask) << char_shift; 2438 value_ |= (pos->value & char_mask) << char_shift; 2439 char_shift += asc ? 8 : 16; 2440 } 2441 return found_useful_op; 2442 } 2443 2444 2445 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2446 Trace* bounds_check_trace, 2447 Trace* trace, 2448 bool preload_has_checked_bounds, 2449 Label* on_possible_success, 2450 QuickCheckDetails* details, 2451 bool fall_through_on_failure) { 2452 if (details->characters() == 0) return false; 2453 GetQuickCheckDetails( 2454 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2455 if (details->cannot_match()) return false; 2456 if (!details->Rationalize(compiler->one_byte())) return false; 2457 DCHECK(details->characters() == 1 || 2458 compiler->macro_assembler()->CanReadUnaligned()); 2459 uint32_t mask = details->mask(); 2460 uint32_t value = details->value(); 2461 2462 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2463 2464 if (trace->characters_preloaded() != details->characters()) { 2465 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 2466 // We are attempting to preload the minimum number of characters 2467 // any choice would eat, so if the bounds check fails, then none of the 2468 // choices can succeed, so we can just immediately backtrack, rather 2469 // than go to the next choice. 2470 assembler->LoadCurrentCharacter(trace->cp_offset(), 2471 bounds_check_trace->backtrack(), 2472 !preload_has_checked_bounds, 2473 details->characters()); 2474 } 2475 2476 2477 bool need_mask = true; 2478 2479 if (details->characters() == 1) { 2480 // If number of characters preloaded is 1 then we used a byte or 16 bit 2481 // load so the value is already masked down. 2482 uint32_t char_mask; 2483 if (compiler->one_byte()) { 2484 char_mask = String::kMaxOneByteCharCode; 2485 } else { 2486 char_mask = String::kMaxUtf16CodeUnit; 2487 } 2488 if ((mask & char_mask) == char_mask) need_mask = false; 2489 mask &= char_mask; 2490 } else { 2491 // For 2-character preloads in one-byte mode or 1-character preloads in 2492 // two-byte mode we also use a 16 bit load with zero extend. 2493 if (details->characters() == 2 && compiler->one_byte()) { 2494 if ((mask & 0xffff) == 0xffff) need_mask = false; 2495 } else if (details->characters() == 1 && !compiler->one_byte()) { 2496 if ((mask & 0xffff) == 0xffff) need_mask = false; 2497 } else { 2498 if (mask == 0xffffffff) need_mask = false; 2499 } 2500 } 2501 2502 if (fall_through_on_failure) { 2503 if (need_mask) { 2504 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2505 } else { 2506 assembler->CheckCharacter(value, on_possible_success); 2507 } 2508 } else { 2509 if (need_mask) { 2510 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2511 } else { 2512 assembler->CheckNotCharacter(value, trace->backtrack()); 2513 } 2514 } 2515 return true; 2516 } 2517 2518 2519 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2520 // super-class in the .h file). 2521 // 2522 // We iterate along the text object, building up for each character a 2523 // mask and value that can be used to test for a quick failure to match. 2524 // The masks and values for the positions will be combined into a single 2525 // machine word for the current character width in order to be used in 2526 // generating a quick check. 2527 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2528 RegExpCompiler* compiler, 2529 int characters_filled_in, 2530 bool not_at_start) { 2531 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); 2532 DCHECK(characters_filled_in < details->characters()); 2533 int characters = details->characters(); 2534 int char_mask; 2535 if (compiler->one_byte()) { 2536 char_mask = String::kMaxOneByteCharCode; 2537 } else { 2538 char_mask = String::kMaxUtf16CodeUnit; 2539 } 2540 for (int k = 0; k < elms_->length(); k++) { 2541 TextElement elm = elms_->at(k); 2542 if (elm.text_type() == TextElement::ATOM) { 2543 Vector<const uc16> quarks = elm.atom()->data(); 2544 for (int i = 0; i < characters && i < quarks.length(); i++) { 2545 QuickCheckDetails::Position* pos = 2546 details->positions(characters_filled_in); 2547 uc16 c = quarks[i]; 2548 if (c > char_mask) { 2549 // If we expect a non-Latin1 character from an one-byte string, 2550 // there is no way we can match. Not even case-independent 2551 // matching can turn an Latin1 character into non-Latin1 or 2552 // vice versa. 2553 // TODO(dcarney): issue 3550. Verify that this works as expected. 2554 // For example, \u0178 is uppercase of \u00ff (y-umlaut). 2555 details->set_cannot_match(); 2556 pos->determines_perfectly = false; 2557 return; 2558 } 2559 if (compiler->ignore_case()) { 2560 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2561 int length = GetCaseIndependentLetters(isolate, c, 2562 compiler->one_byte(), chars); 2563 DCHECK(length != 0); // Can only happen if c > char_mask (see above). 2564 if (length == 1) { 2565 // This letter has no case equivalents, so it's nice and simple 2566 // and the mask-compare will determine definitely whether we have 2567 // a match at this character position. 2568 pos->mask = char_mask; 2569 pos->value = c; 2570 pos->determines_perfectly = true; 2571 } else { 2572 uint32_t common_bits = char_mask; 2573 uint32_t bits = chars[0]; 2574 for (int j = 1; j < length; j++) { 2575 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2576 common_bits ^= differing_bits; 2577 bits &= common_bits; 2578 } 2579 // If length is 2 and common bits has only one zero in it then 2580 // our mask and compare instruction will determine definitely 2581 // whether we have a match at this character position. Otherwise 2582 // it can only be an approximate check. 2583 uint32_t one_zero = (common_bits | ~char_mask); 2584 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2585 pos->determines_perfectly = true; 2586 } 2587 pos->mask = common_bits; 2588 pos->value = bits; 2589 } 2590 } else { 2591 // Don't ignore case. Nice simple case where the mask-compare will 2592 // determine definitely whether we have a match at this character 2593 // position. 2594 pos->mask = char_mask; 2595 pos->value = c; 2596 pos->determines_perfectly = true; 2597 } 2598 characters_filled_in++; 2599 DCHECK(characters_filled_in <= details->characters()); 2600 if (characters_filled_in == details->characters()) { 2601 return; 2602 } 2603 } 2604 } else { 2605 QuickCheckDetails::Position* pos = 2606 details->positions(characters_filled_in); 2607 RegExpCharacterClass* tree = elm.char_class(); 2608 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2609 if (tree->is_negated()) { 2610 // A quick check uses multi-character mask and compare. There is no 2611 // useful way to incorporate a negative char class into this scheme 2612 // so we just conservatively create a mask and value that will always 2613 // succeed. 2614 pos->mask = 0; 2615 pos->value = 0; 2616 } else { 2617 int first_range = 0; 2618 while (ranges->at(first_range).from() > char_mask) { 2619 first_range++; 2620 if (first_range == ranges->length()) { 2621 details->set_cannot_match(); 2622 pos->determines_perfectly = false; 2623 return; 2624 } 2625 } 2626 CharacterRange range = ranges->at(first_range); 2627 uc16 from = range.from(); 2628 uc16 to = range.to(); 2629 if (to > char_mask) { 2630 to = char_mask; 2631 } 2632 uint32_t differing_bits = (from ^ to); 2633 // A mask and compare is only perfect if the differing bits form a 2634 // number like 00011111 with one single block of trailing 1s. 2635 if ((differing_bits & (differing_bits + 1)) == 0 && 2636 from + differing_bits == to) { 2637 pos->determines_perfectly = true; 2638 } 2639 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2640 uint32_t bits = (from & common_bits); 2641 for (int i = first_range + 1; i < ranges->length(); i++) { 2642 CharacterRange range = ranges->at(i); 2643 uc16 from = range.from(); 2644 uc16 to = range.to(); 2645 if (from > char_mask) continue; 2646 if (to > char_mask) to = char_mask; 2647 // Here we are combining more ranges into the mask and compare 2648 // value. With each new range the mask becomes more sparse and 2649 // so the chances of a false positive rise. A character class 2650 // with multiple ranges is assumed never to be equivalent to a 2651 // mask and compare operation. 2652 pos->determines_perfectly = false; 2653 uint32_t new_common_bits = (from ^ to); 2654 new_common_bits = ~SmearBitsRight(new_common_bits); 2655 common_bits &= new_common_bits; 2656 bits &= new_common_bits; 2657 uint32_t differing_bits = (from & common_bits) ^ bits; 2658 common_bits ^= differing_bits; 2659 bits &= common_bits; 2660 } 2661 pos->mask = common_bits; 2662 pos->value = bits; 2663 } 2664 characters_filled_in++; 2665 DCHECK(characters_filled_in <= details->characters()); 2666 if (characters_filled_in == details->characters()) { 2667 return; 2668 } 2669 } 2670 } 2671 DCHECK(characters_filled_in != details->characters()); 2672 if (!details->cannot_match()) { 2673 on_success()-> GetQuickCheckDetails(details, 2674 compiler, 2675 characters_filled_in, 2676 true); 2677 } 2678 } 2679 2680 2681 void QuickCheckDetails::Clear() { 2682 for (int i = 0; i < characters_; i++) { 2683 positions_[i].mask = 0; 2684 positions_[i].value = 0; 2685 positions_[i].determines_perfectly = false; 2686 } 2687 characters_ = 0; 2688 } 2689 2690 2691 void QuickCheckDetails::Advance(int by, bool one_byte) { 2692 DCHECK(by >= 0); 2693 if (by >= characters_) { 2694 Clear(); 2695 return; 2696 } 2697 for (int i = 0; i < characters_ - by; i++) { 2698 positions_[i] = positions_[by + i]; 2699 } 2700 for (int i = characters_ - by; i < characters_; i++) { 2701 positions_[i].mask = 0; 2702 positions_[i].value = 0; 2703 positions_[i].determines_perfectly = false; 2704 } 2705 characters_ -= by; 2706 // We could change mask_ and value_ here but we would never advance unless 2707 // they had already been used in a check and they won't be used again because 2708 // it would gain us nothing. So there's no point. 2709 } 2710 2711 2712 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2713 DCHECK(characters_ == other->characters_); 2714 if (other->cannot_match_) { 2715 return; 2716 } 2717 if (cannot_match_) { 2718 *this = *other; 2719 return; 2720 } 2721 for (int i = from_index; i < characters_; i++) { 2722 QuickCheckDetails::Position* pos = positions(i); 2723 QuickCheckDetails::Position* other_pos = other->positions(i); 2724 if (pos->mask != other_pos->mask || 2725 pos->value != other_pos->value || 2726 !other_pos->determines_perfectly) { 2727 // Our mask-compare operation will be approximate unless we have the 2728 // exact same operation on both sides of the alternation. 2729 pos->determines_perfectly = false; 2730 } 2731 pos->mask &= other_pos->mask; 2732 pos->value &= pos->mask; 2733 other_pos->value &= pos->mask; 2734 uc16 differing_bits = (pos->value ^ other_pos->value); 2735 pos->mask &= ~differing_bits; 2736 pos->value &= pos->mask; 2737 } 2738 } 2739 2740 2741 class VisitMarker { 2742 public: 2743 explicit VisitMarker(NodeInfo* info) : info_(info) { 2744 DCHECK(!info->visited); 2745 info->visited = true; 2746 } 2747 ~VisitMarker() { 2748 info_->visited = false; 2749 } 2750 private: 2751 NodeInfo* info_; 2752 }; 2753 2754 2755 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { 2756 if (info()->replacement_calculated) return replacement(); 2757 if (depth < 0) return this; 2758 DCHECK(!info()->visited); 2759 VisitMarker marker(info()); 2760 return FilterSuccessor(depth - 1, ignore_case); 2761 } 2762 2763 2764 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2765 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2766 if (next == NULL) return set_replacement(NULL); 2767 on_success_ = next; 2768 return set_replacement(this); 2769 } 2770 2771 2772 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2773 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2774 // TODO(dcarney): this could be a lot more efficient. 2775 return range.Contains(0x39c) || 2776 range.Contains(0x3bc) || range.Contains(0x178); 2777 } 2778 2779 2780 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2781 for (int i = 0; i < ranges->length(); i++) { 2782 // TODO(dcarney): this could be a lot more efficient. 2783 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2784 } 2785 return false; 2786 } 2787 2788 2789 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2790 if (info()->replacement_calculated) return replacement(); 2791 if (depth < 0) return this; 2792 DCHECK(!info()->visited); 2793 VisitMarker marker(info()); 2794 int element_count = elms_->length(); 2795 for (int i = 0; i < element_count; i++) { 2796 TextElement elm = elms_->at(i); 2797 if (elm.text_type() == TextElement::ATOM) { 2798 Vector<const uc16> quarks = elm.atom()->data(); 2799 for (int j = 0; j < quarks.length(); j++) { 2800 uint16_t c = quarks[j]; 2801 if (c <= String::kMaxOneByteCharCode) continue; 2802 if (!ignore_case) return set_replacement(NULL); 2803 // Here, we need to check for characters whose upper and lower cases 2804 // are outside the Latin-1 range. 2805 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2806 // Character is outside Latin-1 completely 2807 if (converted == 0) return set_replacement(NULL); 2808 // Convert quark to Latin-1 in place. 2809 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2810 copy[j] = converted; 2811 } 2812 } else { 2813 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2814 RegExpCharacterClass* cc = elm.char_class(); 2815 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2816 if (!CharacterRange::IsCanonical(ranges)) { 2817 CharacterRange::Canonicalize(ranges); 2818 } 2819 // Now they are in order so we only need to look at the first. 2820 int range_count = ranges->length(); 2821 if (cc->is_negated()) { 2822 if (range_count != 0 && 2823 ranges->at(0).from() == 0 && 2824 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2825 // This will be handled in a later filter. 2826 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2827 return set_replacement(NULL); 2828 } 2829 } else { 2830 if (range_count == 0 || 2831 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2832 // This will be handled in a later filter. 2833 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2834 return set_replacement(NULL); 2835 } 2836 } 2837 } 2838 } 2839 return FilterSuccessor(depth - 1, ignore_case); 2840 } 2841 2842 2843 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2844 if (info()->replacement_calculated) return replacement(); 2845 if (depth < 0) return this; 2846 if (info()->visited) return this; 2847 { 2848 VisitMarker marker(info()); 2849 2850 RegExpNode* continue_replacement = 2851 continue_node_->FilterOneByte(depth - 1, ignore_case); 2852 // If we can't continue after the loop then there is no sense in doing the 2853 // loop. 2854 if (continue_replacement == NULL) return set_replacement(NULL); 2855 } 2856 2857 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2858 } 2859 2860 2861 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2862 if (info()->replacement_calculated) return replacement(); 2863 if (depth < 0) return this; 2864 if (info()->visited) return this; 2865 VisitMarker marker(info()); 2866 int choice_count = alternatives_->length(); 2867 2868 for (int i = 0; i < choice_count; i++) { 2869 GuardedAlternative alternative = alternatives_->at(i); 2870 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2871 set_replacement(this); 2872 return this; 2873 } 2874 } 2875 2876 int surviving = 0; 2877 RegExpNode* survivor = NULL; 2878 for (int i = 0; i < choice_count; i++) { 2879 GuardedAlternative alternative = alternatives_->at(i); 2880 RegExpNode* replacement = 2881 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2882 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2883 if (replacement != NULL) { 2884 alternatives_->at(i).set_node(replacement); 2885 surviving++; 2886 survivor = replacement; 2887 } 2888 } 2889 if (surviving < 2) return set_replacement(survivor); 2890 2891 set_replacement(this); 2892 if (surviving == choice_count) { 2893 return this; 2894 } 2895 // Only some of the nodes survived the filtering. We need to rebuild the 2896 // alternatives list. 2897 ZoneList<GuardedAlternative>* new_alternatives = 2898 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2899 for (int i = 0; i < choice_count; i++) { 2900 RegExpNode* replacement = 2901 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2902 if (replacement != NULL) { 2903 alternatives_->at(i).set_node(replacement); 2904 new_alternatives->Add(alternatives_->at(i), zone()); 2905 } 2906 } 2907 alternatives_ = new_alternatives; 2908 return this; 2909 } 2910 2911 2912 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth, 2913 bool ignore_case) { 2914 if (info()->replacement_calculated) return replacement(); 2915 if (depth < 0) return this; 2916 if (info()->visited) return this; 2917 VisitMarker marker(info()); 2918 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2919 // afterwards. 2920 RegExpNode* node = alternatives_->at(1).node(); 2921 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2922 if (replacement == NULL) return set_replacement(NULL); 2923 alternatives_->at(1).set_node(replacement); 2924 2925 RegExpNode* neg_node = alternatives_->at(0).node(); 2926 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2927 // If the negative lookahead is always going to fail then 2928 // we don't need to check it. 2929 if (neg_replacement == NULL) return set_replacement(replacement); 2930 alternatives_->at(0).set_node(neg_replacement); 2931 return set_replacement(this); 2932 } 2933 2934 2935 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2936 RegExpCompiler* compiler, 2937 int characters_filled_in, 2938 bool not_at_start) { 2939 if (body_can_be_zero_length_ || info()->visited) return; 2940 VisitMarker marker(info()); 2941 return ChoiceNode::GetQuickCheckDetails(details, 2942 compiler, 2943 characters_filled_in, 2944 not_at_start); 2945 } 2946 2947 2948 void LoopChoiceNode::FillInBMInfo(int offset, 2949 int budget, 2950 BoyerMooreLookahead* bm, 2951 bool not_at_start) { 2952 if (body_can_be_zero_length_ || budget <= 0) { 2953 bm->SetRest(offset); 2954 SaveBMInfo(bm, not_at_start, offset); 2955 return; 2956 } 2957 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start); 2958 SaveBMInfo(bm, not_at_start, offset); 2959 } 2960 2961 2962 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2963 RegExpCompiler* compiler, 2964 int characters_filled_in, 2965 bool not_at_start) { 2966 not_at_start = (not_at_start || not_at_start_); 2967 int choice_count = alternatives_->length(); 2968 DCHECK(choice_count > 0); 2969 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2970 compiler, 2971 characters_filled_in, 2972 not_at_start); 2973 for (int i = 1; i < choice_count; i++) { 2974 QuickCheckDetails new_details(details->characters()); 2975 RegExpNode* node = alternatives_->at(i).node(); 2976 node->GetQuickCheckDetails(&new_details, compiler, 2977 characters_filled_in, 2978 not_at_start); 2979 // Here we merge the quick match details of the two branches. 2980 details->Merge(&new_details, characters_filled_in); 2981 } 2982 } 2983 2984 2985 // Check for [0-9A-Z_a-z]. 2986 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2987 Label* word, 2988 Label* non_word, 2989 bool fall_through_on_word) { 2990 if (assembler->CheckSpecialCharacterClass( 2991 fall_through_on_word ? 'w' : 'W', 2992 fall_through_on_word ? non_word : word)) { 2993 // Optimized implementation available. 2994 return; 2995 } 2996 assembler->CheckCharacterGT('z', non_word); 2997 assembler->CheckCharacterLT('0', non_word); 2998 assembler->CheckCharacterGT('a' - 1, word); 2999 assembler->CheckCharacterLT('9' + 1, word); 3000 assembler->CheckCharacterLT('A', non_word); 3001 assembler->CheckCharacterLT('Z' + 1, word); 3002 if (fall_through_on_word) { 3003 assembler->CheckNotCharacter('_', non_word); 3004 } else { 3005 assembler->CheckCharacter('_', word); 3006 } 3007 } 3008 3009 3010 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 3011 // that matches newline or the start of input). 3012 static void EmitHat(RegExpCompiler* compiler, 3013 RegExpNode* on_success, 3014 Trace* trace) { 3015 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3016 // We will be loading the previous character into the current character 3017 // register. 3018 Trace new_trace(*trace); 3019 new_trace.InvalidateCurrentCharacter(); 3020 3021 Label ok; 3022 if (new_trace.cp_offset() == 0) { 3023 // The start of input counts as a newline in this context, so skip to 3024 // ok if we are at the start. 3025 assembler->CheckAtStart(&ok); 3026 } 3027 // We already checked that we are not at the start of input so it must be 3028 // OK to load the previous character. 3029 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3030 new_trace.backtrack(), 3031 false); 3032 if (!assembler->CheckSpecialCharacterClass('n', 3033 new_trace.backtrack())) { 3034 // Newline means \n, \r, 0x2028 or 0x2029. 3035 if (!compiler->one_byte()) { 3036 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3037 } 3038 assembler->CheckCharacter('\n', &ok); 3039 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3040 } 3041 assembler->Bind(&ok); 3042 on_success->Emit(compiler, &new_trace); 3043 } 3044 3045 3046 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3047 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3048 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3049 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3050 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3051 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3052 if (lookahead == NULL) { 3053 int eats_at_least = 3054 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3055 kRecursionBudget, 3056 not_at_start)); 3057 if (eats_at_least >= 1) { 3058 BoyerMooreLookahead* bm = 3059 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3060 FillInBMInfo(0, kRecursionBudget, bm, not_at_start); 3061 if (bm->at(0)->is_non_word()) 3062 next_is_word_character = Trace::FALSE_VALUE; 3063 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3064 } 3065 } else { 3066 if (lookahead->at(0)->is_non_word()) 3067 next_is_word_character = Trace::FALSE_VALUE; 3068 if (lookahead->at(0)->is_word()) 3069 next_is_word_character = Trace::TRUE_VALUE; 3070 } 3071 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3072 if (next_is_word_character == Trace::UNKNOWN) { 3073 Label before_non_word; 3074 Label before_word; 3075 if (trace->characters_preloaded() != 1) { 3076 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3077 } 3078 // Fall through on non-word. 3079 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3080 // Next character is not a word character. 3081 assembler->Bind(&before_non_word); 3082 Label ok; 3083 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3084 assembler->GoTo(&ok); 3085 3086 assembler->Bind(&before_word); 3087 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3088 assembler->Bind(&ok); 3089 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3090 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3091 } else { 3092 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3094 } 3095 } 3096 3097 3098 void AssertionNode::BacktrackIfPrevious( 3099 RegExpCompiler* compiler, 3100 Trace* trace, 3101 AssertionNode::IfPrevious backtrack_if_previous) { 3102 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3103 Trace new_trace(*trace); 3104 new_trace.InvalidateCurrentCharacter(); 3105 3106 Label fall_through, dummy; 3107 3108 Label* non_word = backtrack_if_previous == kIsNonWord ? 3109 new_trace.backtrack() : 3110 &fall_through; 3111 Label* word = backtrack_if_previous == kIsNonWord ? 3112 &fall_through : 3113 new_trace.backtrack(); 3114 3115 if (new_trace.cp_offset() == 0) { 3116 // The start of input counts as a non-word character, so the question is 3117 // decided if we are at the start. 3118 assembler->CheckAtStart(non_word); 3119 } 3120 // We already checked that we are not at the start of input so it must be 3121 // OK to load the previous character. 3122 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3123 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3124 3125 assembler->Bind(&fall_through); 3126 on_success()->Emit(compiler, &new_trace); 3127 } 3128 3129 3130 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3131 RegExpCompiler* compiler, 3132 int filled_in, 3133 bool not_at_start) { 3134 if (assertion_type_ == AT_START && not_at_start) { 3135 details->set_cannot_match(); 3136 return; 3137 } 3138 return on_success()->GetQuickCheckDetails(details, 3139 compiler, 3140 filled_in, 3141 not_at_start); 3142 } 3143 3144 3145 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3146 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3147 switch (assertion_type_) { 3148 case AT_END: { 3149 Label ok; 3150 assembler->CheckPosition(trace->cp_offset(), &ok); 3151 assembler->GoTo(trace->backtrack()); 3152 assembler->Bind(&ok); 3153 break; 3154 } 3155 case AT_START: { 3156 if (trace->at_start() == Trace::FALSE_VALUE) { 3157 assembler->GoTo(trace->backtrack()); 3158 return; 3159 } 3160 if (trace->at_start() == Trace::UNKNOWN) { 3161 assembler->CheckNotAtStart(trace->backtrack()); 3162 Trace at_start_trace = *trace; 3163 at_start_trace.set_at_start(true); 3164 on_success()->Emit(compiler, &at_start_trace); 3165 return; 3166 } 3167 } 3168 break; 3169 case AFTER_NEWLINE: 3170 EmitHat(compiler, on_success(), trace); 3171 return; 3172 case AT_BOUNDARY: 3173 case AT_NON_BOUNDARY: { 3174 EmitBoundaryCheck(compiler, trace); 3175 return; 3176 } 3177 } 3178 on_success()->Emit(compiler, trace); 3179 } 3180 3181 3182 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3183 if (quick_check == NULL) return false; 3184 if (offset >= quick_check->characters()) return false; 3185 return quick_check->positions(offset)->determines_perfectly; 3186 } 3187 3188 3189 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3190 if (index > *checked_up_to) { 3191 *checked_up_to = index; 3192 } 3193 } 3194 3195 3196 // We call this repeatedly to generate code for each pass over the text node. 3197 // The passes are in increasing order of difficulty because we hope one 3198 // of the first passes will fail in which case we are saved the work of the 3199 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3200 // we will check the '%' in the first pass, the case independent 'a' in the 3201 // second pass and the character class in the last pass. 3202 // 3203 // The passes are done from right to left, so for example to test for /bar/ 3204 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3205 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3206 // bounds check most of the time. In the example we only need to check for 3207 // end-of-input when loading the putative 'r'. 3208 // 3209 // A slight complication involves the fact that the first character may already 3210 // be fetched into a register by the previous node. In this case we want to 3211 // do the test for that character first. We do this in separate passes. The 3212 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3213 // pass has been performed then subsequent passes will have true in 3214 // first_element_checked to indicate that that character does not need to be 3215 // checked again. 3216 // 3217 // In addition to all this we are passed a Trace, which can 3218 // contain an AlternativeGeneration object. In this AlternativeGeneration 3219 // object we can see details of any quick check that was already passed in 3220 // order to get to the code we are now generating. The quick check can involve 3221 // loading characters, which means we do not need to recheck the bounds 3222 // up to the limit the quick check already checked. In addition the quick 3223 // check can have involved a mask and compare operation which may simplify 3224 // or obviate the need for further checks at some character positions. 3225 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3226 TextEmitPassType pass, 3227 bool preloaded, 3228 Trace* trace, 3229 bool first_element_checked, 3230 int* checked_up_to) { 3231 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3232 Isolate* isolate = assembler->zone()->isolate(); 3233 bool one_byte = compiler->one_byte(); 3234 Label* backtrack = trace->backtrack(); 3235 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3236 int element_count = elms_->length(); 3237 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3238 TextElement elm = elms_->at(i); 3239 int cp_offset = trace->cp_offset() + elm.cp_offset(); 3240 if (elm.text_type() == TextElement::ATOM) { 3241 Vector<const uc16> quarks = elm.atom()->data(); 3242 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3243 if (first_element_checked && i == 0 && j == 0) continue; 3244 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3245 EmitCharacterFunction* emit_function = NULL; 3246 switch (pass) { 3247 case NON_LATIN1_MATCH: 3248 DCHECK(one_byte); 3249 if (quarks[j] > String::kMaxOneByteCharCode) { 3250 assembler->GoTo(backtrack); 3251 return; 3252 } 3253 break; 3254 case NON_LETTER_CHARACTER_MATCH: 3255 emit_function = &EmitAtomNonLetter; 3256 break; 3257 case SIMPLE_CHARACTER_MATCH: 3258 emit_function = &EmitSimpleCharacter; 3259 break; 3260 case CASE_CHARACTER_MATCH: 3261 emit_function = &EmitAtomLetter; 3262 break; 3263 default: 3264 break; 3265 } 3266 if (emit_function != NULL) { 3267 bool bound_checked = emit_function(isolate, 3268 compiler, 3269 quarks[j], 3270 backtrack, 3271 cp_offset + j, 3272 *checked_up_to < cp_offset + j, 3273 preloaded); 3274 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3275 } 3276 } 3277 } else { 3278 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3279 if (pass == CHARACTER_CLASS_MATCH) { 3280 if (first_element_checked && i == 0) continue; 3281 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3282 RegExpCharacterClass* cc = elm.char_class(); 3283 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 3284 *checked_up_to < cp_offset, preloaded, zone()); 3285 UpdateBoundsCheck(cp_offset, checked_up_to); 3286 } 3287 } 3288 } 3289 } 3290 3291 3292 int TextNode::Length() { 3293 TextElement elm = elms_->last(); 3294 DCHECK(elm.cp_offset() >= 0); 3295 return elm.cp_offset() + elm.length(); 3296 } 3297 3298 3299 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3300 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3301 if (ignore_case) { 3302 return pass == SIMPLE_CHARACTER_MATCH; 3303 } else { 3304 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3305 } 3306 } 3307 3308 3309 // This generates the code to match a text node. A text node can contain 3310 // straight character sequences (possibly to be matched in a case-independent 3311 // way) and character classes. For efficiency we do not do this in a single 3312 // pass from left to right. Instead we pass over the text node several times, 3313 // emitting code for some character positions every time. See the comment on 3314 // TextEmitPass for details. 3315 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3316 LimitResult limit_result = LimitVersions(compiler, trace); 3317 if (limit_result == DONE) return; 3318 DCHECK(limit_result == CONTINUE); 3319 3320 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3321 compiler->SetRegExpTooBig(); 3322 return; 3323 } 3324 3325 if (compiler->one_byte()) { 3326 int dummy = 0; 3327 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3328 } 3329 3330 bool first_elt_done = false; 3331 int bound_checked_to = trace->cp_offset() - 1; 3332 bound_checked_to += trace->bound_checked_up_to(); 3333 3334 // If a character is preloaded into the current character register then 3335 // check that now. 3336 if (trace->characters_preloaded() == 1) { 3337 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3338 if (!SkipPass(pass, compiler->ignore_case())) { 3339 TextEmitPass(compiler, 3340 static_cast<TextEmitPassType>(pass), 3341 true, 3342 trace, 3343 false, 3344 &bound_checked_to); 3345 } 3346 } 3347 first_elt_done = true; 3348 } 3349 3350 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3351 if (!SkipPass(pass, compiler->ignore_case())) { 3352 TextEmitPass(compiler, 3353 static_cast<TextEmitPassType>(pass), 3354 false, 3355 trace, 3356 first_elt_done, 3357 &bound_checked_to); 3358 } 3359 } 3360 3361 Trace successor_trace(*trace); 3362 successor_trace.set_at_start(false); 3363 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); 3364 RecursionCheck rc(compiler); 3365 on_success()->Emit(compiler, &successor_trace); 3366 } 3367 3368 3369 void Trace::InvalidateCurrentCharacter() { 3370 characters_preloaded_ = 0; 3371 } 3372 3373 3374 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3375 DCHECK(by > 0); 3376 // We don't have an instruction for shifting the current character register 3377 // down or for using a shifted value for anything so lets just forget that 3378 // we preloaded any characters into it. 3379 characters_preloaded_ = 0; 3380 // Adjust the offsets of the quick check performed information. This 3381 // information is used to find out what we already determined about the 3382 // characters by means of mask and compare. 3383 quick_check_performed_.Advance(by, compiler->one_byte()); 3384 cp_offset_ += by; 3385 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3386 compiler->SetRegExpTooBig(); 3387 cp_offset_ = 0; 3388 } 3389 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3390 } 3391 3392 3393 void TextNode::MakeCaseIndependent(bool is_one_byte) { 3394 int element_count = elms_->length(); 3395 for (int i = 0; i < element_count; i++) { 3396 TextElement elm = elms_->at(i); 3397 if (elm.text_type() == TextElement::CHAR_CLASS) { 3398 RegExpCharacterClass* cc = elm.char_class(); 3399 // None of the standard character classes is different in the case 3400 // independent case and it slows us down if we don't know that. 3401 if (cc->is_standard(zone())) continue; 3402 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3403 int range_count = ranges->length(); 3404 for (int j = 0; j < range_count; j++) { 3405 ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone()); 3406 } 3407 } 3408 } 3409 } 3410 3411 3412 int TextNode::GreedyLoopTextLength() { 3413 TextElement elm = elms_->at(elms_->length() - 1); 3414 return elm.cp_offset() + elm.length(); 3415 } 3416 3417 3418 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3419 RegExpCompiler* compiler) { 3420 if (elms_->length() != 1) return NULL; 3421 TextElement elm = elms_->at(0); 3422 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3423 RegExpCharacterClass* node = elm.char_class(); 3424 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3425 if (!CharacterRange::IsCanonical(ranges)) { 3426 CharacterRange::Canonicalize(ranges); 3427 } 3428 if (node->is_negated()) { 3429 return ranges->length() == 0 ? on_success() : NULL; 3430 } 3431 if (ranges->length() != 1) return NULL; 3432 uint32_t max_char; 3433 if (compiler->one_byte()) { 3434 max_char = String::kMaxOneByteCharCode; 3435 } else { 3436 max_char = String::kMaxUtf16CodeUnit; 3437 } 3438 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3439 } 3440 3441 3442 // Finds the fixed match length of a sequence of nodes that goes from 3443 // this alternative and back to this choice node. If there are variable 3444 // length nodes or other complications in the way then return a sentinel 3445 // value indicating that a greedy loop cannot be constructed. 3446 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3447 GuardedAlternative* alternative) { 3448 int length = 0; 3449 RegExpNode* node = alternative->node(); 3450 // Later we will generate code for all these text nodes using recursion 3451 // so we have to limit the max number. 3452 int recursion_depth = 0; 3453 while (node != this) { 3454 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3455 return kNodeIsTooComplexForGreedyLoops; 3456 } 3457 int node_length = node->GreedyLoopTextLength(); 3458 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3459 return kNodeIsTooComplexForGreedyLoops; 3460 } 3461 length += node_length; 3462 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3463 node = seq_node->on_success(); 3464 } 3465 return length; 3466 } 3467 3468 3469 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3470 DCHECK_EQ(loop_node_, NULL); 3471 AddAlternative(alt); 3472 loop_node_ = alt.node(); 3473 } 3474 3475 3476 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3477 DCHECK_EQ(continue_node_, NULL); 3478 AddAlternative(alt); 3479 continue_node_ = alt.node(); 3480 } 3481 3482 3483 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3484 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3485 if (trace->stop_node() == this) { 3486 // Back edge of greedy optimized loop node graph. 3487 int text_length = 3488 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3489 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops); 3490 // Update the counter-based backtracking info on the stack. This is an 3491 // optimization for greedy loops (see below). 3492 DCHECK(trace->cp_offset() == text_length); 3493 macro_assembler->AdvanceCurrentPosition(text_length); 3494 macro_assembler->GoTo(trace->loop_label()); 3495 return; 3496 } 3497 DCHECK(trace->stop_node() == NULL); 3498 if (!trace->is_trivial()) { 3499 trace->Flush(compiler, this); 3500 return; 3501 } 3502 ChoiceNode::Emit(compiler, trace); 3503 } 3504 3505 3506 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3507 int eats_at_least) { 3508 int preload_characters = Min(4, eats_at_least); 3509 if (compiler->macro_assembler()->CanReadUnaligned()) { 3510 bool one_byte = compiler->one_byte(); 3511 if (one_byte) { 3512 if (preload_characters > 4) preload_characters = 4; 3513 // We can't preload 3 characters because there is no machine instruction 3514 // to do that. We can't just load 4 because we could be reading 3515 // beyond the end of the string, which could cause a memory fault. 3516 if (preload_characters == 3) preload_characters = 2; 3517 } else { 3518 if (preload_characters > 2) preload_characters = 2; 3519 } 3520 } else { 3521 if (preload_characters > 1) preload_characters = 1; 3522 } 3523 return preload_characters; 3524 } 3525 3526 3527 // This class is used when generating the alternatives in a choice node. It 3528 // records the way the alternative is being code generated. 3529 class AlternativeGeneration: public Malloced { 3530 public: 3531 AlternativeGeneration() 3532 : possible_success(), 3533 expects_preload(false), 3534 after(), 3535 quick_check_details() { } 3536 Label possible_success; 3537 bool expects_preload; 3538 Label after; 3539 QuickCheckDetails quick_check_details; 3540 }; 3541 3542 3543 // Creates a list of AlternativeGenerations. If the list has a reasonable 3544 // size then it is on the stack, otherwise the excess is on the heap. 3545 class AlternativeGenerationList { 3546 public: 3547 AlternativeGenerationList(int count, Zone* zone) 3548 : alt_gens_(count, zone) { 3549 for (int i = 0; i < count && i < kAFew; i++) { 3550 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3551 } 3552 for (int i = kAFew; i < count; i++) { 3553 alt_gens_.Add(new AlternativeGeneration(), zone); 3554 } 3555 } 3556 ~AlternativeGenerationList() { 3557 for (int i = kAFew; i < alt_gens_.length(); i++) { 3558 delete alt_gens_[i]; 3559 alt_gens_[i] = NULL; 3560 } 3561 } 3562 3563 AlternativeGeneration* at(int i) { 3564 return alt_gens_[i]; 3565 } 3566 3567 private: 3568 static const int kAFew = 10; 3569 ZoneList<AlternativeGeneration*> alt_gens_; 3570 AlternativeGeneration a_few_alt_gens_[kAFew]; 3571 }; 3572 3573 3574 // The '2' variant is has inclusive from and exclusive to. 3575 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3576 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3577 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 3578 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 3579 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 3580 0xFEFF, 0xFF00, 0x10000 }; 3581 static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3582 3583 static const int kWordRanges[] = { 3584 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 3585 static const int kWordRangeCount = arraysize(kWordRanges); 3586 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 3587 static const int kDigitRangeCount = arraysize(kDigitRanges); 3588 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; 3589 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3590 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 3591 0x2028, 0x202A, 0x10000 }; 3592 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3593 3594 3595 void BoyerMoorePositionInfo::Set(int character) { 3596 SetInterval(Interval(character, character)); 3597 } 3598 3599 3600 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3601 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3602 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3603 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3604 surrogate_ = 3605 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3606 if (interval.to() - interval.from() >= kMapSize - 1) { 3607 if (map_count_ != kMapSize) { 3608 map_count_ = kMapSize; 3609 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3610 } 3611 return; 3612 } 3613 for (int i = interval.from(); i <= interval.to(); i++) { 3614 int mod_character = (i & kMask); 3615 if (!map_->at(mod_character)) { 3616 map_count_++; 3617 map_->at(mod_character) = true; 3618 } 3619 if (map_count_ == kMapSize) return; 3620 } 3621 } 3622 3623 3624 void BoyerMoorePositionInfo::SetAll() { 3625 s_ = w_ = d_ = kLatticeUnknown; 3626 if (map_count_ != kMapSize) { 3627 map_count_ = kMapSize; 3628 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3629 } 3630 } 3631 3632 3633 BoyerMooreLookahead::BoyerMooreLookahead( 3634 int length, RegExpCompiler* compiler, Zone* zone) 3635 : length_(length), 3636 compiler_(compiler) { 3637 if (compiler->one_byte()) { 3638 max_char_ = String::kMaxOneByteCharCode; 3639 } else { 3640 max_char_ = String::kMaxUtf16CodeUnit; 3641 } 3642 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3643 for (int i = 0; i < length; i++) { 3644 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3645 } 3646 } 3647 3648 3649 // Find the longest range of lookahead that has the fewest number of different 3650 // characters that can occur at a given position. Since we are optimizing two 3651 // different parameters at once this is a tradeoff. 3652 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3653 int biggest_points = 0; 3654 // If more than 32 characters out of 128 can occur it is unlikely that we can 3655 // be lucky enough to step forwards much of the time. 3656 const int kMaxMax = 32; 3657 for (int max_number_of_chars = 4; 3658 max_number_of_chars < kMaxMax; 3659 max_number_of_chars *= 2) { 3660 biggest_points = 3661 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3662 } 3663 if (biggest_points == 0) return false; 3664 return true; 3665 } 3666 3667 3668 // Find the highest-points range between 0 and length_ where the character 3669 // information is not too vague. 'Too vague' means that there are more than 3670 // max_number_of_chars that can occur at this position. Calculates the number 3671 // of points as the product of width-of-the-range and 3672 // probability-of-finding-one-of-the-characters, where the probability is 3673 // calculated using the frequency distribution of the sample subject string. 3674 int BoyerMooreLookahead::FindBestInterval( 3675 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3676 int biggest_points = old_biggest_points; 3677 static const int kSize = RegExpMacroAssembler::kTableSize; 3678 for (int i = 0; i < length_; ) { 3679 while (i < length_ && Count(i) > max_number_of_chars) i++; 3680 if (i == length_) break; 3681 int remembered_from = i; 3682 bool union_map[kSize]; 3683 for (int j = 0; j < kSize; j++) union_map[j] = false; 3684 while (i < length_ && Count(i) <= max_number_of_chars) { 3685 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3686 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3687 i++; 3688 } 3689 int frequency = 0; 3690 for (int j = 0; j < kSize; j++) { 3691 if (union_map[j]) { 3692 // Add 1 to the frequency to give a small per-character boost for 3693 // the cases where our sampling is not good enough and many 3694 // characters have a frequency of zero. This means the frequency 3695 // can theoretically be up to 2*kSize though we treat it mostly as 3696 // a fraction of kSize. 3697 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3698 } 3699 } 3700 // We use the probability of skipping times the distance we are skipping to 3701 // judge the effectiveness of this. Actually we have a cut-off: By 3702 // dividing by 2 we switch off the skipping if the probability of skipping 3703 // is less than 50%. This is because the multibyte mask-and-compare 3704 // skipping in quickcheck is more likely to do well on this case. 3705 bool in_quickcheck_range = 3706 ((i - remembered_from < 4) || 3707 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 3708 // Called 'probability' but it is only a rough estimate and can actually 3709 // be outside the 0-kSize range. 3710 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3711 int points = (i - remembered_from) * probability; 3712 if (points > biggest_points) { 3713 *from = remembered_from; 3714 *to = i - 1; 3715 biggest_points = points; 3716 } 3717 } 3718 return biggest_points; 3719 } 3720 3721 3722 // Take all the characters that will not prevent a successful match if they 3723 // occur in the subject string in the range between min_lookahead and 3724 // max_lookahead (inclusive) measured from the current position. If the 3725 // character at max_lookahead offset is not one of these characters, then we 3726 // can safely skip forwards by the number of characters in the range. 3727 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3728 int max_lookahead, 3729 Handle<ByteArray> boolean_skip_table) { 3730 const int kSize = RegExpMacroAssembler::kTableSize; 3731 3732 const int kSkipArrayEntry = 0; 3733 const int kDontSkipArrayEntry = 1; 3734 3735 for (int i = 0; i < kSize; i++) { 3736 boolean_skip_table->set(i, kSkipArrayEntry); 3737 } 3738 int skip = max_lookahead + 1 - min_lookahead; 3739 3740 for (int i = max_lookahead; i >= min_lookahead; i--) { 3741 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3742 for (int j = 0; j < kSize; j++) { 3743 if (map->at(j)) { 3744 boolean_skip_table->set(j, kDontSkipArrayEntry); 3745 } 3746 } 3747 } 3748 3749 return skip; 3750 } 3751 3752 3753 // See comment above on the implementation of GetSkipTable. 3754 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3755 const int kSize = RegExpMacroAssembler::kTableSize; 3756 3757 int min_lookahead = 0; 3758 int max_lookahead = 0; 3759 3760 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3761 3762 bool found_single_character = false; 3763 int single_character = 0; 3764 for (int i = max_lookahead; i >= min_lookahead; i--) { 3765 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3766 if (map->map_count() > 1 || 3767 (found_single_character && map->map_count() != 0)) { 3768 found_single_character = false; 3769 break; 3770 } 3771 for (int j = 0; j < kSize; j++) { 3772 if (map->at(j)) { 3773 found_single_character = true; 3774 single_character = j; 3775 break; 3776 } 3777 } 3778 } 3779 3780 int lookahead_width = max_lookahead + 1 - min_lookahead; 3781 3782 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3783 // The mask-compare can probably handle this better. 3784 return; 3785 } 3786 3787 if (found_single_character) { 3788 Label cont, again; 3789 masm->Bind(&again); 3790 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3791 if (max_char_ > kSize) { 3792 masm->CheckCharacterAfterAnd(single_character, 3793 RegExpMacroAssembler::kTableMask, 3794 &cont); 3795 } else { 3796 masm->CheckCharacter(single_character, &cont); 3797 } 3798 masm->AdvanceCurrentPosition(lookahead_width); 3799 masm->GoTo(&again); 3800 masm->Bind(&cont); 3801 return; 3802 } 3803 3804 Factory* factory = masm->zone()->isolate()->factory(); 3805 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3806 int skip_distance = GetSkipTable( 3807 min_lookahead, max_lookahead, boolean_skip_table); 3808 DCHECK(skip_distance != 0); 3809 3810 Label cont, again; 3811 masm->Bind(&again); 3812 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3813 masm->CheckBitInTable(boolean_skip_table, &cont); 3814 masm->AdvanceCurrentPosition(skip_distance); 3815 masm->GoTo(&again); 3816 masm->Bind(&cont); 3817 } 3818 3819 3820 /* Code generation for choice nodes. 3821 * 3822 * We generate quick checks that do a mask and compare to eliminate a 3823 * choice. If the quick check succeeds then it jumps to the continuation to 3824 * do slow checks and check subsequent nodes. If it fails (the common case) 3825 * it falls through to the next choice. 3826 * 3827 * Here is the desired flow graph. Nodes directly below each other imply 3828 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3829 * 3 doesn't have a quick check so we have to call the slow check. 3830 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3831 * regexp continuation is generated directly after the Sn node, up to the 3832 * next GoTo if we decide to reuse some already generated code. Some 3833 * nodes expect preload_characters to be preloaded into the current 3834 * character register. R nodes do this preloading. Vertices are marked 3835 * F for failures and S for success (possible success in the case of quick 3836 * nodes). L, V, < and > are used as arrow heads. 3837 * 3838 * ----------> R 3839 * | 3840 * V 3841 * Q1 -----> S1 3842 * | S / 3843 * F| / 3844 * | F/ 3845 * | / 3846 * | R 3847 * | / 3848 * V L 3849 * Q2 -----> S2 3850 * | S / 3851 * F| / 3852 * | F/ 3853 * | / 3854 * | R 3855 * | / 3856 * V L 3857 * S3 3858 * | 3859 * F| 3860 * | 3861 * R 3862 * | 3863 * backtrack V 3864 * <----------Q4 3865 * \ F | 3866 * \ |S 3867 * \ F V 3868 * \-----S4 3869 * 3870 * For greedy loops we push the current position, then generate the code that 3871 * eats the input specially in EmitGreedyLoop. The other choice (the 3872 * continuation) is generated by the normal code in EmitChoices, and steps back 3873 * in the input to the starting position when it fails to match. The loop code 3874 * looks like this (U is the unwind code that steps back in the greedy loop). 3875 * 3876 * _____ 3877 * / \ 3878 * V | 3879 * ----------> S1 | 3880 * /| | 3881 * / |S | 3882 * F/ \_____/ 3883 * / 3884 * |<----- 3885 * | \ 3886 * V |S 3887 * Q2 ---> U----->backtrack 3888 * | F / 3889 * S| / 3890 * V F / 3891 * S2--/ 3892 */ 3893 3894 GreedyLoopState::GreedyLoopState(bool not_at_start) { 3895 counter_backtrack_trace_.set_backtrack(&label_); 3896 if (not_at_start) counter_backtrack_trace_.set_at_start(false); 3897 } 3898 3899 3900 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3901 #ifdef DEBUG 3902 int choice_count = alternatives_->length(); 3903 for (int i = 0; i < choice_count - 1; i++) { 3904 GuardedAlternative alternative = alternatives_->at(i); 3905 ZoneList<Guard*>* guards = alternative.guards(); 3906 int guard_count = (guards == NULL) ? 0 : guards->length(); 3907 for (int j = 0; j < guard_count; j++) { 3908 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3909 } 3910 } 3911 #endif 3912 } 3913 3914 3915 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, 3916 Trace* current_trace, 3917 PreloadState* state) { 3918 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3919 // Save some time by looking at most one machine word ahead. 3920 state->eats_at_least_ = 3921 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, 3922 current_trace->at_start() == Trace::FALSE_VALUE); 3923 } 3924 state->preload_characters_ = 3925 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3926 3927 state->preload_is_current_ = 3928 (current_trace->characters_preloaded() == state->preload_characters_); 3929 state->preload_has_checked_bounds_ = state->preload_is_current_; 3930 } 3931 3932 3933 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3934 int choice_count = alternatives_->length(); 3935 3936 AssertGuardsMentionRegisters(trace); 3937 3938 LimitResult limit_result = LimitVersions(compiler, trace); 3939 if (limit_result == DONE) return; 3940 DCHECK(limit_result == CONTINUE); 3941 3942 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3943 // other choice nodes we only flush if we are out of code size budget. 3944 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3945 trace->Flush(compiler, this); 3946 return; 3947 } 3948 3949 RecursionCheck rc(compiler); 3950 3951 PreloadState preload; 3952 preload.init(); 3953 GreedyLoopState greedy_loop_state(not_at_start()); 3954 3955 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 3956 AlternativeGenerationList alt_gens(choice_count, zone()); 3957 3958 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3959 trace = EmitGreedyLoop(compiler, 3960 trace, 3961 &alt_gens, 3962 &preload, 3963 &greedy_loop_state, 3964 text_length); 3965 } else { 3966 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 3967 // match the traces produced pre-cleanup. 3968 Label second_choice; 3969 compiler->macro_assembler()->Bind(&second_choice); 3970 3971 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 3972 3973 EmitChoices(compiler, 3974 &alt_gens, 3975 0, 3976 trace, 3977 &preload); 3978 } 3979 3980 // At this point we need to generate slow checks for the alternatives where 3981 // the quick check was inlined. We can recognize these because the associated 3982 // label was bound. 3983 int new_flush_budget = trace->flush_budget() / choice_count; 3984 for (int i = 0; i < choice_count; i++) { 3985 AlternativeGeneration* alt_gen = alt_gens.at(i); 3986 Trace new_trace(*trace); 3987 // If there are actions to be flushed we have to limit how many times 3988 // they are flushed. Take the budget of the parent trace and distribute 3989 // it fairly amongst the children. 3990 if (new_trace.actions() != NULL) { 3991 new_trace.set_flush_budget(new_flush_budget); 3992 } 3993 bool next_expects_preload = 3994 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 3995 EmitOutOfLineContinuation(compiler, 3996 &new_trace, 3997 alternatives_->at(i), 3998 alt_gen, 3999 preload.preload_characters_, 4000 next_expects_preload); 4001 } 4002 } 4003 4004 4005 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, 4006 Trace* trace, 4007 AlternativeGenerationList* alt_gens, 4008 PreloadState* preload, 4009 GreedyLoopState* greedy_loop_state, 4010 int text_length) { 4011 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4012 // Here we have special handling for greedy loops containing only text nodes 4013 // and other simple nodes. These are handled by pushing the current 4014 // position on the stack and then incrementing the current position each 4015 // time around the switch. On backtrack we decrement the current position 4016 // and check it against the pushed value. This avoids pushing backtrack 4017 // information for each iteration of the loop, which could take up a lot of 4018 // space. 4019 DCHECK(trace->stop_node() == NULL); 4020 macro_assembler->PushCurrentPosition(); 4021 Label greedy_match_failed; 4022 Trace greedy_match_trace; 4023 if (not_at_start()) greedy_match_trace.set_at_start(false); 4024 greedy_match_trace.set_backtrack(&greedy_match_failed); 4025 Label loop_label; 4026 macro_assembler->Bind(&loop_label); 4027 greedy_match_trace.set_stop_node(this); 4028 greedy_match_trace.set_loop_label(&loop_label); 4029 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 4030 macro_assembler->Bind(&greedy_match_failed); 4031 4032 Label second_choice; // For use in greedy matches. 4033 macro_assembler->Bind(&second_choice); 4034 4035 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 4036 4037 EmitChoices(compiler, 4038 alt_gens, 4039 1, 4040 new_trace, 4041 preload); 4042 4043 macro_assembler->Bind(greedy_loop_state->label()); 4044 // If we have unwound to the bottom then backtrack. 4045 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4046 // Otherwise try the second priority at an earlier position. 4047 macro_assembler->AdvanceCurrentPosition(-text_length); 4048 macro_assembler->GoTo(&second_choice); 4049 return new_trace; 4050 } 4051 4052 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 4053 Trace* trace) { 4054 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 4055 if (alternatives_->length() != 2) return eats_at_least; 4056 4057 GuardedAlternative alt1 = alternatives_->at(1); 4058 if (alt1.guards() != NULL && alt1.guards()->length() != 0) { 4059 return eats_at_least; 4060 } 4061 RegExpNode* eats_anything_node = alt1.node(); 4062 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 4063 return eats_at_least; 4064 } 4065 4066 // Really we should be creating a new trace when we execute this function, 4067 // but there is no need, because the code it generates cannot backtrack, and 4068 // we always arrive here with a trivial trace (since it's the entry to a 4069 // loop. That also implies that there are no preloaded characters, which is 4070 // good, because it means we won't be violating any assumptions by 4071 // overwriting those characters with new load instructions. 4072 DCHECK(trace->is_trivial()); 4073 4074 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4075 // At this point we know that we are at a non-greedy loop that will eat 4076 // any character one at a time. Any non-anchored regexp has such a 4077 // loop prepended to it in order to find where it starts. We look for 4078 // a pattern of the form ...abc... where we can look 6 characters ahead 4079 // and step forwards 3 if the character is not one of abc. Abc need 4080 // not be atoms, they can be any reasonably limited character class or 4081 // small alternation. 4082 BoyerMooreLookahead* bm = bm_info(false); 4083 if (bm == NULL) { 4084 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4085 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4086 kRecursionBudget, 4087 false)); 4088 if (eats_at_least >= 1) { 4089 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4090 compiler, 4091 zone()); 4092 GuardedAlternative alt0 = alternatives_->at(0); 4093 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false); 4094 } 4095 } 4096 if (bm != NULL) { 4097 bm->EmitSkipInstructions(macro_assembler); 4098 } 4099 return eats_at_least; 4100 } 4101 4102 4103 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4104 AlternativeGenerationList* alt_gens, 4105 int first_choice, 4106 Trace* trace, 4107 PreloadState* preload) { 4108 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4109 SetUpPreLoad(compiler, trace, preload); 4110 4111 // For now we just call all choices one after the other. The idea ultimately 4112 // is to use the Dispatch table to try only the relevant ones. 4113 int choice_count = alternatives_->length(); 4114 4115 int new_flush_budget = trace->flush_budget() / choice_count; 4116 4117 for (int i = first_choice; i < choice_count; i++) { 4118 bool is_last = i == choice_count - 1; 4119 bool fall_through_on_failure = !is_last; 4120 GuardedAlternative alternative = alternatives_->at(i); 4121 AlternativeGeneration* alt_gen = alt_gens->at(i); 4122 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 4123 ZoneList<Guard*>* guards = alternative.guards(); 4124 int guard_count = (guards == NULL) ? 0 : guards->length(); 4125 Trace new_trace(*trace); 4126 new_trace.set_characters_preloaded(preload->preload_is_current_ ? 4127 preload->preload_characters_ : 4128 0); 4129 if (preload->preload_has_checked_bounds_) { 4130 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4131 } 4132 new_trace.quick_check_performed()->Clear(); 4133 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4134 if (!is_last) { 4135 new_trace.set_backtrack(&alt_gen->after); 4136 } 4137 alt_gen->expects_preload = preload->preload_is_current_; 4138 bool generate_full_check_inline = false; 4139 if (FLAG_regexp_optimization && 4140 try_to_emit_quick_check_for_alternative(i == 0) && 4141 alternative.node()->EmitQuickCheck(compiler, 4142 trace, 4143 &new_trace, 4144 preload->preload_has_checked_bounds_, 4145 &alt_gen->possible_success, 4146 &alt_gen->quick_check_details, 4147 fall_through_on_failure)) { 4148 // Quick check was generated for this choice. 4149 preload->preload_is_current_ = true; 4150 preload->preload_has_checked_bounds_ = true; 4151 // If we generated the quick check to fall through on possible success, 4152 // we now need to generate the full check inline. 4153 if (!fall_through_on_failure) { 4154 macro_assembler->Bind(&alt_gen->possible_success); 4155 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4156 new_trace.set_characters_preloaded(preload->preload_characters_); 4157 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4158 generate_full_check_inline = true; 4159 } 4160 } else if (alt_gen->quick_check_details.cannot_match()) { 4161 if (!fall_through_on_failure) { 4162 macro_assembler->GoTo(trace->backtrack()); 4163 } 4164 continue; 4165 } else { 4166 // No quick check was generated. Put the full code here. 4167 // If this is not the first choice then there could be slow checks from 4168 // previous cases that go here when they fail. There's no reason to 4169 // insist that they preload characters since the slow check we are about 4170 // to generate probably can't use it. 4171 if (i != first_choice) { 4172 alt_gen->expects_preload = false; 4173 new_trace.InvalidateCurrentCharacter(); 4174 } 4175 generate_full_check_inline = true; 4176 } 4177 if (generate_full_check_inline) { 4178 if (new_trace.actions() != NULL) { 4179 new_trace.set_flush_budget(new_flush_budget); 4180 } 4181 for (int j = 0; j < guard_count; j++) { 4182 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4183 } 4184 alternative.node()->Emit(compiler, &new_trace); 4185 preload->preload_is_current_ = false; 4186 } 4187 macro_assembler->Bind(&alt_gen->after); 4188 } 4189 } 4190 4191 4192 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4193 Trace* trace, 4194 GuardedAlternative alternative, 4195 AlternativeGeneration* alt_gen, 4196 int preload_characters, 4197 bool next_expects_preload) { 4198 if (!alt_gen->possible_success.is_linked()) return; 4199 4200 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4201 macro_assembler->Bind(&alt_gen->possible_success); 4202 Trace out_of_line_trace(*trace); 4203 out_of_line_trace.set_characters_preloaded(preload_characters); 4204 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4205 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4206 ZoneList<Guard*>* guards = alternative.guards(); 4207 int guard_count = (guards == NULL) ? 0 : guards->length(); 4208 if (next_expects_preload) { 4209 Label reload_current_char; 4210 out_of_line_trace.set_backtrack(&reload_current_char); 4211 for (int j = 0; j < guard_count; j++) { 4212 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4213 } 4214 alternative.node()->Emit(compiler, &out_of_line_trace); 4215 macro_assembler->Bind(&reload_current_char); 4216 // Reload the current character, since the next quick check expects that. 4217 // We don't need to check bounds here because we only get into this 4218 // code through a quick check which already did the checked load. 4219 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4220 NULL, 4221 false, 4222 preload_characters); 4223 macro_assembler->GoTo(&(alt_gen->after)); 4224 } else { 4225 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4226 for (int j = 0; j < guard_count; j++) { 4227 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4228 } 4229 alternative.node()->Emit(compiler, &out_of_line_trace); 4230 } 4231 } 4232 4233 4234 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4235 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4236 LimitResult limit_result = LimitVersions(compiler, trace); 4237 if (limit_result == DONE) return; 4238 DCHECK(limit_result == CONTINUE); 4239 4240 RecursionCheck rc(compiler); 4241 4242 switch (action_type_) { 4243 case STORE_POSITION: { 4244 Trace::DeferredCapture 4245 new_capture(data_.u_position_register.reg, 4246 data_.u_position_register.is_capture, 4247 trace); 4248 Trace new_trace = *trace; 4249 new_trace.add_action(&new_capture); 4250 on_success()->Emit(compiler, &new_trace); 4251 break; 4252 } 4253 case INCREMENT_REGISTER: { 4254 Trace::DeferredIncrementRegister 4255 new_increment(data_.u_increment_register.reg); 4256 Trace new_trace = *trace; 4257 new_trace.add_action(&new_increment); 4258 on_success()->Emit(compiler, &new_trace); 4259 break; 4260 } 4261 case SET_REGISTER: { 4262 Trace::DeferredSetRegister 4263 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4264 Trace new_trace = *trace; 4265 new_trace.add_action(&new_set); 4266 on_success()->Emit(compiler, &new_trace); 4267 break; 4268 } 4269 case CLEAR_CAPTURES: { 4270 Trace::DeferredClearCaptures 4271 new_capture(Interval(data_.u_clear_captures.range_from, 4272 data_.u_clear_captures.range_to)); 4273 Trace new_trace = *trace; 4274 new_trace.add_action(&new_capture); 4275 on_success()->Emit(compiler, &new_trace); 4276 break; 4277 } 4278 case BEGIN_SUBMATCH: 4279 if (!trace->is_trivial()) { 4280 trace->Flush(compiler, this); 4281 } else { 4282 assembler->WriteCurrentPositionToRegister( 4283 data_.u_submatch.current_position_register, 0); 4284 assembler->WriteStackPointerToRegister( 4285 data_.u_submatch.stack_pointer_register); 4286 on_success()->Emit(compiler, trace); 4287 } 4288 break; 4289 case EMPTY_MATCH_CHECK: { 4290 int start_pos_reg = data_.u_empty_match_check.start_register; 4291 int stored_pos = 0; 4292 int rep_reg = data_.u_empty_match_check.repetition_register; 4293 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4294 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4295 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4296 // If we know we haven't advanced and there is no minimum we 4297 // can just backtrack immediately. 4298 assembler->GoTo(trace->backtrack()); 4299 } else if (know_dist && stored_pos < trace->cp_offset()) { 4300 // If we know we've advanced we can generate the continuation 4301 // immediately. 4302 on_success()->Emit(compiler, trace); 4303 } else if (!trace->is_trivial()) { 4304 trace->Flush(compiler, this); 4305 } else { 4306 Label skip_empty_check; 4307 // If we have a minimum number of repetitions we check the current 4308 // number first and skip the empty check if it's not enough. 4309 if (has_minimum) { 4310 int limit = data_.u_empty_match_check.repetition_limit; 4311 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4312 } 4313 // If the match is empty we bail out, otherwise we fall through 4314 // to the on-success continuation. 4315 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4316 trace->backtrack()); 4317 assembler->Bind(&skip_empty_check); 4318 on_success()->Emit(compiler, trace); 4319 } 4320 break; 4321 } 4322 case POSITIVE_SUBMATCH_SUCCESS: { 4323 if (!trace->is_trivial()) { 4324 trace->Flush(compiler, this); 4325 return; 4326 } 4327 assembler->ReadCurrentPositionFromRegister( 4328 data_.u_submatch.current_position_register); 4329 assembler->ReadStackPointerFromRegister( 4330 data_.u_submatch.stack_pointer_register); 4331 int clear_register_count = data_.u_submatch.clear_register_count; 4332 if (clear_register_count == 0) { 4333 on_success()->Emit(compiler, trace); 4334 return; 4335 } 4336 int clear_registers_from = data_.u_submatch.clear_register_from; 4337 Label clear_registers_backtrack; 4338 Trace new_trace = *trace; 4339 new_trace.set_backtrack(&clear_registers_backtrack); 4340 on_success()->Emit(compiler, &new_trace); 4341 4342 assembler->Bind(&clear_registers_backtrack); 4343 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4344 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4345 4346 DCHECK(trace->backtrack() == NULL); 4347 assembler->Backtrack(); 4348 return; 4349 } 4350 default: 4351 UNREACHABLE(); 4352 } 4353 } 4354 4355 4356 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4357 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4358 if (!trace->is_trivial()) { 4359 trace->Flush(compiler, this); 4360 return; 4361 } 4362 4363 LimitResult limit_result = LimitVersions(compiler, trace); 4364 if (limit_result == DONE) return; 4365 DCHECK(limit_result == CONTINUE); 4366 4367 RecursionCheck rc(compiler); 4368 4369 DCHECK_EQ(start_reg_ + 1, end_reg_); 4370 if (compiler->ignore_case()) { 4371 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, 4372 trace->backtrack()); 4373 } else { 4374 assembler->CheckNotBackReference(start_reg_, trace->backtrack()); 4375 } 4376 on_success()->Emit(compiler, trace); 4377 } 4378 4379 4380 // ------------------------------------------------------------------- 4381 // Dot/dotty output 4382 4383 4384 #ifdef DEBUG 4385 4386 4387 class DotPrinter: public NodeVisitor { 4388 public: 4389 DotPrinter(OStream& os, bool ignore_case) // NOLINT 4390 : os_(os), 4391 ignore_case_(ignore_case) {} 4392 void PrintNode(const char* label, RegExpNode* node); 4393 void Visit(RegExpNode* node); 4394 void PrintAttributes(RegExpNode* from); 4395 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4396 #define DECLARE_VISIT(Type) \ 4397 virtual void Visit##Type(Type##Node* that); 4398 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4399 #undef DECLARE_VISIT 4400 private: 4401 OStream& os_; 4402 bool ignore_case_; 4403 }; 4404 4405 4406 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4407 os_ << "digraph G {\n graph [label=\""; 4408 for (int i = 0; label[i]; i++) { 4409 switch (label[i]) { 4410 case '\\': 4411 os_ << "\\\\"; 4412 break; 4413 case '"': 4414 os_ << "\""; 4415 break; 4416 default: 4417 os_ << label[i]; 4418 break; 4419 } 4420 } 4421 os_ << "\"];\n"; 4422 Visit(node); 4423 os_ << "}" << endl; 4424 } 4425 4426 4427 void DotPrinter::Visit(RegExpNode* node) { 4428 if (node->info()->visited) return; 4429 node->info()->visited = true; 4430 node->Accept(this); 4431 } 4432 4433 4434 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4435 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; 4436 Visit(on_failure); 4437 } 4438 4439 4440 class TableEntryBodyPrinter { 4441 public: 4442 TableEntryBodyPrinter(OStream& os, ChoiceNode* choice) // NOLINT 4443 : os_(os), 4444 choice_(choice) {} 4445 void Call(uc16 from, DispatchTable::Entry entry) { 4446 OutSet* out_set = entry.out_set(); 4447 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4448 if (out_set->Get(i)) { 4449 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n" 4450 << choice()->alternatives()->at(i).node() << ";\n"; 4451 } 4452 } 4453 } 4454 private: 4455 ChoiceNode* choice() { return choice_; } 4456 OStream& os_; 4457 ChoiceNode* choice_; 4458 }; 4459 4460 4461 class TableEntryHeaderPrinter { 4462 public: 4463 explicit TableEntryHeaderPrinter(OStream& os) // NOLINT 4464 : first_(true), 4465 os_(os) {} 4466 void Call(uc16 from, DispatchTable::Entry entry) { 4467 if (first_) { 4468 first_ = false; 4469 } else { 4470 os_ << "|"; 4471 } 4472 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{"; 4473 OutSet* out_set = entry.out_set(); 4474 int priority = 0; 4475 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4476 if (out_set->Get(i)) { 4477 if (priority > 0) os_ << "|"; 4478 os_ << "<s" << from << "o" << i << "> " << priority; 4479 priority++; 4480 } 4481 } 4482 os_ << "}}"; 4483 } 4484 4485 private: 4486 bool first_; 4487 OStream& os_; 4488 }; 4489 4490 4491 class AttributePrinter { 4492 public: 4493 explicit AttributePrinter(OStream& os) // NOLINT 4494 : os_(os), 4495 first_(true) {} 4496 void PrintSeparator() { 4497 if (first_) { 4498 first_ = false; 4499 } else { 4500 os_ << "|"; 4501 } 4502 } 4503 void PrintBit(const char* name, bool value) { 4504 if (!value) return; 4505 PrintSeparator(); 4506 os_ << "{" << name << "}"; 4507 } 4508 void PrintPositive(const char* name, int value) { 4509 if (value < 0) return; 4510 PrintSeparator(); 4511 os_ << "{" << name << "|" << value << "}"; 4512 } 4513 4514 private: 4515 OStream& os_; 4516 bool first_; 4517 }; 4518 4519 4520 void DotPrinter::PrintAttributes(RegExpNode* that) { 4521 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " 4522 << "margin=0.1, fontsize=10, label=\"{"; 4523 AttributePrinter printer(os_); 4524 NodeInfo* info = that->info(); 4525 printer.PrintBit("NI", info->follows_newline_interest); 4526 printer.PrintBit("WI", info->follows_word_interest); 4527 printer.PrintBit("SI", info->follows_start_interest); 4528 Label* label = that->label(); 4529 if (label->is_bound()) 4530 printer.PrintPositive("@", label->pos()); 4531 os_ << "}\"];\n" 4532 << " a" << that << " -> n" << that 4533 << " [style=dashed, color=grey, arrowhead=none];\n"; 4534 } 4535 4536 4537 static const bool kPrintDispatchTable = false; 4538 void DotPrinter::VisitChoice(ChoiceNode* that) { 4539 if (kPrintDispatchTable) { 4540 os_ << " n" << that << " [shape=Mrecord, label=\""; 4541 TableEntryHeaderPrinter header_printer(os_); 4542 that->GetTable(ignore_case_)->ForEach(&header_printer); 4543 os_ << "\"]\n"; 4544 PrintAttributes(that); 4545 TableEntryBodyPrinter body_printer(os_, that); 4546 that->GetTable(ignore_case_)->ForEach(&body_printer); 4547 } else { 4548 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; 4549 for (int i = 0; i < that->alternatives()->length(); i++) { 4550 GuardedAlternative alt = that->alternatives()->at(i); 4551 os_ << " n" << that << " -> n" << alt.node(); 4552 } 4553 } 4554 for (int i = 0; i < that->alternatives()->length(); i++) { 4555 GuardedAlternative alt = that->alternatives()->at(i); 4556 alt.node()->Accept(this); 4557 } 4558 } 4559 4560 4561 void DotPrinter::VisitText(TextNode* that) { 4562 Zone* zone = that->zone(); 4563 os_ << " n" << that << " [label=\""; 4564 for (int i = 0; i < that->elements()->length(); i++) { 4565 if (i > 0) os_ << " "; 4566 TextElement elm = that->elements()->at(i); 4567 switch (elm.text_type()) { 4568 case TextElement::ATOM: { 4569 Vector<const uc16> data = elm.atom()->data(); 4570 for (int i = 0; i < data.length(); i++) { 4571 os_ << static_cast<char>(data[i]); 4572 } 4573 break; 4574 } 4575 case TextElement::CHAR_CLASS: { 4576 RegExpCharacterClass* node = elm.char_class(); 4577 os_ << "["; 4578 if (node->is_negated()) os_ << "^"; 4579 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4580 CharacterRange range = node->ranges(zone)->at(j); 4581 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); 4582 } 4583 os_ << "]"; 4584 break; 4585 } 4586 default: 4587 UNREACHABLE(); 4588 } 4589 } 4590 os_ << "\", shape=box, peripheries=2];\n"; 4591 PrintAttributes(that); 4592 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4593 Visit(that->on_success()); 4594 } 4595 4596 4597 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4598 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" 4599 << that->end_register() << "\", shape=doubleoctagon];\n"; 4600 PrintAttributes(that); 4601 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4602 Visit(that->on_success()); 4603 } 4604 4605 4606 void DotPrinter::VisitEnd(EndNode* that) { 4607 os_ << " n" << that << " [style=bold, shape=point];\n"; 4608 PrintAttributes(that); 4609 } 4610 4611 4612 void DotPrinter::VisitAssertion(AssertionNode* that) { 4613 os_ << " n" << that << " ["; 4614 switch (that->assertion_type()) { 4615 case AssertionNode::AT_END: 4616 os_ << "label=\"$\", shape=septagon"; 4617 break; 4618 case AssertionNode::AT_START: 4619 os_ << "label=\"^\", shape=septagon"; 4620 break; 4621 case AssertionNode::AT_BOUNDARY: 4622 os_ << "label=\"\\b\", shape=septagon"; 4623 break; 4624 case AssertionNode::AT_NON_BOUNDARY: 4625 os_ << "label=\"\\B\", shape=septagon"; 4626 break; 4627 case AssertionNode::AFTER_NEWLINE: 4628 os_ << "label=\"(?<=\\n)\", shape=septagon"; 4629 break; 4630 } 4631 os_ << "];\n"; 4632 PrintAttributes(that); 4633 RegExpNode* successor = that->on_success(); 4634 os_ << " n" << that << " -> n" << successor << ";\n"; 4635 Visit(successor); 4636 } 4637 4638 4639 void DotPrinter::VisitAction(ActionNode* that) { 4640 os_ << " n" << that << " ["; 4641 switch (that->action_type_) { 4642 case ActionNode::SET_REGISTER: 4643 os_ << "label=\"$" << that->data_.u_store_register.reg 4644 << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; 4645 break; 4646 case ActionNode::INCREMENT_REGISTER: 4647 os_ << "label=\"$" << that->data_.u_increment_register.reg 4648 << "++\", shape=octagon"; 4649 break; 4650 case ActionNode::STORE_POSITION: 4651 os_ << "label=\"$" << that->data_.u_position_register.reg 4652 << ":=$pos\", shape=octagon"; 4653 break; 4654 case ActionNode::BEGIN_SUBMATCH: 4655 os_ << "label=\"$" << that->data_.u_submatch.current_position_register 4656 << ":=$pos,begin\", shape=septagon"; 4657 break; 4658 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4659 os_ << "label=\"escape\", shape=septagon"; 4660 break; 4661 case ActionNode::EMPTY_MATCH_CHECK: 4662 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register 4663 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register 4664 << "<" << that->data_.u_empty_match_check.repetition_limit 4665 << "?\", shape=septagon"; 4666 break; 4667 case ActionNode::CLEAR_CAPTURES: { 4668 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from 4669 << " to $" << that->data_.u_clear_captures.range_to 4670 << "\", shape=septagon"; 4671 break; 4672 } 4673 } 4674 os_ << "];\n"; 4675 PrintAttributes(that); 4676 RegExpNode* successor = that->on_success(); 4677 os_ << " n" << that << " -> n" << successor << ";\n"; 4678 Visit(successor); 4679 } 4680 4681 4682 class DispatchTableDumper { 4683 public: 4684 explicit DispatchTableDumper(OStream& os) : os_(os) {} 4685 void Call(uc16 key, DispatchTable::Entry entry); 4686 private: 4687 OStream& os_; 4688 }; 4689 4690 4691 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4692 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {"; 4693 OutSet* set = entry.out_set(); 4694 bool first = true; 4695 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4696 if (set->Get(i)) { 4697 if (first) { 4698 first = false; 4699 } else { 4700 os_ << ", "; 4701 } 4702 os_ << i; 4703 } 4704 } 4705 os_ << "}\n"; 4706 } 4707 4708 4709 void DispatchTable::Dump() { 4710 OFStream os(stderr); 4711 DispatchTableDumper dumper(os); 4712 tree()->ForEach(&dumper); 4713 } 4714 4715 4716 void RegExpEngine::DotPrint(const char* label, 4717 RegExpNode* node, 4718 bool ignore_case) { 4719 OFStream os(stdout); 4720 DotPrinter printer(os, ignore_case); 4721 printer.PrintNode(label, node); 4722 } 4723 4724 4725 #endif // DEBUG 4726 4727 4728 // ------------------------------------------------------------------- 4729 // Tree to graph conversion 4730 4731 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4732 RegExpNode* on_success) { 4733 ZoneList<TextElement>* elms = 4734 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4735 elms->Add(TextElement::Atom(this), compiler->zone()); 4736 return new(compiler->zone()) TextNode(elms, on_success); 4737 } 4738 4739 4740 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4741 RegExpNode* on_success) { 4742 return new(compiler->zone()) TextNode(elements(), on_success); 4743 } 4744 4745 4746 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4747 const int* special_class, 4748 int length) { 4749 length--; // Remove final 0x10000. 4750 DCHECK(special_class[length] == 0x10000); 4751 DCHECK(ranges->length() != 0); 4752 DCHECK(length != 0); 4753 DCHECK(special_class[0] != 0); 4754 if (ranges->length() != (length >> 1) + 1) { 4755 return false; 4756 } 4757 CharacterRange range = ranges->at(0); 4758 if (range.from() != 0) { 4759 return false; 4760 } 4761 for (int i = 0; i < length; i += 2) { 4762 if (special_class[i] != (range.to() + 1)) { 4763 return false; 4764 } 4765 range = ranges->at((i >> 1) + 1); 4766 if (special_class[i+1] != range.from()) { 4767 return false; 4768 } 4769 } 4770 if (range.to() != 0xffff) { 4771 return false; 4772 } 4773 return true; 4774 } 4775 4776 4777 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4778 const int* special_class, 4779 int length) { 4780 length--; // Remove final 0x10000. 4781 DCHECK(special_class[length] == 0x10000); 4782 if (ranges->length() * 2 != length) { 4783 return false; 4784 } 4785 for (int i = 0; i < length; i += 2) { 4786 CharacterRange range = ranges->at(i >> 1); 4787 if (range.from() != special_class[i] || 4788 range.to() != special_class[i + 1] - 1) { 4789 return false; 4790 } 4791 } 4792 return true; 4793 } 4794 4795 4796 bool RegExpCharacterClass::is_standard(Zone* zone) { 4797 // TODO(lrn): Remove need for this function, by not throwing away information 4798 // along the way. 4799 if (is_negated_) { 4800 return false; 4801 } 4802 if (set_.is_standard()) { 4803 return true; 4804 } 4805 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4806 set_.set_standard_set_type('s'); 4807 return true; 4808 } 4809 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4810 set_.set_standard_set_type('S'); 4811 return true; 4812 } 4813 if (CompareInverseRanges(set_.ranges(zone), 4814 kLineTerminatorRanges, 4815 kLineTerminatorRangeCount)) { 4816 set_.set_standard_set_type('.'); 4817 return true; 4818 } 4819 if (CompareRanges(set_.ranges(zone), 4820 kLineTerminatorRanges, 4821 kLineTerminatorRangeCount)) { 4822 set_.set_standard_set_type('n'); 4823 return true; 4824 } 4825 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4826 set_.set_standard_set_type('w'); 4827 return true; 4828 } 4829 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4830 set_.set_standard_set_type('W'); 4831 return true; 4832 } 4833 return false; 4834 } 4835 4836 4837 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4838 RegExpNode* on_success) { 4839 return new(compiler->zone()) TextNode(this, on_success); 4840 } 4841 4842 4843 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4844 RegExpNode* on_success) { 4845 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4846 int length = alternatives->length(); 4847 ChoiceNode* result = 4848 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 4849 for (int i = 0; i < length; i++) { 4850 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 4851 on_success)); 4852 result->AddAlternative(alternative); 4853 } 4854 return result; 4855 } 4856 4857 4858 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 4859 RegExpNode* on_success) { 4860 return ToNode(min(), 4861 max(), 4862 is_greedy(), 4863 body(), 4864 compiler, 4865 on_success); 4866 } 4867 4868 4869 // Scoped object to keep track of how much we unroll quantifier loops in the 4870 // regexp graph generator. 4871 class RegExpExpansionLimiter { 4872 public: 4873 static const int kMaxExpansionFactor = 6; 4874 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 4875 : compiler_(compiler), 4876 saved_expansion_factor_(compiler->current_expansion_factor()), 4877 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 4878 DCHECK(factor > 0); 4879 if (ok_to_expand_) { 4880 if (factor > kMaxExpansionFactor) { 4881 // Avoid integer overflow of the current expansion factor. 4882 ok_to_expand_ = false; 4883 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 4884 } else { 4885 int new_factor = saved_expansion_factor_ * factor; 4886 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 4887 compiler->set_current_expansion_factor(new_factor); 4888 } 4889 } 4890 } 4891 4892 ~RegExpExpansionLimiter() { 4893 compiler_->set_current_expansion_factor(saved_expansion_factor_); 4894 } 4895 4896 bool ok_to_expand() { return ok_to_expand_; } 4897 4898 private: 4899 RegExpCompiler* compiler_; 4900 int saved_expansion_factor_; 4901 bool ok_to_expand_; 4902 4903 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 4904 }; 4905 4906 4907 RegExpNode* RegExpQuantifier::ToNode(int min, 4908 int max, 4909 bool is_greedy, 4910 RegExpTree* body, 4911 RegExpCompiler* compiler, 4912 RegExpNode* on_success, 4913 bool not_at_start) { 4914 // x{f, t} becomes this: 4915 // 4916 // (r++)<-. 4917 // | ` 4918 // | (x) 4919 // v ^ 4920 // (r=0)-->(?)---/ [if r < t] 4921 // | 4922 // [if r >= f] \----> ... 4923 // 4924 4925 // 15.10.2.5 RepeatMatcher algorithm. 4926 // The parser has already eliminated the case where max is 0. In the case 4927 // where max_match is zero the parser has removed the quantifier if min was 4928 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 4929 4930 // If we know that we cannot match zero length then things are a little 4931 // simpler since we don't need to make the special zero length match check 4932 // from step 2.1. If the min and max are small we can unroll a little in 4933 // this case. 4934 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 4935 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 4936 if (max == 0) return on_success; // This can happen due to recursion. 4937 bool body_can_be_empty = (body->min_match() == 0); 4938 int body_start_reg = RegExpCompiler::kNoRegister; 4939 Interval capture_registers = body->CaptureRegisters(); 4940 bool needs_capture_clearing = !capture_registers.is_empty(); 4941 Zone* zone = compiler->zone(); 4942 4943 if (body_can_be_empty) { 4944 body_start_reg = compiler->AllocateRegister(); 4945 } else if (FLAG_regexp_optimization && !needs_capture_clearing) { 4946 // Only unroll if there are no captures and the body can't be 4947 // empty. 4948 { 4949 RegExpExpansionLimiter limiter( 4950 compiler, min + ((max != min) ? 1 : 0)); 4951 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 4952 int new_max = (max == kInfinity) ? max : max - min; 4953 // Recurse once to get the loop or optional matches after the fixed 4954 // ones. 4955 RegExpNode* answer = ToNode( 4956 0, new_max, is_greedy, body, compiler, on_success, true); 4957 // Unroll the forced matches from 0 to min. This can cause chains of 4958 // TextNodes (which the parser does not generate). These should be 4959 // combined if it turns out they hinder good code generation. 4960 for (int i = 0; i < min; i++) { 4961 answer = body->ToNode(compiler, answer); 4962 } 4963 return answer; 4964 } 4965 } 4966 if (max <= kMaxUnrolledMaxMatches && min == 0) { 4967 DCHECK(max > 0); // Due to the 'if' above. 4968 RegExpExpansionLimiter limiter(compiler, max); 4969 if (limiter.ok_to_expand()) { 4970 // Unroll the optional matches up to max. 4971 RegExpNode* answer = on_success; 4972 for (int i = 0; i < max; i++) { 4973 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 4974 if (is_greedy) { 4975 alternation->AddAlternative( 4976 GuardedAlternative(body->ToNode(compiler, answer))); 4977 alternation->AddAlternative(GuardedAlternative(on_success)); 4978 } else { 4979 alternation->AddAlternative(GuardedAlternative(on_success)); 4980 alternation->AddAlternative( 4981 GuardedAlternative(body->ToNode(compiler, answer))); 4982 } 4983 answer = alternation; 4984 if (not_at_start) alternation->set_not_at_start(); 4985 } 4986 return answer; 4987 } 4988 } 4989 } 4990 bool has_min = min > 0; 4991 bool has_max = max < RegExpTree::kInfinity; 4992 bool needs_counter = has_min || has_max; 4993 int reg_ctr = needs_counter 4994 ? compiler->AllocateRegister() 4995 : RegExpCompiler::kNoRegister; 4996 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0, 4997 zone); 4998 if (not_at_start) center->set_not_at_start(); 4999 RegExpNode* loop_return = needs_counter 5000 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 5001 : static_cast<RegExpNode*>(center); 5002 if (body_can_be_empty) { 5003 // If the body can be empty we need to check if it was and then 5004 // backtrack. 5005 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 5006 reg_ctr, 5007 min, 5008 loop_return); 5009 } 5010 RegExpNode* body_node = body->ToNode(compiler, loop_return); 5011 if (body_can_be_empty) { 5012 // If the body can be empty we need to store the start position 5013 // so we can bail out if it was empty. 5014 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 5015 } 5016 if (needs_capture_clearing) { 5017 // Before entering the body of this loop we need to clear captures. 5018 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 5019 } 5020 GuardedAlternative body_alt(body_node); 5021 if (has_max) { 5022 Guard* body_guard = 5023 new(zone) Guard(reg_ctr, Guard::LT, max); 5024 body_alt.AddGuard(body_guard, zone); 5025 } 5026 GuardedAlternative rest_alt(on_success); 5027 if (has_min) { 5028 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 5029 rest_alt.AddGuard(rest_guard, zone); 5030 } 5031 if (is_greedy) { 5032 center->AddLoopAlternative(body_alt); 5033 center->AddContinueAlternative(rest_alt); 5034 } else { 5035 center->AddContinueAlternative(rest_alt); 5036 center->AddLoopAlternative(body_alt); 5037 } 5038 if (needs_counter) { 5039 return ActionNode::SetRegister(reg_ctr, 0, center); 5040 } else { 5041 return center; 5042 } 5043 } 5044 5045 5046 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5047 RegExpNode* on_success) { 5048 NodeInfo info; 5049 Zone* zone = compiler->zone(); 5050 5051 switch (assertion_type()) { 5052 case START_OF_LINE: 5053 return AssertionNode::AfterNewline(on_success); 5054 case START_OF_INPUT: 5055 return AssertionNode::AtStart(on_success); 5056 case BOUNDARY: 5057 return AssertionNode::AtBoundary(on_success); 5058 case NON_BOUNDARY: 5059 return AssertionNode::AtNonBoundary(on_success); 5060 case END_OF_INPUT: 5061 return AssertionNode::AtEnd(on_success); 5062 case END_OF_LINE: { 5063 // Compile $ in multiline regexps as an alternation with a positive 5064 // lookahead in one side and an end-of-input on the other side. 5065 // We need two registers for the lookahead. 5066 int stack_pointer_register = compiler->AllocateRegister(); 5067 int position_register = compiler->AllocateRegister(); 5068 // The ChoiceNode to distinguish between a newline and end-of-input. 5069 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5070 // Create a newline atom. 5071 ZoneList<CharacterRange>* newline_ranges = 5072 new(zone) ZoneList<CharacterRange>(3, zone); 5073 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5074 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n'); 5075 TextNode* newline_matcher = new(zone) TextNode( 5076 newline_atom, 5077 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5078 position_register, 5079 0, // No captures inside. 5080 -1, // Ignored if no captures. 5081 on_success)); 5082 // Create an end-of-input matcher. 5083 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5084 stack_pointer_register, 5085 position_register, 5086 newline_matcher); 5087 // Add the two alternatives to the ChoiceNode. 5088 GuardedAlternative eol_alternative(end_of_line); 5089 result->AddAlternative(eol_alternative); 5090 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5091 result->AddAlternative(end_alternative); 5092 return result; 5093 } 5094 default: 5095 UNREACHABLE(); 5096 } 5097 return on_success; 5098 } 5099 5100 5101 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5102 RegExpNode* on_success) { 5103 return new(compiler->zone()) 5104 BackReferenceNode(RegExpCapture::StartRegister(index()), 5105 RegExpCapture::EndRegister(index()), 5106 on_success); 5107 } 5108 5109 5110 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5111 RegExpNode* on_success) { 5112 return on_success; 5113 } 5114 5115 5116 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, 5117 RegExpNode* on_success) { 5118 int stack_pointer_register = compiler->AllocateRegister(); 5119 int position_register = compiler->AllocateRegister(); 5120 5121 const int registers_per_capture = 2; 5122 const int register_of_first_capture = 2; 5123 int register_count = capture_count_ * registers_per_capture; 5124 int register_start = 5125 register_of_first_capture + capture_from_ * registers_per_capture; 5126 5127 RegExpNode* success; 5128 if (is_positive()) { 5129 RegExpNode* node = ActionNode::BeginSubmatch( 5130 stack_pointer_register, 5131 position_register, 5132 body()->ToNode( 5133 compiler, 5134 ActionNode::PositiveSubmatchSuccess(stack_pointer_register, 5135 position_register, 5136 register_count, 5137 register_start, 5138 on_success))); 5139 return node; 5140 } else { 5141 // We use a ChoiceNode for a negative lookahead because it has most of 5142 // the characteristics we need. It has the body of the lookahead as its 5143 // first alternative and the expression after the lookahead of the second 5144 // alternative. If the first alternative succeeds then the 5145 // NegativeSubmatchSuccess will unwind the stack including everything the 5146 // choice node set up and backtrack. If the first alternative fails then 5147 // the second alternative is tried, which is exactly the desired result 5148 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 5149 // ChoiceNode that knows to ignore the first exit when calculating quick 5150 // checks. 5151 Zone* zone = compiler->zone(); 5152 5153 GuardedAlternative body_alt( 5154 body()->ToNode( 5155 compiler, 5156 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register, 5157 position_register, 5158 register_count, 5159 register_start, 5160 zone))); 5161 ChoiceNode* choice_node = 5162 new(zone) NegativeLookaheadChoiceNode(body_alt, 5163 GuardedAlternative(on_success), 5164 zone); 5165 return ActionNode::BeginSubmatch(stack_pointer_register, 5166 position_register, 5167 choice_node); 5168 } 5169 } 5170 5171 5172 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5173 RegExpNode* on_success) { 5174 return ToNode(body(), index(), compiler, on_success); 5175 } 5176 5177 5178 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5179 int index, 5180 RegExpCompiler* compiler, 5181 RegExpNode* on_success) { 5182 int start_reg = RegExpCapture::StartRegister(index); 5183 int end_reg = RegExpCapture::EndRegister(index); 5184 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5185 RegExpNode* body_node = body->ToNode(compiler, store_end); 5186 return ActionNode::StorePosition(start_reg, true, body_node); 5187 } 5188 5189 5190 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5191 RegExpNode* on_success) { 5192 ZoneList<RegExpTree*>* children = nodes(); 5193 RegExpNode* current = on_success; 5194 for (int i = children->length() - 1; i >= 0; i--) { 5195 current = children->at(i)->ToNode(compiler, current); 5196 } 5197 return current; 5198 } 5199 5200 5201 static void AddClass(const int* elmv, 5202 int elmc, 5203 ZoneList<CharacterRange>* ranges, 5204 Zone* zone) { 5205 elmc--; 5206 DCHECK(elmv[elmc] == 0x10000); 5207 for (int i = 0; i < elmc; i += 2) { 5208 DCHECK(elmv[i] < elmv[i + 1]); 5209 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone); 5210 } 5211 } 5212 5213 5214 static void AddClassNegated(const int *elmv, 5215 int elmc, 5216 ZoneList<CharacterRange>* ranges, 5217 Zone* zone) { 5218 elmc--; 5219 DCHECK(elmv[elmc] == 0x10000); 5220 DCHECK(elmv[0] != 0x0000); 5221 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 5222 uc16 last = 0x0000; 5223 for (int i = 0; i < elmc; i += 2) { 5224 DCHECK(last <= elmv[i] - 1); 5225 DCHECK(elmv[i] < elmv[i + 1]); 5226 ranges->Add(CharacterRange(last, elmv[i] - 1), zone); 5227 last = elmv[i + 1]; 5228 } 5229 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone); 5230 } 5231 5232 5233 void CharacterRange::AddClassEscape(uc16 type, 5234 ZoneList<CharacterRange>* ranges, 5235 Zone* zone) { 5236 switch (type) { 5237 case 's': 5238 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5239 break; 5240 case 'S': 5241 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5242 break; 5243 case 'w': 5244 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5245 break; 5246 case 'W': 5247 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5248 break; 5249 case 'd': 5250 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5251 break; 5252 case 'D': 5253 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5254 break; 5255 case '.': 5256 AddClassNegated(kLineTerminatorRanges, 5257 kLineTerminatorRangeCount, 5258 ranges, 5259 zone); 5260 break; 5261 // This is not a character range as defined by the spec but a 5262 // convenient shorthand for a character class that matches any 5263 // character. 5264 case '*': 5265 ranges->Add(CharacterRange::Everything(), zone); 5266 break; 5267 // This is the set of characters matched by the $ and ^ symbols 5268 // in multiline mode. 5269 case 'n': 5270 AddClass(kLineTerminatorRanges, 5271 kLineTerminatorRangeCount, 5272 ranges, 5273 zone); 5274 break; 5275 default: 5276 UNREACHABLE(); 5277 } 5278 } 5279 5280 5281 Vector<const int> CharacterRange::GetWordBounds() { 5282 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5283 } 5284 5285 5286 class CharacterRangeSplitter { 5287 public: 5288 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 5289 ZoneList<CharacterRange>** excluded, 5290 Zone* zone) 5291 : included_(included), 5292 excluded_(excluded), 5293 zone_(zone) { } 5294 void Call(uc16 from, DispatchTable::Entry entry); 5295 5296 static const int kInBase = 0; 5297 static const int kInOverlay = 1; 5298 5299 private: 5300 ZoneList<CharacterRange>** included_; 5301 ZoneList<CharacterRange>** excluded_; 5302 Zone* zone_; 5303 }; 5304 5305 5306 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 5307 if (!entry.out_set()->Get(kInBase)) return; 5308 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 5309 ? included_ 5310 : excluded_; 5311 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_); 5312 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_); 5313 } 5314 5315 5316 void CharacterRange::Split(ZoneList<CharacterRange>* base, 5317 Vector<const int> overlay, 5318 ZoneList<CharacterRange>** included, 5319 ZoneList<CharacterRange>** excluded, 5320 Zone* zone) { 5321 DCHECK_EQ(NULL, *included); 5322 DCHECK_EQ(NULL, *excluded); 5323 DispatchTable table(zone); 5324 for (int i = 0; i < base->length(); i++) 5325 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone); 5326 for (int i = 0; i < overlay.length(); i += 2) { 5327 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), 5328 CharacterRangeSplitter::kInOverlay, zone); 5329 } 5330 CharacterRangeSplitter callback(included, excluded, zone); 5331 table.ForEach(&callback); 5332 } 5333 5334 5335 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, 5336 bool is_one_byte, Zone* zone) { 5337 Isolate* isolate = zone->isolate(); 5338 uc16 bottom = from(); 5339 uc16 top = to(); 5340 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { 5341 if (bottom > String::kMaxOneByteCharCode) return; 5342 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5343 } 5344 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5345 if (top == bottom) { 5346 // If this is a singleton we just expand the one character. 5347 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5348 for (int i = 0; i < length; i++) { 5349 uc32 chr = chars[i]; 5350 if (chr != bottom) { 5351 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5352 } 5353 } 5354 } else { 5355 // If this is a range we expand the characters block by block, 5356 // expanding contiguous subranges (blocks) one at a time. 5357 // The approach is as follows. For a given start character we 5358 // look up the remainder of the block that contains it (represented 5359 // by the end point), for instance we find 'z' if the character 5360 // is 'c'. A block is characterized by the property 5361 // that all characters uncanonicalize in the same way, except that 5362 // each entry in the result is incremented by the distance from the first 5363 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 5364 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 5365 // Once we've found the end point we look up its uncanonicalization 5366 // and produce a range for each element. For instance for [c-f] 5367 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 5368 // add a range if it is not already contained in the input, so [c-f] 5369 // will be skipped but [C-F] will be added. If this range is not 5370 // completely contained in a block we do this for all the blocks 5371 // covered by the range (handling characters that is not in a block 5372 // as a "singleton block"). 5373 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5374 int pos = bottom; 5375 while (pos <= top) { 5376 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 5377 uc16 block_end; 5378 if (length == 0) { 5379 block_end = pos; 5380 } else { 5381 DCHECK_EQ(1, length); 5382 block_end = range[0]; 5383 } 5384 int end = (block_end > top) ? top : block_end; 5385 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 5386 for (int i = 0; i < length; i++) { 5387 uc32 c = range[i]; 5388 uc16 range_from = c - (block_end - pos); 5389 uc16 range_to = c - (block_end - end); 5390 if (!(bottom <= range_from && range_to <= top)) { 5391 ranges->Add(CharacterRange(range_from, range_to), zone); 5392 } 5393 } 5394 pos = end + 1; 5395 } 5396 } 5397 } 5398 5399 5400 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5401 DCHECK_NOT_NULL(ranges); 5402 int n = ranges->length(); 5403 if (n <= 1) return true; 5404 int max = ranges->at(0).to(); 5405 for (int i = 1; i < n; i++) { 5406 CharacterRange next_range = ranges->at(i); 5407 if (next_range.from() <= max + 1) return false; 5408 max = next_range.to(); 5409 } 5410 return true; 5411 } 5412 5413 5414 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5415 if (ranges_ == NULL) { 5416 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5417 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5418 } 5419 return ranges_; 5420 } 5421 5422 5423 // Move a number of elements in a zonelist to another position 5424 // in the same list. Handles overlapping source and target areas. 5425 static void MoveRanges(ZoneList<CharacterRange>* list, 5426 int from, 5427 int to, 5428 int count) { 5429 // Ranges are potentially overlapping. 5430 if (from < to) { 5431 for (int i = count - 1; i >= 0; i--) { 5432 list->at(to + i) = list->at(from + i); 5433 } 5434 } else { 5435 for (int i = 0; i < count; i++) { 5436 list->at(to + i) = list->at(from + i); 5437 } 5438 } 5439 } 5440 5441 5442 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5443 int count, 5444 CharacterRange insert) { 5445 // Inserts a range into list[0..count[, which must be sorted 5446 // by from value and non-overlapping and non-adjacent, using at most 5447 // list[0..count] for the result. Returns the number of resulting 5448 // canonicalized ranges. Inserting a range may collapse existing ranges into 5449 // fewer ranges, so the return value can be anything in the range 1..count+1. 5450 uc16 from = insert.from(); 5451 uc16 to = insert.to(); 5452 int start_pos = 0; 5453 int end_pos = count; 5454 for (int i = count - 1; i >= 0; i--) { 5455 CharacterRange current = list->at(i); 5456 if (current.from() > to + 1) { 5457 end_pos = i; 5458 } else if (current.to() + 1 < from) { 5459 start_pos = i + 1; 5460 break; 5461 } 5462 } 5463 5464 // Inserted range overlaps, or is adjacent to, ranges at positions 5465 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 5466 // not affected by the insertion. 5467 // If start_pos == end_pos, the range must be inserted before start_pos. 5468 // if start_pos < end_pos, the entire range from start_pos to end_pos 5469 // must be merged with the insert range. 5470 5471 if (start_pos == end_pos) { 5472 // Insert between existing ranges at position start_pos. 5473 if (start_pos < count) { 5474 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 5475 } 5476 list->at(start_pos) = insert; 5477 return count + 1; 5478 } 5479 if (start_pos + 1 == end_pos) { 5480 // Replace single existing range at position start_pos. 5481 CharacterRange to_replace = list->at(start_pos); 5482 int new_from = Min(to_replace.from(), from); 5483 int new_to = Max(to_replace.to(), to); 5484 list->at(start_pos) = CharacterRange(new_from, new_to); 5485 return count; 5486 } 5487 // Replace a number of existing ranges from start_pos to end_pos - 1. 5488 // Move the remaining ranges down. 5489 5490 int new_from = Min(list->at(start_pos).from(), from); 5491 int new_to = Max(list->at(end_pos - 1).to(), to); 5492 if (end_pos < count) { 5493 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 5494 } 5495 list->at(start_pos) = CharacterRange(new_from, new_to); 5496 return count - (end_pos - start_pos) + 1; 5497 } 5498 5499 5500 void CharacterSet::Canonicalize() { 5501 // Special/default classes are always considered canonical. The result 5502 // of calling ranges() will be sorted. 5503 if (ranges_ == NULL) return; 5504 CharacterRange::Canonicalize(ranges_); 5505 } 5506 5507 5508 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 5509 if (character_ranges->length() <= 1) return; 5510 // Check whether ranges are already canonical (increasing, non-overlapping, 5511 // non-adjacent). 5512 int n = character_ranges->length(); 5513 int max = character_ranges->at(0).to(); 5514 int i = 1; 5515 while (i < n) { 5516 CharacterRange current = character_ranges->at(i); 5517 if (current.from() <= max + 1) { 5518 break; 5519 } 5520 max = current.to(); 5521 i++; 5522 } 5523 // Canonical until the i'th range. If that's all of them, we are done. 5524 if (i == n) return; 5525 5526 // The ranges at index i and forward are not canonicalized. Make them so by 5527 // doing the equivalent of insertion sort (inserting each into the previous 5528 // list, in order). 5529 // Notice that inserting a range can reduce the number of ranges in the 5530 // result due to combining of adjacent and overlapping ranges. 5531 int read = i; // Range to insert. 5532 int num_canonical = i; // Length of canonicalized part of list. 5533 do { 5534 num_canonical = InsertRangeInCanonicalList(character_ranges, 5535 num_canonical, 5536 character_ranges->at(read)); 5537 read++; 5538 } while (read < n); 5539 character_ranges->Rewind(num_canonical); 5540 5541 DCHECK(CharacterRange::IsCanonical(character_ranges)); 5542 } 5543 5544 5545 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5546 ZoneList<CharacterRange>* negated_ranges, 5547 Zone* zone) { 5548 DCHECK(CharacterRange::IsCanonical(ranges)); 5549 DCHECK_EQ(0, negated_ranges->length()); 5550 int range_count = ranges->length(); 5551 uc16 from = 0; 5552 int i = 0; 5553 if (range_count > 0 && ranges->at(0).from() == 0) { 5554 from = ranges->at(0).to(); 5555 i = 1; 5556 } 5557 while (i < range_count) { 5558 CharacterRange range = ranges->at(i); 5559 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone); 5560 from = range.to(); 5561 i++; 5562 } 5563 if (from < String::kMaxUtf16CodeUnit) { 5564 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit), 5565 zone); 5566 } 5567 } 5568 5569 5570 // ------------------------------------------------------------------- 5571 // Splay tree 5572 5573 5574 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 5575 if (Get(value)) 5576 return this; 5577 if (successors(zone) != NULL) { 5578 for (int i = 0; i < successors(zone)->length(); i++) { 5579 OutSet* successor = successors(zone)->at(i); 5580 if (successor->Get(value)) 5581 return successor; 5582 } 5583 } else { 5584 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 5585 } 5586 OutSet* result = new(zone) OutSet(first_, remaining_); 5587 result->Set(value, zone); 5588 successors(zone)->Add(result, zone); 5589 return result; 5590 } 5591 5592 5593 void OutSet::Set(unsigned value, Zone *zone) { 5594 if (value < kFirstLimit) { 5595 first_ |= (1 << value); 5596 } else { 5597 if (remaining_ == NULL) 5598 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 5599 if (remaining_->is_empty() || !remaining_->Contains(value)) 5600 remaining_->Add(value, zone); 5601 } 5602 } 5603 5604 5605 bool OutSet::Get(unsigned value) const { 5606 if (value < kFirstLimit) { 5607 return (first_ & (1 << value)) != 0; 5608 } else if (remaining_ == NULL) { 5609 return false; 5610 } else { 5611 return remaining_->Contains(value); 5612 } 5613 } 5614 5615 5616 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 5617 5618 5619 void DispatchTable::AddRange(CharacterRange full_range, int value, 5620 Zone* zone) { 5621 CharacterRange current = full_range; 5622 if (tree()->is_empty()) { 5623 // If this is the first range we just insert into the table. 5624 ZoneSplayTree<Config>::Locator loc; 5625 DCHECK_RESULT(tree()->Insert(current.from(), &loc)); 5626 loc.set_value(Entry(current.from(), current.to(), 5627 empty()->Extend(value, zone))); 5628 return; 5629 } 5630 // First see if there is a range to the left of this one that 5631 // overlaps. 5632 ZoneSplayTree<Config>::Locator loc; 5633 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 5634 Entry* entry = &loc.value(); 5635 // If we've found a range that overlaps with this one, and it 5636 // starts strictly to the left of this one, we have to fix it 5637 // because the following code only handles ranges that start on 5638 // or after the start point of the range we're adding. 5639 if (entry->from() < current.from() && entry->to() >= current.from()) { 5640 // Snap the overlapping range in half around the start point of 5641 // the range we're adding. 5642 CharacterRange left(entry->from(), current.from() - 1); 5643 CharacterRange right(current.from(), entry->to()); 5644 // The left part of the overlapping range doesn't overlap. 5645 // Truncate the whole entry to be just the left part. 5646 entry->set_to(left.to()); 5647 // The right part is the one that overlaps. We add this part 5648 // to the map and let the next step deal with merging it with 5649 // the range we're adding. 5650 ZoneSplayTree<Config>::Locator loc; 5651 DCHECK_RESULT(tree()->Insert(right.from(), &loc)); 5652 loc.set_value(Entry(right.from(), 5653 right.to(), 5654 entry->out_set())); 5655 } 5656 } 5657 while (current.is_valid()) { 5658 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 5659 (loc.value().from() <= current.to()) && 5660 (loc.value().to() >= current.from())) { 5661 Entry* entry = &loc.value(); 5662 // We have overlap. If there is space between the start point of 5663 // the range we're adding and where the overlapping range starts 5664 // then we have to add a range covering just that space. 5665 if (current.from() < entry->from()) { 5666 ZoneSplayTree<Config>::Locator ins; 5667 DCHECK_RESULT(tree()->Insert(current.from(), &ins)); 5668 ins.set_value(Entry(current.from(), 5669 entry->from() - 1, 5670 empty()->Extend(value, zone))); 5671 current.set_from(entry->from()); 5672 } 5673 DCHECK_EQ(current.from(), entry->from()); 5674 // If the overlapping range extends beyond the one we want to add 5675 // we have to snap the right part off and add it separately. 5676 if (entry->to() > current.to()) { 5677 ZoneSplayTree<Config>::Locator ins; 5678 DCHECK_RESULT(tree()->Insert(current.to() + 1, &ins)); 5679 ins.set_value(Entry(current.to() + 1, 5680 entry->to(), 5681 entry->out_set())); 5682 entry->set_to(current.to()); 5683 } 5684 DCHECK(entry->to() <= current.to()); 5685 // The overlapping range is now completely contained by the range 5686 // we're adding so we can just update it and move the start point 5687 // of the range we're adding just past it. 5688 entry->AddValue(value, zone); 5689 // Bail out if the last interval ended at 0xFFFF since otherwise 5690 // adding 1 will wrap around to 0. 5691 if (entry->to() == String::kMaxUtf16CodeUnit) 5692 break; 5693 DCHECK(entry->to() + 1 > current.from()); 5694 current.set_from(entry->to() + 1); 5695 } else { 5696 // There is no overlap so we can just add the range 5697 ZoneSplayTree<Config>::Locator ins; 5698 DCHECK_RESULT(tree()->Insert(current.from(), &ins)); 5699 ins.set_value(Entry(current.from(), 5700 current.to(), 5701 empty()->Extend(value, zone))); 5702 break; 5703 } 5704 } 5705 } 5706 5707 5708 OutSet* DispatchTable::Get(uc16 value) { 5709 ZoneSplayTree<Config>::Locator loc; 5710 if (!tree()->FindGreatestLessThan(value, &loc)) 5711 return empty(); 5712 Entry* entry = &loc.value(); 5713 if (value <= entry->to()) 5714 return entry->out_set(); 5715 else 5716 return empty(); 5717 } 5718 5719 5720 // ------------------------------------------------------------------- 5721 // Analysis 5722 5723 5724 void Analysis::EnsureAnalyzed(RegExpNode* that) { 5725 StackLimitCheck check(that->zone()->isolate()); 5726 if (check.HasOverflowed()) { 5727 fail("Stack overflow"); 5728 return; 5729 } 5730 if (that->info()->been_analyzed || that->info()->being_analyzed) 5731 return; 5732 that->info()->being_analyzed = true; 5733 that->Accept(this); 5734 that->info()->being_analyzed = false; 5735 that->info()->been_analyzed = true; 5736 } 5737 5738 5739 void Analysis::VisitEnd(EndNode* that) { 5740 // nothing to do 5741 } 5742 5743 5744 void TextNode::CalculateOffsets() { 5745 int element_count = elements()->length(); 5746 // Set up the offsets of the elements relative to the start. This is a fixed 5747 // quantity since a TextNode can only contain fixed-width things. 5748 int cp_offset = 0; 5749 for (int i = 0; i < element_count; i++) { 5750 TextElement& elm = elements()->at(i); 5751 elm.set_cp_offset(cp_offset); 5752 cp_offset += elm.length(); 5753 } 5754 } 5755 5756 5757 void Analysis::VisitText(TextNode* that) { 5758 if (ignore_case_) { 5759 that->MakeCaseIndependent(is_one_byte_); 5760 } 5761 EnsureAnalyzed(that->on_success()); 5762 if (!has_failed()) { 5763 that->CalculateOffsets(); 5764 } 5765 } 5766 5767 5768 void Analysis::VisitAction(ActionNode* that) { 5769 RegExpNode* target = that->on_success(); 5770 EnsureAnalyzed(target); 5771 if (!has_failed()) { 5772 // If the next node is interested in what it follows then this node 5773 // has to be interested too so it can pass the information on. 5774 that->info()->AddFromFollowing(target->info()); 5775 } 5776 } 5777 5778 5779 void Analysis::VisitChoice(ChoiceNode* that) { 5780 NodeInfo* info = that->info(); 5781 for (int i = 0; i < that->alternatives()->length(); i++) { 5782 RegExpNode* node = that->alternatives()->at(i).node(); 5783 EnsureAnalyzed(node); 5784 if (has_failed()) return; 5785 // Anything the following nodes need to know has to be known by 5786 // this node also, so it can pass it on. 5787 info->AddFromFollowing(node->info()); 5788 } 5789 } 5790 5791 5792 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 5793 NodeInfo* info = that->info(); 5794 for (int i = 0; i < that->alternatives()->length(); i++) { 5795 RegExpNode* node = that->alternatives()->at(i).node(); 5796 if (node != that->loop_node()) { 5797 EnsureAnalyzed(node); 5798 if (has_failed()) return; 5799 info->AddFromFollowing(node->info()); 5800 } 5801 } 5802 // Check the loop last since it may need the value of this node 5803 // to get a correct result. 5804 EnsureAnalyzed(that->loop_node()); 5805 if (!has_failed()) { 5806 info->AddFromFollowing(that->loop_node()->info()); 5807 } 5808 } 5809 5810 5811 void Analysis::VisitBackReference(BackReferenceNode* that) { 5812 EnsureAnalyzed(that->on_success()); 5813 } 5814 5815 5816 void Analysis::VisitAssertion(AssertionNode* that) { 5817 EnsureAnalyzed(that->on_success()); 5818 } 5819 5820 5821 void BackReferenceNode::FillInBMInfo(int offset, 5822 int budget, 5823 BoyerMooreLookahead* bm, 5824 bool not_at_start) { 5825 // Working out the set of characters that a backreference can match is too 5826 // hard, so we just say that any character can match. 5827 bm->SetRest(offset); 5828 SaveBMInfo(bm, not_at_start, offset); 5829 } 5830 5831 5832 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 5833 RegExpMacroAssembler::kTableSize); 5834 5835 5836 void ChoiceNode::FillInBMInfo(int offset, 5837 int budget, 5838 BoyerMooreLookahead* bm, 5839 bool not_at_start) { 5840 ZoneList<GuardedAlternative>* alts = alternatives(); 5841 budget = (budget - 1) / alts->length(); 5842 for (int i = 0; i < alts->length(); i++) { 5843 GuardedAlternative& alt = alts->at(i); 5844 if (alt.guards() != NULL && alt.guards()->length() != 0) { 5845 bm->SetRest(offset); // Give up trying to fill in info. 5846 SaveBMInfo(bm, not_at_start, offset); 5847 return; 5848 } 5849 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start); 5850 } 5851 SaveBMInfo(bm, not_at_start, offset); 5852 } 5853 5854 5855 void TextNode::FillInBMInfo(int initial_offset, 5856 int budget, 5857 BoyerMooreLookahead* bm, 5858 bool not_at_start) { 5859 if (initial_offset >= bm->length()) return; 5860 int offset = initial_offset; 5861 int max_char = bm->max_char(); 5862 for (int i = 0; i < elements()->length(); i++) { 5863 if (offset >= bm->length()) { 5864 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5865 return; 5866 } 5867 TextElement text = elements()->at(i); 5868 if (text.text_type() == TextElement::ATOM) { 5869 RegExpAtom* atom = text.atom(); 5870 for (int j = 0; j < atom->length(); j++, offset++) { 5871 if (offset >= bm->length()) { 5872 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5873 return; 5874 } 5875 uc16 character = atom->data()[j]; 5876 if (bm->compiler()->ignore_case()) { 5877 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5878 int length = GetCaseIndependentLetters( 5879 Isolate::Current(), 5880 character, 5881 bm->max_char() == String::kMaxOneByteCharCode, 5882 chars); 5883 for (int j = 0; j < length; j++) { 5884 bm->Set(offset, chars[j]); 5885 } 5886 } else { 5887 if (character <= max_char) bm->Set(offset, character); 5888 } 5889 } 5890 } else { 5891 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 5892 RegExpCharacterClass* char_class = text.char_class(); 5893 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 5894 if (char_class->is_negated()) { 5895 bm->SetAll(offset); 5896 } else { 5897 for (int k = 0; k < ranges->length(); k++) { 5898 CharacterRange& range = ranges->at(k); 5899 if (range.from() > max_char) continue; 5900 int to = Min(max_char, static_cast<int>(range.to())); 5901 bm->SetInterval(offset, Interval(range.from(), to)); 5902 } 5903 } 5904 offset++; 5905 } 5906 } 5907 if (offset >= bm->length()) { 5908 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5909 return; 5910 } 5911 on_success()->FillInBMInfo(offset, 5912 budget - 1, 5913 bm, 5914 true); // Not at start after a text node. 5915 if (initial_offset == 0) set_bm_info(not_at_start, bm); 5916 } 5917 5918 5919 // ------------------------------------------------------------------- 5920 // Dispatch table construction 5921 5922 5923 void DispatchTableConstructor::VisitEnd(EndNode* that) { 5924 AddRange(CharacterRange::Everything()); 5925 } 5926 5927 5928 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 5929 node->set_being_calculated(true); 5930 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 5931 for (int i = 0; i < alternatives->length(); i++) { 5932 set_choice_index(i); 5933 alternatives->at(i).node()->Accept(this); 5934 } 5935 node->set_being_calculated(false); 5936 } 5937 5938 5939 class AddDispatchRange { 5940 public: 5941 explicit AddDispatchRange(DispatchTableConstructor* constructor) 5942 : constructor_(constructor) { } 5943 void Call(uc32 from, DispatchTable::Entry entry); 5944 private: 5945 DispatchTableConstructor* constructor_; 5946 }; 5947 5948 5949 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 5950 CharacterRange range(from, entry.to()); 5951 constructor_->AddRange(range); 5952 } 5953 5954 5955 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 5956 if (node->being_calculated()) 5957 return; 5958 DispatchTable* table = node->GetTable(ignore_case_); 5959 AddDispatchRange adder(this); 5960 table->ForEach(&adder); 5961 } 5962 5963 5964 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 5965 // TODO(160): Find the node that we refer back to and propagate its start 5966 // set back to here. For now we just accept anything. 5967 AddRange(CharacterRange::Everything()); 5968 } 5969 5970 5971 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 5972 RegExpNode* target = that->on_success(); 5973 target->Accept(this); 5974 } 5975 5976 5977 static int CompareRangeByFrom(const CharacterRange* a, 5978 const CharacterRange* b) { 5979 return Compare<uc16>(a->from(), b->from()); 5980 } 5981 5982 5983 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 5984 ranges->Sort(CompareRangeByFrom); 5985 uc16 last = 0; 5986 for (int i = 0; i < ranges->length(); i++) { 5987 CharacterRange range = ranges->at(i); 5988 if (last < range.from()) 5989 AddRange(CharacterRange(last, range.from() - 1)); 5990 if (range.to() >= last) { 5991 if (range.to() == String::kMaxUtf16CodeUnit) { 5992 return; 5993 } else { 5994 last = range.to() + 1; 5995 } 5996 } 5997 } 5998 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit)); 5999 } 6000 6001 6002 void DispatchTableConstructor::VisitText(TextNode* that) { 6003 TextElement elm = that->elements()->at(0); 6004 switch (elm.text_type()) { 6005 case TextElement::ATOM: { 6006 uc16 c = elm.atom()->data()[0]; 6007 AddRange(CharacterRange(c, c)); 6008 break; 6009 } 6010 case TextElement::CHAR_CLASS: { 6011 RegExpCharacterClass* tree = elm.char_class(); 6012 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 6013 if (tree->is_negated()) { 6014 AddInverse(ranges); 6015 } else { 6016 for (int i = 0; i < ranges->length(); i++) 6017 AddRange(ranges->at(i)); 6018 } 6019 break; 6020 } 6021 default: { 6022 UNIMPLEMENTED(); 6023 } 6024 } 6025 } 6026 6027 6028 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6029 RegExpNode* target = that->on_success(); 6030 target->Accept(this); 6031 } 6032 6033 6034 RegExpEngine::CompilationResult RegExpEngine::Compile( 6035 RegExpCompileData* data, bool ignore_case, bool is_global, 6036 bool is_multiline, bool is_sticky, Handle<String> pattern, 6037 Handle<String> sample_subject, bool is_one_byte, Zone* zone) { 6038 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6039 return IrregexpRegExpTooBig(zone->isolate()); 6040 } 6041 RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone); 6042 6043 // Sample some characters from the middle of the string. 6044 static const int kSampleSize = 128; 6045 6046 sample_subject = String::Flatten(sample_subject); 6047 int chars_sampled = 0; 6048 int half_way = (sample_subject->length() - kSampleSize) / 2; 6049 for (int i = Max(0, half_way); 6050 i < sample_subject->length() && chars_sampled < kSampleSize; 6051 i++, chars_sampled++) { 6052 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6053 } 6054 6055 // Wrap the body of the regexp in capture #0. 6056 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6057 0, 6058 &compiler, 6059 compiler.accept()); 6060 RegExpNode* node = captured_body; 6061 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6062 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6063 int max_length = data->tree->max_match(); 6064 if (!is_start_anchored && !is_sticky) { 6065 // Add a .*? at the beginning, outside the body capture, unless 6066 // this expression is anchored at the beginning or sticky. 6067 RegExpNode* loop_node = 6068 RegExpQuantifier::ToNode(0, 6069 RegExpTree::kInfinity, 6070 false, 6071 new(zone) RegExpCharacterClass('*'), 6072 &compiler, 6073 captured_body, 6074 data->contains_anchor); 6075 6076 if (data->contains_anchor) { 6077 // Unroll loop once, to take care of the case that might start 6078 // at the start of input. 6079 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6080 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6081 first_step_node->AddAlternative(GuardedAlternative( 6082 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); 6083 node = first_step_node; 6084 } else { 6085 node = loop_node; 6086 } 6087 } 6088 if (is_one_byte) { 6089 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6090 // Do it again to propagate the new nodes to places where they were not 6091 // put because they had not been calculated yet. 6092 if (node != NULL) { 6093 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6094 } 6095 } 6096 6097 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6098 data->node = node; 6099 Analysis analysis(ignore_case, is_one_byte); 6100 analysis.EnsureAnalyzed(node); 6101 if (analysis.has_failed()) { 6102 const char* error_message = analysis.error_message(); 6103 return CompilationResult(zone->isolate(), error_message); 6104 } 6105 6106 // Create the correct assembler for the architecture. 6107 #ifndef V8_INTERPRETED_REGEXP 6108 // Native regexp implementation. 6109 6110 NativeRegExpMacroAssembler::Mode mode = 6111 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 6112 : NativeRegExpMacroAssembler::UC16; 6113 6114 #if V8_TARGET_ARCH_IA32 6115 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, 6116 zone); 6117 #elif V8_TARGET_ARCH_X64 6118 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, 6119 zone); 6120 #elif V8_TARGET_ARCH_ARM 6121 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, 6122 zone); 6123 #elif V8_TARGET_ARCH_ARM64 6124 RegExpMacroAssemblerARM64 macro_assembler(mode, (data->capture_count + 1) * 2, 6125 zone); 6126 #elif V8_TARGET_ARCH_MIPS 6127 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2, 6128 zone); 6129 #elif V8_TARGET_ARCH_MIPS64 6130 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2, 6131 zone); 6132 #elif V8_TARGET_ARCH_X87 6133 RegExpMacroAssemblerX87 macro_assembler(mode, (data->capture_count + 1) * 2, 6134 zone); 6135 #else 6136 #error "Unsupported architecture" 6137 #endif 6138 6139 #else // V8_INTERPRETED_REGEXP 6140 // Interpreted regexp implementation. 6141 EmbeddedVector<byte, 1024> codes; 6142 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone); 6143 #endif // V8_INTERPRETED_REGEXP 6144 6145 // Inserted here, instead of in Assembler, because it depends on information 6146 // in the AST that isn't replicated in the Node structure. 6147 static const int kMaxBacksearchLimit = 1024; 6148 if (is_end_anchored && 6149 !is_start_anchored && 6150 max_length < kMaxBacksearchLimit) { 6151 macro_assembler.SetCurrentPositionFromEnd(max_length); 6152 } 6153 6154 if (is_global) { 6155 macro_assembler.set_global_mode( 6156 (data->tree->min_match() > 0) 6157 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK 6158 : RegExpMacroAssembler::GLOBAL); 6159 } 6160 6161 return compiler.Assemble(¯o_assembler, 6162 node, 6163 data->capture_count, 6164 pattern); 6165 } 6166 6167 6168 }} // namespace v8::internal 6169