1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/jsregexp.h" 6 7 #include <memory> 8 9 #include "src/base/platform/platform.h" 10 #include "src/compilation-cache.h" 11 #include "src/elements.h" 12 #include "src/execution.h" 13 #include "src/factory.h" 14 #include "src/isolate-inl.h" 15 #include "src/messages.h" 16 #include "src/ostreams.h" 17 #include "src/regexp/interpreter-irregexp.h" 18 #include "src/regexp/jsregexp-inl.h" 19 #include "src/regexp/regexp-macro-assembler-irregexp.h" 20 #include "src/regexp/regexp-macro-assembler-tracer.h" 21 #include "src/regexp/regexp-macro-assembler.h" 22 #include "src/regexp/regexp-parser.h" 23 #include "src/regexp/regexp-stack.h" 24 #include "src/runtime/runtime.h" 25 #include "src/splay-tree-inl.h" 26 #include "src/string-search.h" 27 #include "src/unicode-decoder.h" 28 29 #ifdef V8_I18N_SUPPORT 30 #include "unicode/uniset.h" 31 #include "unicode/utypes.h" 32 #endif // V8_I18N_SUPPORT 33 34 #ifndef V8_INTERPRETED_REGEXP 35 #if V8_TARGET_ARCH_IA32 36 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" 37 #elif V8_TARGET_ARCH_X64 38 #include "src/regexp/x64/regexp-macro-assembler-x64.h" 39 #elif V8_TARGET_ARCH_ARM64 40 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" 41 #elif V8_TARGET_ARCH_ARM 42 #include "src/regexp/arm/regexp-macro-assembler-arm.h" 43 #elif V8_TARGET_ARCH_PPC 44 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h" 45 #elif V8_TARGET_ARCH_S390 46 #include "src/regexp/s390/regexp-macro-assembler-s390.h" 47 #elif V8_TARGET_ARCH_MIPS 48 #include "src/regexp/mips/regexp-macro-assembler-mips.h" 49 #elif V8_TARGET_ARCH_MIPS64 50 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h" 51 #elif V8_TARGET_ARCH_X87 52 #include "src/regexp/x87/regexp-macro-assembler-x87.h" 53 #else 54 #error Unsupported target architecture. 55 #endif 56 #endif 57 58 59 namespace v8 { 60 namespace internal { 61 62 MUST_USE_RESULT 63 static inline MaybeHandle<Object> ThrowRegExpException( 64 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) { 65 Isolate* isolate = re->GetIsolate(); 66 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, 67 pattern, error_text), 68 Object); 69 } 70 71 72 inline void ThrowRegExpException(Handle<JSRegExp> re, 73 Handle<String> error_text) { 74 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text)); 75 } 76 77 78 ContainedInLattice AddRange(ContainedInLattice containment, 79 const int* ranges, 80 int ranges_length, 81 Interval new_range) { 82 DCHECK((ranges_length & 1) == 1); 83 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1); 84 if (containment == kLatticeUnknown) return containment; 85 bool inside = false; 86 int last = 0; 87 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 88 // Consider the range from last to ranges[i]. 89 // We haven't got to the new range yet. 90 if (ranges[i] <= new_range.from()) continue; 91 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 92 // inclusive, but the values in ranges are not. 93 if (last <= new_range.from() && new_range.to() < ranges[i]) { 94 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 95 } 96 return kLatticeUnknown; 97 } 98 return containment; 99 } 100 101 102 // More makes code generation slower, less makes V8 benchmark score lower. 103 const int kMaxLookaheadForBoyerMoore = 8; 104 // In a 3-character pattern you can maximally step forwards 3 characters 105 // at a time, which is not always enough to pay for the extra logic. 106 const int kPatternTooShortForBoyerMoore = 2; 107 108 109 // Identifies the sort of regexps where the regexp engine is faster 110 // than the code used for atom matches. 111 static bool HasFewDifferentCharacters(Handle<String> pattern) { 112 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 113 if (length <= kPatternTooShortForBoyerMoore) return false; 114 const int kMod = 128; 115 bool character_found[kMod]; 116 int different = 0; 117 memset(&character_found[0], 0, sizeof(character_found)); 118 for (int i = 0; i < length; i++) { 119 int ch = (pattern->Get(i) & (kMod - 1)); 120 if (!character_found[ch]) { 121 character_found[ch] = true; 122 different++; 123 // We declare a regexp low-alphabet if it has at least 3 times as many 124 // characters as it has different characters. 125 if (different * 3 > length) return false; 126 } 127 } 128 return true; 129 } 130 131 132 // Generic RegExp methods. Dispatches to implementation specific methods. 133 134 135 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 136 Handle<String> pattern, 137 JSRegExp::Flags flags) { 138 Isolate* isolate = re->GetIsolate(); 139 Zone zone(isolate->allocator(), ZONE_NAME); 140 CompilationCache* compilation_cache = isolate->compilation_cache(); 141 MaybeHandle<FixedArray> maybe_cached = 142 compilation_cache->LookupRegExp(pattern, flags); 143 Handle<FixedArray> cached; 144 if (maybe_cached.ToHandle(&cached)) { 145 re->set_data(*cached); 146 return re; 147 } 148 pattern = String::Flatten(pattern); 149 PostponeInterruptsScope postpone(isolate); 150 RegExpCompileData parse_result; 151 FlatStringReader reader(isolate, pattern); 152 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags, 153 &parse_result)) { 154 // Throw an exception if we fail to parse the pattern. 155 return ThrowRegExpException(re, pattern, parse_result.error); 156 } 157 158 bool has_been_compiled = false; 159 160 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && 161 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) { 162 // Parse-tree is a single atom that is equal to the pattern. 163 AtomCompile(re, pattern, flags, pattern); 164 has_been_compiled = true; 165 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) && 166 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) { 167 RegExpAtom* atom = parse_result.tree->AsAtom(); 168 Vector<const uc16> atom_pattern = atom->data(); 169 Handle<String> atom_string; 170 ASSIGN_RETURN_ON_EXCEPTION( 171 isolate, atom_string, 172 isolate->factory()->NewStringFromTwoByte(atom_pattern), 173 Object); 174 if (!HasFewDifferentCharacters(atom_string)) { 175 AtomCompile(re, pattern, flags, atom_string); 176 has_been_compiled = true; 177 } 178 } 179 if (!has_been_compiled) { 180 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 181 } 182 DCHECK(re->data()->IsFixedArray()); 183 // Compilation succeeded so the data is set on the regexp 184 // and we can store it in the cache. 185 Handle<FixedArray> data(FixedArray::cast(re->data())); 186 compilation_cache->PutRegExp(pattern, flags, data); 187 188 return re; 189 } 190 191 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 192 Handle<String> subject, int index, 193 Handle<RegExpMatchInfo> last_match_info) { 194 switch (regexp->TypeTag()) { 195 case JSRegExp::ATOM: 196 return AtomExec(regexp, subject, index, last_match_info); 197 case JSRegExp::IRREGEXP: { 198 return IrregexpExec(regexp, subject, index, last_match_info); 199 } 200 default: 201 UNREACHABLE(); 202 return MaybeHandle<Object>(); 203 } 204 } 205 206 207 // RegExp Atom implementation: Simple string search using indexOf. 208 209 210 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 211 Handle<String> pattern, 212 JSRegExp::Flags flags, 213 Handle<String> match_pattern) { 214 re->GetIsolate()->factory()->SetRegExpAtomData(re, 215 JSRegExp::ATOM, 216 pattern, 217 flags, 218 match_pattern); 219 } 220 221 static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info, 222 String* subject, int from, int to) { 223 SealHandleScope shs(last_match_info->GetIsolate()); 224 last_match_info->SetNumberOfCaptureRegisters(2); 225 last_match_info->SetLastSubject(subject); 226 last_match_info->SetLastInput(subject); 227 last_match_info->SetCapture(0, from); 228 last_match_info->SetCapture(1, to); 229 } 230 231 232 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 233 Handle<String> subject, 234 int index, 235 int32_t* output, 236 int output_size) { 237 Isolate* isolate = regexp->GetIsolate(); 238 239 DCHECK(0 <= index); 240 DCHECK(index <= subject->length()); 241 242 subject = String::Flatten(subject); 243 DisallowHeapAllocation no_gc; // ensure vectors stay valid 244 245 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 246 int needle_len = needle->length(); 247 DCHECK(needle->IsFlat()); 248 DCHECK_LT(0, needle_len); 249 250 if (index + needle_len > subject->length()) { 251 return RegExpImpl::RE_FAILURE; 252 } 253 254 for (int i = 0; i < output_size; i += 2) { 255 String::FlatContent needle_content = needle->GetFlatContent(); 256 String::FlatContent subject_content = subject->GetFlatContent(); 257 DCHECK(needle_content.IsFlat()); 258 DCHECK(subject_content.IsFlat()); 259 // dispatch on type of strings 260 index = 261 (needle_content.IsOneByte() 262 ? (subject_content.IsOneByte() 263 ? SearchString(isolate, subject_content.ToOneByteVector(), 264 needle_content.ToOneByteVector(), index) 265 : SearchString(isolate, subject_content.ToUC16Vector(), 266 needle_content.ToOneByteVector(), index)) 267 : (subject_content.IsOneByte() 268 ? SearchString(isolate, subject_content.ToOneByteVector(), 269 needle_content.ToUC16Vector(), index) 270 : SearchString(isolate, subject_content.ToUC16Vector(), 271 needle_content.ToUC16Vector(), index))); 272 if (index == -1) { 273 return i / 2; // Return number of matches. 274 } else { 275 output[i] = index; 276 output[i+1] = index + needle_len; 277 index += needle_len; 278 } 279 } 280 return output_size / 2; 281 } 282 283 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject, 284 int index, 285 Handle<RegExpMatchInfo> last_match_info) { 286 Isolate* isolate = re->GetIsolate(); 287 288 static const int kNumRegisters = 2; 289 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 290 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 291 292 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 293 294 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 295 296 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS); 297 SealHandleScope shs(isolate); 298 SetAtomLastCapture(last_match_info, *subject, output_registers[0], 299 output_registers[1]); 300 return last_match_info; 301 } 302 303 304 // Irregexp implementation. 305 306 // Ensures that the regexp object contains a compiled version of the 307 // source for either one-byte or two-byte subject strings. 308 // If the compiled version doesn't already exist, it is compiled 309 // from the source pattern. 310 // If compilation fails, an exception is thrown and this function 311 // returns false. 312 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, 313 Handle<String> sample_subject, 314 bool is_one_byte) { 315 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); 316 #ifdef V8_INTERPRETED_REGEXP 317 if (compiled_code->IsByteArray()) return true; 318 #else // V8_INTERPRETED_REGEXP (RegExp native code) 319 if (compiled_code->IsCode()) return true; 320 #endif 321 // We could potentially have marked this as flushable, but have kept 322 // a saved version if we did not flush it yet. 323 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 324 if (saved_code->IsCode()) { 325 // Reinstate the code in the original place. 326 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); 327 DCHECK(compiled_code->IsSmi()); 328 return true; 329 } 330 return CompileIrregexp(re, sample_subject, is_one_byte); 331 } 332 333 334 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 335 Handle<String> sample_subject, 336 bool is_one_byte) { 337 // Compile the RegExp. 338 Isolate* isolate = re->GetIsolate(); 339 Zone zone(isolate->allocator(), ZONE_NAME); 340 PostponeInterruptsScope postpone(isolate); 341 // If we had a compilation error the last time this is saved at the 342 // saved code index. 343 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); 344 // When arriving here entry can only be a smi, either representing an 345 // uncompiled regexp, a previous compilation error, or code that has 346 // been flushed. 347 DCHECK(entry->IsSmi()); 348 int entry_value = Smi::cast(entry)->value(); 349 DCHECK(entry_value == JSRegExp::kUninitializedValue || 350 entry_value == JSRegExp::kCompilationErrorValue || 351 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 352 353 if (entry_value == JSRegExp::kCompilationErrorValue) { 354 // A previous compilation failed and threw an error which we store in 355 // the saved code index (we store the error message, not the actual 356 // error). Recreate the error object and throw it. 357 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 358 DCHECK(error_string->IsString()); 359 Handle<String> error_message(String::cast(error_string)); 360 ThrowRegExpException(re, error_message); 361 return false; 362 } 363 364 JSRegExp::Flags flags = re->GetFlags(); 365 366 Handle<String> pattern(re->Pattern()); 367 pattern = String::Flatten(pattern); 368 RegExpCompileData compile_data; 369 FlatStringReader reader(isolate, pattern); 370 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, 371 &compile_data)) { 372 // Throw an exception if we fail to parse the pattern. 373 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 374 USE(ThrowRegExpException(re, pattern, compile_data.error)); 375 return false; 376 } 377 RegExpEngine::CompilationResult result = 378 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern, 379 sample_subject, is_one_byte); 380 if (result.error_message != NULL) { 381 // Unable to compile regexp. 382 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 383 CStrVector(result.error_message)).ToHandleChecked(); 384 ThrowRegExpException(re, error_message); 385 return false; 386 } 387 388 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 389 data->set(JSRegExp::code_index(is_one_byte), result.code); 390 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map); 391 int register_max = IrregexpMaxRegisterCount(*data); 392 if (result.num_registers > register_max) { 393 SetIrregexpMaxRegisterCount(*data, result.num_registers); 394 } 395 396 return true; 397 } 398 399 400 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 401 return Smi::cast( 402 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 403 } 404 405 406 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 407 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 408 } 409 410 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re, 411 Handle<FixedArray> value) { 412 if (value.is_null()) { 413 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero); 414 } else { 415 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value); 416 } 417 } 418 419 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 420 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 421 } 422 423 424 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 425 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 426 } 427 428 429 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { 430 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); 431 } 432 433 434 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { 435 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); 436 } 437 438 439 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 440 Handle<String> pattern, 441 JSRegExp::Flags flags, 442 int capture_count) { 443 // Initialize compiled code entries to null. 444 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 445 JSRegExp::IRREGEXP, 446 pattern, 447 flags, 448 capture_count); 449 } 450 451 452 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 453 Handle<String> subject) { 454 DCHECK(subject->IsFlat()); 455 456 // Check representation of the underlying storage. 457 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 458 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; 459 460 #ifdef V8_INTERPRETED_REGEXP 461 // Byte-code regexp needs space allocated for all its registers. 462 // The result captures are copied to the start of the registers array 463 // if the match succeeds. This way those registers are not clobbered 464 // when we set the last match info from last successful match. 465 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 466 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 467 #else // V8_INTERPRETED_REGEXP 468 // Native regexp only needs room to output captures. Registers are handled 469 // internally. 470 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 471 #endif // V8_INTERPRETED_REGEXP 472 } 473 474 475 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 476 Handle<String> subject, 477 int index, 478 int32_t* output, 479 int output_size) { 480 Isolate* isolate = regexp->GetIsolate(); 481 482 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 483 484 DCHECK(index >= 0); 485 DCHECK(index <= subject->length()); 486 DCHECK(subject->IsFlat()); 487 488 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 489 490 #ifndef V8_INTERPRETED_REGEXP 491 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 492 do { 493 EnsureCompiledIrregexp(regexp, subject, is_one_byte); 494 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); 495 // The stack is used to allocate registers for the compiled regexp code. 496 // This means that in case of failure, the output registers array is left 497 // untouched and contains the capture results from the previous successful 498 // match. We can use that to set the last match info lazily. 499 NativeRegExpMacroAssembler::Result res = 500 NativeRegExpMacroAssembler::Match(code, 501 subject, 502 output, 503 output_size, 504 index, 505 isolate); 506 if (res != NativeRegExpMacroAssembler::RETRY) { 507 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 508 isolate->has_pending_exception()); 509 STATIC_ASSERT( 510 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 511 STATIC_ASSERT( 512 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 513 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 514 == RE_EXCEPTION); 515 return static_cast<IrregexpResult>(res); 516 } 517 // If result is RETRY, the string has changed representation, and we 518 // must restart from scratch. 519 // In this case, it means we must make sure we are prepared to handle 520 // the, potentially, different subject (the string can switch between 521 // being internal and external, and even between being Latin1 and UC16, 522 // but the characters are always the same). 523 IrregexpPrepare(regexp, subject); 524 is_one_byte = subject->IsOneByteRepresentationUnderneath(); 525 } while (true); 526 UNREACHABLE(); 527 return RE_EXCEPTION; 528 #else // V8_INTERPRETED_REGEXP 529 530 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); 531 // We must have done EnsureCompiledIrregexp, so we can get the number of 532 // registers. 533 int number_of_capture_registers = 534 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 535 int32_t* raw_output = &output[number_of_capture_registers]; 536 // We do not touch the actual capture result registers until we know there 537 // has been a match so that we can use those capture results to set the 538 // last match info. 539 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 540 raw_output[i] = -1; 541 } 542 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), 543 isolate); 544 545 IrregexpResult result = IrregexpInterpreter::Match(isolate, 546 byte_codes, 547 subject, 548 raw_output, 549 index); 550 if (result == RE_SUCCESS) { 551 // Copy capture results to the start of the registers array. 552 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 553 } 554 if (result == RE_EXCEPTION) { 555 DCHECK(!isolate->has_pending_exception()); 556 isolate->StackOverflow(); 557 } 558 return result; 559 #endif // V8_INTERPRETED_REGEXP 560 } 561 562 MaybeHandle<Object> RegExpImpl::IrregexpExec( 563 Handle<JSRegExp> regexp, Handle<String> subject, int previous_index, 564 Handle<RegExpMatchInfo> last_match_info) { 565 Isolate* isolate = regexp->GetIsolate(); 566 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 567 568 subject = String::Flatten(subject); 569 570 // Prepare space for the return values. 571 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 572 if (FLAG_trace_regexp_bytecodes) { 573 String* pattern = regexp->Pattern(); 574 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 575 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 576 } 577 #endif 578 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 579 if (required_registers < 0) { 580 // Compiling failed with an exception. 581 DCHECK(isolate->has_pending_exception()); 582 return MaybeHandle<Object>(); 583 } 584 585 int32_t* output_registers = NULL; 586 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 587 output_registers = NewArray<int32_t>(required_registers); 588 } 589 std::unique_ptr<int32_t[]> auto_release(output_registers); 590 if (output_registers == NULL) { 591 output_registers = isolate->jsregexp_static_offsets_vector(); 592 } 593 594 int res = RegExpImpl::IrregexpExecRaw( 595 regexp, subject, previous_index, output_registers, required_registers); 596 if (res == RE_SUCCESS) { 597 int capture_count = 598 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 599 return SetLastMatchInfo( 600 last_match_info, subject, capture_count, output_registers); 601 } 602 if (res == RE_EXCEPTION) { 603 DCHECK(isolate->has_pending_exception()); 604 return MaybeHandle<Object>(); 605 } 606 DCHECK(res == RE_FAILURE); 607 return isolate->factory()->null_value(); 608 } 609 610 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo( 611 Handle<RegExpMatchInfo> last_match_info, Handle<String> subject, 612 int capture_count, int32_t* match) { 613 // This is the only place where match infos can grow. If, after executing the 614 // regexp, RegExpExecStub finds that the match info is too small, it restarts 615 // execution in RegExpImpl::Exec, which finally grows the match info right 616 // here. 617 618 int capture_register_count = (capture_count + 1) * 2; 619 Handle<RegExpMatchInfo> result = 620 RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count); 621 result->SetNumberOfCaptureRegisters(capture_register_count); 622 623 if (*result != *last_match_info) { 624 // The match info has been reallocated, update the corresponding reference 625 // on the native context. 626 Isolate* isolate = last_match_info->GetIsolate(); 627 if (*last_match_info == *isolate->regexp_last_match_info()) { 628 isolate->native_context()->set_regexp_last_match_info(*result); 629 } else if (*last_match_info == *isolate->regexp_internal_match_info()) { 630 isolate->native_context()->set_regexp_internal_match_info(*result); 631 } 632 } 633 634 DisallowHeapAllocation no_allocation; 635 if (match != NULL) { 636 for (int i = 0; i < capture_register_count; i += 2) { 637 result->SetCapture(i, match[i]); 638 result->SetCapture(i + 1, match[i + 1]); 639 } 640 } 641 result->SetLastSubject(*subject); 642 result->SetLastInput(*subject); 643 return result; 644 } 645 646 647 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 648 Handle<String> subject, 649 Isolate* isolate) 650 : register_array_(NULL), 651 register_array_size_(0), 652 regexp_(regexp), 653 subject_(subject) { 654 #ifdef V8_INTERPRETED_REGEXP 655 bool interpreted = true; 656 #else 657 bool interpreted = false; 658 #endif // V8_INTERPRETED_REGEXP 659 660 if (regexp_->TypeTag() == JSRegExp::ATOM) { 661 static const int kAtomRegistersPerMatch = 2; 662 registers_per_match_ = kAtomRegistersPerMatch; 663 // There is no distinction between interpreted and native for atom regexps. 664 interpreted = false; 665 } else { 666 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 667 if (registers_per_match_ < 0) { 668 num_matches_ = -1; // Signal exception. 669 return; 670 } 671 } 672 673 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal); 674 if (!interpreted) { 675 register_array_size_ = 676 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 677 max_matches_ = register_array_size_ / registers_per_match_; 678 } else { 679 // Global loop in interpreted regexp is not implemented. We choose 680 // the size of the offsets vector so that it can only store one match. 681 register_array_size_ = registers_per_match_; 682 max_matches_ = 1; 683 } 684 685 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 686 register_array_ = NewArray<int32_t>(register_array_size_); 687 } else { 688 register_array_ = isolate->jsregexp_static_offsets_vector(); 689 } 690 691 // Set state so that fetching the results the first time triggers a call 692 // to the compiled regexp. 693 current_match_index_ = max_matches_ - 1; 694 num_matches_ = max_matches_; 695 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture. 696 DCHECK_GE(register_array_size_, registers_per_match_); 697 int32_t* last_match = 698 ®ister_array_[current_match_index_ * registers_per_match_]; 699 last_match[0] = -1; 700 last_match[1] = 0; 701 } 702 703 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { 704 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 && 705 last_index + 1 < subject_->length() && 706 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && 707 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { 708 // Advance over the surrogate pair. 709 return last_index + 2; 710 } 711 return last_index + 1; 712 } 713 714 // ------------------------------------------------------------------- 715 // Implementation of the Irregexp regular expression engine. 716 // 717 // The Irregexp regular expression engine is intended to be a complete 718 // implementation of ECMAScript regular expressions. It generates either 719 // bytecodes or native code. 720 721 // The Irregexp regexp engine is structured in three steps. 722 // 1) The parser generates an abstract syntax tree. See ast.cc. 723 // 2) From the AST a node network is created. The nodes are all 724 // subclasses of RegExpNode. The nodes represent states when 725 // executing a regular expression. Several optimizations are 726 // performed on the node network. 727 // 3) From the nodes we generate either byte codes or native code 728 // that can actually execute the regular expression (perform 729 // the search). The code generation step is described in more 730 // detail below. 731 732 // Code generation. 733 // 734 // The nodes are divided into four main categories. 735 // * Choice nodes 736 // These represent places where the regular expression can 737 // match in more than one way. For example on entry to an 738 // alternation (foo|bar) or a repetition (*, +, ? or {}). 739 // * Action nodes 740 // These represent places where some action should be 741 // performed. Examples include recording the current position 742 // in the input string to a register (in order to implement 743 // captures) or other actions on register for example in order 744 // to implement the counters needed for {} repetitions. 745 // * Matching nodes 746 // These attempt to match some element part of the input string. 747 // Examples of elements include character classes, plain strings 748 // or back references. 749 // * End nodes 750 // These are used to implement the actions required on finding 751 // a successful match or failing to find a match. 752 // 753 // The code generated (whether as byte codes or native code) maintains 754 // some state as it runs. This consists of the following elements: 755 // 756 // * The capture registers. Used for string captures. 757 // * Other registers. Used for counters etc. 758 // * The current position. 759 // * The stack of backtracking information. Used when a matching node 760 // fails to find a match and needs to try an alternative. 761 // 762 // Conceptual regular expression execution model: 763 // 764 // There is a simple conceptual model of regular expression execution 765 // which will be presented first. The actual code generated is a more 766 // efficient simulation of the simple conceptual model: 767 // 768 // * Choice nodes are implemented as follows: 769 // For each choice except the last { 770 // push current position 771 // push backtrack code location 772 // <generate code to test for choice> 773 // backtrack code location: 774 // pop current position 775 // } 776 // <generate code to test for last choice> 777 // 778 // * Actions nodes are generated as follows 779 // <push affected registers on backtrack stack> 780 // <generate code to perform action> 781 // push backtrack code location 782 // <generate code to test for following nodes> 783 // backtrack code location: 784 // <pop affected registers to restore their state> 785 // <pop backtrack location from stack and go to it> 786 // 787 // * Matching nodes are generated as follows: 788 // if input string matches at current position 789 // update current position 790 // <generate code to test for following nodes> 791 // else 792 // <pop backtrack location from stack and go to it> 793 // 794 // Thus it can be seen that the current position is saved and restored 795 // by the choice nodes, whereas the registers are saved and restored by 796 // by the action nodes that manipulate them. 797 // 798 // The other interesting aspect of this model is that nodes are generated 799 // at the point where they are needed by a recursive call to Emit(). If 800 // the node has already been code generated then the Emit() call will 801 // generate a jump to the previously generated code instead. In order to 802 // limit recursion it is possible for the Emit() function to put the node 803 // on a work list for later generation and instead generate a jump. The 804 // destination of the jump is resolved later when the code is generated. 805 // 806 // Actual regular expression code generation. 807 // 808 // Code generation is actually more complicated than the above. In order 809 // to improve the efficiency of the generated code some optimizations are 810 // performed 811 // 812 // * Choice nodes have 1-character lookahead. 813 // A choice node looks at the following character and eliminates some of 814 // the choices immediately based on that character. This is not yet 815 // implemented. 816 // * Simple greedy loops store reduced backtracking information. 817 // A quantifier like /.*foo/m will greedily match the whole input. It will 818 // then need to backtrack to a point where it can match "foo". The naive 819 // implementation of this would push each character position onto the 820 // backtracking stack, then pop them off one by one. This would use space 821 // proportional to the length of the input string. However since the "." 822 // can only match in one way and always has a constant length (in this case 823 // of 1) it suffices to store the current position on the top of the stack 824 // once. Matching now becomes merely incrementing the current position and 825 // backtracking becomes decrementing the current position and checking the 826 // result against the stored current position. This is faster and saves 827 // space. 828 // * The current state is virtualized. 829 // This is used to defer expensive operations until it is clear that they 830 // are needed and to generate code for a node more than once, allowing 831 // specialized an efficient versions of the code to be created. This is 832 // explained in the section below. 833 // 834 // Execution state virtualization. 835 // 836 // Instead of emitting code, nodes that manipulate the state can record their 837 // manipulation in an object called the Trace. The Trace object can record a 838 // current position offset, an optional backtrack code location on the top of 839 // the virtualized backtrack stack and some register changes. When a node is 840 // to be emitted it can flush the Trace or update it. Flushing the Trace 841 // will emit code to bring the actual state into line with the virtual state. 842 // Avoiding flushing the state can postpone some work (e.g. updates of capture 843 // registers). Postponing work can save time when executing the regular 844 // expression since it may be found that the work never has to be done as a 845 // failure to match can occur. In addition it is much faster to jump to a 846 // known backtrack code location than it is to pop an unknown backtrack 847 // location from the stack and jump there. 848 // 849 // The virtual state found in the Trace affects code generation. For example 850 // the virtual state contains the difference between the actual current 851 // position and the virtual current position, and matching code needs to use 852 // this offset to attempt a match in the correct location of the input 853 // string. Therefore code generated for a non-trivial trace is specialized 854 // to that trace. The code generator therefore has the ability to generate 855 // code for each node several times. In order to limit the size of the 856 // generated code there is an arbitrary limit on how many specialized sets of 857 // code may be generated for a given node. If the limit is reached, the 858 // trace is flushed and a generic version of the code for a node is emitted. 859 // This is subsequently used for that node. The code emitted for non-generic 860 // trace is not recorded in the node and so it cannot currently be reused in 861 // the event that code generation is requested for an identical trace. 862 863 864 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 865 UNREACHABLE(); 866 } 867 868 869 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 870 text->AddElement(TextElement::Atom(this), zone); 871 } 872 873 874 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 875 text->AddElement(TextElement::CharClass(this), zone); 876 } 877 878 879 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 880 for (int i = 0; i < elements()->length(); i++) 881 text->AddElement(elements()->at(i), zone); 882 } 883 884 885 TextElement TextElement::Atom(RegExpAtom* atom) { 886 return TextElement(ATOM, atom); 887 } 888 889 890 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 891 return TextElement(CHAR_CLASS, char_class); 892 } 893 894 895 int TextElement::length() const { 896 switch (text_type()) { 897 case ATOM: 898 return atom()->length(); 899 900 case CHAR_CLASS: 901 return 1; 902 } 903 UNREACHABLE(); 904 return 0; 905 } 906 907 908 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 909 if (table_ == NULL) { 910 table_ = new(zone()) DispatchTable(zone()); 911 DispatchTableConstructor cons(table_, ignore_case, zone()); 912 cons.BuildTable(this); 913 } 914 return table_; 915 } 916 917 918 class FrequencyCollator { 919 public: 920 FrequencyCollator() : total_samples_(0) { 921 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 922 frequencies_[i] = CharacterFrequency(i); 923 } 924 } 925 926 void CountCharacter(int character) { 927 int index = (character & RegExpMacroAssembler::kTableMask); 928 frequencies_[index].Increment(); 929 total_samples_++; 930 } 931 932 // Does not measure in percent, but rather per-128 (the table size from the 933 // regexp macro assembler). 934 int Frequency(int in_character) { 935 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 936 if (total_samples_ < 1) return 1; // Division by zero. 937 int freq_in_per128 = 938 (frequencies_[in_character].counter() * 128) / total_samples_; 939 return freq_in_per128; 940 } 941 942 private: 943 class CharacterFrequency { 944 public: 945 CharacterFrequency() : counter_(0), character_(-1) { } 946 explicit CharacterFrequency(int character) 947 : counter_(0), character_(character) { } 948 949 void Increment() { counter_++; } 950 int counter() { return counter_; } 951 int character() { return character_; } 952 953 private: 954 int counter_; 955 int character_; 956 }; 957 958 959 private: 960 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 961 int total_samples_; 962 }; 963 964 965 class RegExpCompiler { 966 public: 967 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 968 JSRegExp::Flags flags, bool is_one_byte); 969 970 int AllocateRegister() { 971 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 972 reg_exp_too_big_ = true; 973 return next_register_; 974 } 975 return next_register_++; 976 } 977 978 // Lookarounds to match lone surrogates for unicode character class matches 979 // are never nested. We can therefore reuse registers. 980 int UnicodeLookaroundStackRegister() { 981 if (unicode_lookaround_stack_register_ == kNoRegister) { 982 unicode_lookaround_stack_register_ = AllocateRegister(); 983 } 984 return unicode_lookaround_stack_register_; 985 } 986 987 int UnicodeLookaroundPositionRegister() { 988 if (unicode_lookaround_position_register_ == kNoRegister) { 989 unicode_lookaround_position_register_ = AllocateRegister(); 990 } 991 return unicode_lookaround_position_register_; 992 } 993 994 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 995 RegExpNode* start, 996 int capture_count, 997 Handle<String> pattern); 998 999 inline void AddWork(RegExpNode* node) { 1000 if (!node->on_work_list() && !node->label()->is_bound()) { 1001 node->set_on_work_list(true); 1002 work_list_->Add(node); 1003 } 1004 } 1005 1006 static const int kImplementationOffset = 0; 1007 static const int kNumberOfRegistersOffset = 0; 1008 static const int kCodeOffset = 1; 1009 1010 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1011 EndNode* accept() { return accept_; } 1012 1013 static const int kMaxRecursion = 100; 1014 inline int recursion_depth() { return recursion_depth_; } 1015 inline void IncrementRecursionDepth() { recursion_depth_++; } 1016 inline void DecrementRecursionDepth() { recursion_depth_--; } 1017 1018 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1019 1020 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } 1021 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } 1022 inline bool one_byte() { return one_byte_; } 1023 inline bool optimize() { return optimize_; } 1024 inline void set_optimize(bool value) { optimize_ = value; } 1025 inline bool limiting_recursion() { return limiting_recursion_; } 1026 inline void set_limiting_recursion(bool value) { 1027 limiting_recursion_ = value; 1028 } 1029 bool read_backward() { return read_backward_; } 1030 void set_read_backward(bool value) { read_backward_ = value; } 1031 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1032 1033 int current_expansion_factor() { return current_expansion_factor_; } 1034 void set_current_expansion_factor(int value) { 1035 current_expansion_factor_ = value; 1036 } 1037 1038 Isolate* isolate() const { return isolate_; } 1039 Zone* zone() const { return zone_; } 1040 1041 static const int kNoRegister = -1; 1042 1043 private: 1044 EndNode* accept_; 1045 int next_register_; 1046 int unicode_lookaround_stack_register_; 1047 int unicode_lookaround_position_register_; 1048 List<RegExpNode*>* work_list_; 1049 int recursion_depth_; 1050 RegExpMacroAssembler* macro_assembler_; 1051 JSRegExp::Flags flags_; 1052 bool one_byte_; 1053 bool reg_exp_too_big_; 1054 bool limiting_recursion_; 1055 bool optimize_; 1056 bool read_backward_; 1057 int current_expansion_factor_; 1058 FrequencyCollator frequency_collator_; 1059 Isolate* isolate_; 1060 Zone* zone_; 1061 }; 1062 1063 1064 class RecursionCheck { 1065 public: 1066 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1067 compiler->IncrementRecursionDepth(); 1068 } 1069 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1070 private: 1071 RegExpCompiler* compiler_; 1072 }; 1073 1074 1075 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1076 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1077 } 1078 1079 1080 // Attempts to compile the regexp using an Irregexp code generator. Returns 1081 // a fixed array or a null handle depending on whether it succeeded. 1082 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 1083 JSRegExp::Flags flags, bool one_byte) 1084 : next_register_(2 * (capture_count + 1)), 1085 unicode_lookaround_stack_register_(kNoRegister), 1086 unicode_lookaround_position_register_(kNoRegister), 1087 work_list_(NULL), 1088 recursion_depth_(0), 1089 flags_(flags), 1090 one_byte_(one_byte), 1091 reg_exp_too_big_(false), 1092 limiting_recursion_(false), 1093 optimize_(FLAG_regexp_optimization), 1094 read_backward_(false), 1095 current_expansion_factor_(1), 1096 frequency_collator_(), 1097 isolate_(isolate), 1098 zone_(zone) { 1099 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1100 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1101 } 1102 1103 1104 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1105 RegExpMacroAssembler* macro_assembler, 1106 RegExpNode* start, 1107 int capture_count, 1108 Handle<String> pattern) { 1109 Isolate* isolate = pattern->GetHeap()->isolate(); 1110 1111 #ifdef DEBUG 1112 if (FLAG_trace_regexp_assembler) 1113 macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler); 1114 else 1115 #endif 1116 macro_assembler_ = macro_assembler; 1117 1118 List <RegExpNode*> work_list(0); 1119 work_list_ = &work_list; 1120 Label fail; 1121 macro_assembler_->PushBacktrack(&fail); 1122 Trace new_trace; 1123 start->Emit(this, &new_trace); 1124 macro_assembler_->Bind(&fail); 1125 macro_assembler_->Fail(); 1126 while (!work_list.is_empty()) { 1127 RegExpNode* node = work_list.RemoveLast(); 1128 node->set_on_work_list(false); 1129 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 1130 } 1131 if (reg_exp_too_big_) { 1132 macro_assembler_->AbortedCodeGeneration(); 1133 return IrregexpRegExpTooBig(isolate_); 1134 } 1135 1136 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1137 isolate->IncreaseTotalRegexpCodeGenerated(code->Size()); 1138 work_list_ = NULL; 1139 #ifdef ENABLE_DISASSEMBLER 1140 if (FLAG_print_code) { 1141 CodeTracer::Scope trace_scope(isolate->GetCodeTracer()); 1142 OFStream os(trace_scope.file()); 1143 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os); 1144 } 1145 #endif 1146 #ifdef DEBUG 1147 if (FLAG_trace_regexp_assembler) { 1148 delete macro_assembler_; 1149 } 1150 #endif 1151 return RegExpEngine::CompilationResult(*code, next_register_); 1152 } 1153 1154 1155 bool Trace::DeferredAction::Mentions(int that) { 1156 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1157 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1158 return range.Contains(that); 1159 } else { 1160 return reg() == that; 1161 } 1162 } 1163 1164 1165 bool Trace::mentions_reg(int reg) { 1166 for (DeferredAction* action = actions_; 1167 action != NULL; 1168 action = action->next()) { 1169 if (action->Mentions(reg)) 1170 return true; 1171 } 1172 return false; 1173 } 1174 1175 1176 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1177 DCHECK_EQ(0, *cp_offset); 1178 for (DeferredAction* action = actions_; 1179 action != NULL; 1180 action = action->next()) { 1181 if (action->Mentions(reg)) { 1182 if (action->action_type() == ActionNode::STORE_POSITION) { 1183 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1184 return true; 1185 } else { 1186 return false; 1187 } 1188 } 1189 } 1190 return false; 1191 } 1192 1193 1194 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1195 Zone* zone) { 1196 int max_register = RegExpCompiler::kNoRegister; 1197 for (DeferredAction* action = actions_; 1198 action != NULL; 1199 action = action->next()) { 1200 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1201 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1202 for (int i = range.from(); i <= range.to(); i++) 1203 affected_registers->Set(i, zone); 1204 if (range.to() > max_register) max_register = range.to(); 1205 } else { 1206 affected_registers->Set(action->reg(), zone); 1207 if (action->reg() > max_register) max_register = action->reg(); 1208 } 1209 } 1210 return max_register; 1211 } 1212 1213 1214 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1215 int max_register, 1216 const OutSet& registers_to_pop, 1217 const OutSet& registers_to_clear) { 1218 for (int reg = max_register; reg >= 0; reg--) { 1219 if (registers_to_pop.Get(reg)) { 1220 assembler->PopRegister(reg); 1221 } else if (registers_to_clear.Get(reg)) { 1222 int clear_to = reg; 1223 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1224 reg--; 1225 } 1226 assembler->ClearRegisters(reg, clear_to); 1227 } 1228 } 1229 } 1230 1231 1232 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1233 int max_register, 1234 const OutSet& affected_registers, 1235 OutSet* registers_to_pop, 1236 OutSet* registers_to_clear, 1237 Zone* zone) { 1238 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1239 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1240 1241 // Count pushes performed to force a stack limit check occasionally. 1242 int pushes = 0; 1243 1244 for (int reg = 0; reg <= max_register; reg++) { 1245 if (!affected_registers.Get(reg)) { 1246 continue; 1247 } 1248 1249 // The chronologically first deferred action in the trace 1250 // is used to infer the action needed to restore a register 1251 // to its previous state (or not, if it's safe to ignore it). 1252 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1253 DeferredActionUndoType undo_action = IGNORE; 1254 1255 int value = 0; 1256 bool absolute = false; 1257 bool clear = false; 1258 static const int kNoStore = kMinInt; 1259 int store_position = kNoStore; 1260 // This is a little tricky because we are scanning the actions in reverse 1261 // historical order (newest first). 1262 for (DeferredAction* action = actions_; 1263 action != NULL; 1264 action = action->next()) { 1265 if (action->Mentions(reg)) { 1266 switch (action->action_type()) { 1267 case ActionNode::SET_REGISTER: { 1268 Trace::DeferredSetRegister* psr = 1269 static_cast<Trace::DeferredSetRegister*>(action); 1270 if (!absolute) { 1271 value += psr->value(); 1272 absolute = true; 1273 } 1274 // SET_REGISTER is currently only used for newly introduced loop 1275 // counters. They can have a significant previous value if they 1276 // occour in a loop. TODO(lrn): Propagate this information, so 1277 // we can set undo_action to IGNORE if we know there is no value to 1278 // restore. 1279 undo_action = RESTORE; 1280 DCHECK_EQ(store_position, kNoStore); 1281 DCHECK(!clear); 1282 break; 1283 } 1284 case ActionNode::INCREMENT_REGISTER: 1285 if (!absolute) { 1286 value++; 1287 } 1288 DCHECK_EQ(store_position, kNoStore); 1289 DCHECK(!clear); 1290 undo_action = RESTORE; 1291 break; 1292 case ActionNode::STORE_POSITION: { 1293 Trace::DeferredCapture* pc = 1294 static_cast<Trace::DeferredCapture*>(action); 1295 if (!clear && store_position == kNoStore) { 1296 store_position = pc->cp_offset(); 1297 } 1298 1299 // For captures we know that stores and clears alternate. 1300 // Other register, are never cleared, and if the occur 1301 // inside a loop, they might be assigned more than once. 1302 if (reg <= 1) { 1303 // Registers zero and one, aka "capture zero", is 1304 // always set correctly if we succeed. There is no 1305 // need to undo a setting on backtrack, because we 1306 // will set it again or fail. 1307 undo_action = IGNORE; 1308 } else { 1309 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1310 } 1311 DCHECK(!absolute); 1312 DCHECK_EQ(value, 0); 1313 break; 1314 } 1315 case ActionNode::CLEAR_CAPTURES: { 1316 // Since we're scanning in reverse order, if we've already 1317 // set the position we have to ignore historically earlier 1318 // clearing operations. 1319 if (store_position == kNoStore) { 1320 clear = true; 1321 } 1322 undo_action = RESTORE; 1323 DCHECK(!absolute); 1324 DCHECK_EQ(value, 0); 1325 break; 1326 } 1327 default: 1328 UNREACHABLE(); 1329 break; 1330 } 1331 } 1332 } 1333 // Prepare for the undo-action (e.g., push if it's going to be popped). 1334 if (undo_action == RESTORE) { 1335 pushes++; 1336 RegExpMacroAssembler::StackCheckFlag stack_check = 1337 RegExpMacroAssembler::kNoStackLimitCheck; 1338 if (pushes == push_limit) { 1339 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1340 pushes = 0; 1341 } 1342 1343 assembler->PushRegister(reg, stack_check); 1344 registers_to_pop->Set(reg, zone); 1345 } else if (undo_action == CLEAR) { 1346 registers_to_clear->Set(reg, zone); 1347 } 1348 // Perform the chronologically last action (or accumulated increment) 1349 // for the register. 1350 if (store_position != kNoStore) { 1351 assembler->WriteCurrentPositionToRegister(reg, store_position); 1352 } else if (clear) { 1353 assembler->ClearRegisters(reg, reg); 1354 } else if (absolute) { 1355 assembler->SetRegister(reg, value); 1356 } else if (value != 0) { 1357 assembler->AdvanceRegister(reg, value); 1358 } 1359 } 1360 } 1361 1362 1363 // This is called as we come into a loop choice node and some other tricky 1364 // nodes. It normalizes the state of the code generator to ensure we can 1365 // generate generic code. 1366 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1367 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1368 1369 DCHECK(!is_trivial()); 1370 1371 if (actions_ == NULL && backtrack() == NULL) { 1372 // Here we just have some deferred cp advances to fix and we are back to 1373 // a normal situation. We may also have to forget some information gained 1374 // through a quick check that was already performed. 1375 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1376 // Create a new trivial state and generate the node with that. 1377 Trace new_state; 1378 successor->Emit(compiler, &new_state); 1379 return; 1380 } 1381 1382 // Generate deferred actions here along with code to undo them again. 1383 OutSet affected_registers; 1384 1385 if (backtrack() != NULL) { 1386 // Here we have a concrete backtrack location. These are set up by choice 1387 // nodes and so they indicate that we have a deferred save of the current 1388 // position which we may need to emit here. 1389 assembler->PushCurrentPosition(); 1390 } 1391 1392 int max_register = FindAffectedRegisters(&affected_registers, 1393 compiler->zone()); 1394 OutSet registers_to_pop; 1395 OutSet registers_to_clear; 1396 PerformDeferredActions(assembler, 1397 max_register, 1398 affected_registers, 1399 ®isters_to_pop, 1400 ®isters_to_clear, 1401 compiler->zone()); 1402 if (cp_offset_ != 0) { 1403 assembler->AdvanceCurrentPosition(cp_offset_); 1404 } 1405 1406 // Create a new trivial state and generate the node with that. 1407 Label undo; 1408 assembler->PushBacktrack(&undo); 1409 if (successor->KeepRecursing(compiler)) { 1410 Trace new_state; 1411 successor->Emit(compiler, &new_state); 1412 } else { 1413 compiler->AddWork(successor); 1414 assembler->GoTo(successor->label()); 1415 } 1416 1417 // On backtrack we need to restore state. 1418 assembler->Bind(&undo); 1419 RestoreAffectedRegisters(assembler, 1420 max_register, 1421 registers_to_pop, 1422 registers_to_clear); 1423 if (backtrack() == NULL) { 1424 assembler->Backtrack(); 1425 } else { 1426 assembler->PopCurrentPosition(); 1427 assembler->GoTo(backtrack()); 1428 } 1429 } 1430 1431 1432 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1433 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1434 1435 // Omit flushing the trace. We discard the entire stack frame anyway. 1436 1437 if (!label()->is_bound()) { 1438 // We are completely independent of the trace, since we ignore it, 1439 // so this code can be used as the generic version. 1440 assembler->Bind(label()); 1441 } 1442 1443 // Throw away everything on the backtrack stack since the start 1444 // of the negative submatch and restore the character position. 1445 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1446 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1447 if (clear_capture_count_ > 0) { 1448 // Clear any captures that might have been performed during the success 1449 // of the body of the negative look-ahead. 1450 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1451 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1452 } 1453 // Now that we have unwound the stack we find at the top of the stack the 1454 // backtrack that the BeginSubmatch node got. 1455 assembler->Backtrack(); 1456 } 1457 1458 1459 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1460 if (!trace->is_trivial()) { 1461 trace->Flush(compiler, this); 1462 return; 1463 } 1464 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1465 if (!label()->is_bound()) { 1466 assembler->Bind(label()); 1467 } 1468 switch (action_) { 1469 case ACCEPT: 1470 assembler->Succeed(); 1471 return; 1472 case BACKTRACK: 1473 assembler->GoTo(trace->backtrack()); 1474 return; 1475 case NEGATIVE_SUBMATCH_SUCCESS: 1476 // This case is handled in a different virtual method. 1477 UNREACHABLE(); 1478 } 1479 UNIMPLEMENTED(); 1480 } 1481 1482 1483 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1484 if (guards_ == NULL) 1485 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1486 guards_->Add(guard, zone); 1487 } 1488 1489 1490 ActionNode* ActionNode::SetRegister(int reg, 1491 int val, 1492 RegExpNode* on_success) { 1493 ActionNode* result = 1494 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1495 result->data_.u_store_register.reg = reg; 1496 result->data_.u_store_register.value = val; 1497 return result; 1498 } 1499 1500 1501 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1502 ActionNode* result = 1503 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1504 result->data_.u_increment_register.reg = reg; 1505 return result; 1506 } 1507 1508 1509 ActionNode* ActionNode::StorePosition(int reg, 1510 bool is_capture, 1511 RegExpNode* on_success) { 1512 ActionNode* result = 1513 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1514 result->data_.u_position_register.reg = reg; 1515 result->data_.u_position_register.is_capture = is_capture; 1516 return result; 1517 } 1518 1519 1520 ActionNode* ActionNode::ClearCaptures(Interval range, 1521 RegExpNode* on_success) { 1522 ActionNode* result = 1523 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1524 result->data_.u_clear_captures.range_from = range.from(); 1525 result->data_.u_clear_captures.range_to = range.to(); 1526 return result; 1527 } 1528 1529 1530 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1531 int position_reg, 1532 RegExpNode* on_success) { 1533 ActionNode* result = 1534 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1535 result->data_.u_submatch.stack_pointer_register = stack_reg; 1536 result->data_.u_submatch.current_position_register = position_reg; 1537 return result; 1538 } 1539 1540 1541 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1542 int position_reg, 1543 int clear_register_count, 1544 int clear_register_from, 1545 RegExpNode* on_success) { 1546 ActionNode* result = 1547 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1548 result->data_.u_submatch.stack_pointer_register = stack_reg; 1549 result->data_.u_submatch.current_position_register = position_reg; 1550 result->data_.u_submatch.clear_register_count = clear_register_count; 1551 result->data_.u_submatch.clear_register_from = clear_register_from; 1552 return result; 1553 } 1554 1555 1556 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1557 int repetition_register, 1558 int repetition_limit, 1559 RegExpNode* on_success) { 1560 ActionNode* result = 1561 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1562 result->data_.u_empty_match_check.start_register = start_register; 1563 result->data_.u_empty_match_check.repetition_register = repetition_register; 1564 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1565 return result; 1566 } 1567 1568 1569 #define DEFINE_ACCEPT(Type) \ 1570 void Type##Node::Accept(NodeVisitor* visitor) { \ 1571 visitor->Visit##Type(this); \ 1572 } 1573 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1574 #undef DEFINE_ACCEPT 1575 1576 1577 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1578 visitor->VisitLoopChoice(this); 1579 } 1580 1581 1582 // ------------------------------------------------------------------- 1583 // Emit code. 1584 1585 1586 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1587 Guard* guard, 1588 Trace* trace) { 1589 switch (guard->op()) { 1590 case Guard::LT: 1591 DCHECK(!trace->mentions_reg(guard->reg())); 1592 macro_assembler->IfRegisterGE(guard->reg(), 1593 guard->value(), 1594 trace->backtrack()); 1595 break; 1596 case Guard::GEQ: 1597 DCHECK(!trace->mentions_reg(guard->reg())); 1598 macro_assembler->IfRegisterLT(guard->reg(), 1599 guard->value(), 1600 trace->backtrack()); 1601 break; 1602 } 1603 } 1604 1605 1606 // Returns the number of characters in the equivalence class, omitting those 1607 // that cannot occur in the source string because it is Latin1. 1608 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1609 bool one_byte_subject, 1610 unibrow::uchar* letters) { 1611 int length = 1612 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1613 // Unibrow returns 0 or 1 for characters where case independence is 1614 // trivial. 1615 if (length == 0) { 1616 letters[0] = character; 1617 length = 1; 1618 } 1619 1620 if (one_byte_subject) { 1621 int new_length = 0; 1622 for (int i = 0; i < length; i++) { 1623 if (letters[i] <= String::kMaxOneByteCharCode) { 1624 letters[new_length++] = letters[i]; 1625 } 1626 } 1627 length = new_length; 1628 } 1629 1630 return length; 1631 } 1632 1633 1634 static inline bool EmitSimpleCharacter(Isolate* isolate, 1635 RegExpCompiler* compiler, 1636 uc16 c, 1637 Label* on_failure, 1638 int cp_offset, 1639 bool check, 1640 bool preloaded) { 1641 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1642 bool bound_checked = false; 1643 if (!preloaded) { 1644 assembler->LoadCurrentCharacter( 1645 cp_offset, 1646 on_failure, 1647 check); 1648 bound_checked = true; 1649 } 1650 assembler->CheckNotCharacter(c, on_failure); 1651 return bound_checked; 1652 } 1653 1654 1655 // Only emits non-letters (things that don't have case). Only used for case 1656 // independent matches. 1657 static inline bool EmitAtomNonLetter(Isolate* isolate, 1658 RegExpCompiler* compiler, 1659 uc16 c, 1660 Label* on_failure, 1661 int cp_offset, 1662 bool check, 1663 bool preloaded) { 1664 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1665 bool one_byte = compiler->one_byte(); 1666 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1667 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1668 if (length < 1) { 1669 // This can't match. Must be an one-byte subject and a non-one-byte 1670 // character. We do not need to do anything since the one-byte pass 1671 // already handled this. 1672 return false; // Bounds not checked. 1673 } 1674 bool checked = false; 1675 // We handle the length > 1 case in a later pass. 1676 if (length == 1) { 1677 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1678 // Can't match - see above. 1679 return false; // Bounds not checked. 1680 } 1681 if (!preloaded) { 1682 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1683 checked = check; 1684 } 1685 macro_assembler->CheckNotCharacter(c, on_failure); 1686 } 1687 return checked; 1688 } 1689 1690 1691 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1692 bool one_byte, uc16 c1, uc16 c2, 1693 Label* on_failure) { 1694 uc16 char_mask; 1695 if (one_byte) { 1696 char_mask = String::kMaxOneByteCharCode; 1697 } else { 1698 char_mask = String::kMaxUtf16CodeUnit; 1699 } 1700 uc16 exor = c1 ^ c2; 1701 // Check whether exor has only one bit set. 1702 if (((exor - 1) & exor) == 0) { 1703 // If c1 and c2 differ only by one bit. 1704 // Ecma262UnCanonicalize always gives the highest number last. 1705 DCHECK(c2 > c1); 1706 uc16 mask = char_mask ^ exor; 1707 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1708 return true; 1709 } 1710 DCHECK(c2 > c1); 1711 uc16 diff = c2 - c1; 1712 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1713 // If the characters differ by 2^n but don't differ by one bit then 1714 // subtract the difference from the found character, then do the or 1715 // trick. We avoid the theoretical case where negative numbers are 1716 // involved in order to simplify code generation. 1717 uc16 mask = char_mask ^ diff; 1718 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1719 diff, 1720 mask, 1721 on_failure); 1722 return true; 1723 } 1724 return false; 1725 } 1726 1727 1728 typedef bool EmitCharacterFunction(Isolate* isolate, 1729 RegExpCompiler* compiler, 1730 uc16 c, 1731 Label* on_failure, 1732 int cp_offset, 1733 bool check, 1734 bool preloaded); 1735 1736 // Only emits letters (things that have case). Only used for case independent 1737 // matches. 1738 static inline bool EmitAtomLetter(Isolate* isolate, 1739 RegExpCompiler* compiler, 1740 uc16 c, 1741 Label* on_failure, 1742 int cp_offset, 1743 bool check, 1744 bool preloaded) { 1745 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1746 bool one_byte = compiler->one_byte(); 1747 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1748 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1749 if (length <= 1) return false; 1750 // We may not need to check against the end of the input string 1751 // if this character lies before a character that matched. 1752 if (!preloaded) { 1753 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1754 } 1755 Label ok; 1756 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1757 switch (length) { 1758 case 2: { 1759 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1760 chars[1], on_failure)) { 1761 } else { 1762 macro_assembler->CheckCharacter(chars[0], &ok); 1763 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1764 macro_assembler->Bind(&ok); 1765 } 1766 break; 1767 } 1768 case 4: 1769 macro_assembler->CheckCharacter(chars[3], &ok); 1770 // Fall through! 1771 case 3: 1772 macro_assembler->CheckCharacter(chars[0], &ok); 1773 macro_assembler->CheckCharacter(chars[1], &ok); 1774 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1775 macro_assembler->Bind(&ok); 1776 break; 1777 default: 1778 UNREACHABLE(); 1779 break; 1780 } 1781 return true; 1782 } 1783 1784 1785 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1786 int border, 1787 Label* fall_through, 1788 Label* above_or_equal, 1789 Label* below) { 1790 if (below != fall_through) { 1791 masm->CheckCharacterLT(border, below); 1792 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1793 } else { 1794 masm->CheckCharacterGT(border - 1, above_or_equal); 1795 } 1796 } 1797 1798 1799 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1800 int first, 1801 int last, 1802 Label* fall_through, 1803 Label* in_range, 1804 Label* out_of_range) { 1805 if (in_range == fall_through) { 1806 if (first == last) { 1807 masm->CheckNotCharacter(first, out_of_range); 1808 } else { 1809 masm->CheckCharacterNotInRange(first, last, out_of_range); 1810 } 1811 } else { 1812 if (first == last) { 1813 masm->CheckCharacter(first, in_range); 1814 } else { 1815 masm->CheckCharacterInRange(first, last, in_range); 1816 } 1817 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1818 } 1819 } 1820 1821 1822 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1823 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1824 static void EmitUseLookupTable( 1825 RegExpMacroAssembler* masm, 1826 ZoneList<int>* ranges, 1827 int start_index, 1828 int end_index, 1829 int min_char, 1830 Label* fall_through, 1831 Label* even_label, 1832 Label* odd_label) { 1833 static const int kSize = RegExpMacroAssembler::kTableSize; 1834 static const int kMask = RegExpMacroAssembler::kTableMask; 1835 1836 int base = (min_char & ~kMask); 1837 USE(base); 1838 1839 // Assert that everything is on one kTableSize page. 1840 for (int i = start_index; i <= end_index; i++) { 1841 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1842 } 1843 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1844 1845 char templ[kSize]; 1846 Label* on_bit_set; 1847 Label* on_bit_clear; 1848 int bit; 1849 if (even_label == fall_through) { 1850 on_bit_set = odd_label; 1851 on_bit_clear = even_label; 1852 bit = 1; 1853 } else { 1854 on_bit_set = even_label; 1855 on_bit_clear = odd_label; 1856 bit = 0; 1857 } 1858 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1859 templ[i] = bit; 1860 } 1861 int j = 0; 1862 bit ^= 1; 1863 for (int i = start_index; i < end_index; i++) { 1864 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1865 templ[j] = bit; 1866 } 1867 bit ^= 1; 1868 } 1869 for (int i = j; i < kSize; i++) { 1870 templ[i] = bit; 1871 } 1872 Factory* factory = masm->isolate()->factory(); 1873 // TODO(erikcorry): Cache these. 1874 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1875 for (int i = 0; i < kSize; i++) { 1876 ba->set(i, templ[i]); 1877 } 1878 masm->CheckBitInTable(ba, on_bit_set); 1879 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1880 } 1881 1882 1883 static void CutOutRange(RegExpMacroAssembler* masm, 1884 ZoneList<int>* ranges, 1885 int start_index, 1886 int end_index, 1887 int cut_index, 1888 Label* even_label, 1889 Label* odd_label) { 1890 bool odd = (((cut_index - start_index) & 1) == 1); 1891 Label* in_range_label = odd ? odd_label : even_label; 1892 Label dummy; 1893 EmitDoubleBoundaryTest(masm, 1894 ranges->at(cut_index), 1895 ranges->at(cut_index + 1) - 1, 1896 &dummy, 1897 in_range_label, 1898 &dummy); 1899 DCHECK(!dummy.is_linked()); 1900 // Cut out the single range by rewriting the array. This creates a new 1901 // range that is a merger of the two ranges on either side of the one we 1902 // are cutting out. The oddity of the labels is preserved. 1903 for (int j = cut_index; j > start_index; j--) { 1904 ranges->at(j) = ranges->at(j - 1); 1905 } 1906 for (int j = cut_index + 1; j < end_index; j++) { 1907 ranges->at(j) = ranges->at(j + 1); 1908 } 1909 } 1910 1911 1912 // Unicode case. Split the search space into kSize spaces that are handled 1913 // with recursion. 1914 static void SplitSearchSpace(ZoneList<int>* ranges, 1915 int start_index, 1916 int end_index, 1917 int* new_start_index, 1918 int* new_end_index, 1919 int* border) { 1920 static const int kSize = RegExpMacroAssembler::kTableSize; 1921 static const int kMask = RegExpMacroAssembler::kTableMask; 1922 1923 int first = ranges->at(start_index); 1924 int last = ranges->at(end_index) - 1; 1925 1926 *new_start_index = start_index; 1927 *border = (ranges->at(start_index) & ~kMask) + kSize; 1928 while (*new_start_index < end_index) { 1929 if (ranges->at(*new_start_index) > *border) break; 1930 (*new_start_index)++; 1931 } 1932 // new_start_index is the index of the first edge that is beyond the 1933 // current kSize space. 1934 1935 // For very large search spaces we do a binary chop search of the non-Latin1 1936 // space instead of just going to the end of the current kSize space. The 1937 // heuristics are complicated a little by the fact that any 128-character 1938 // encoding space can be quickly tested with a table lookup, so we don't 1939 // wish to do binary chop search at a smaller granularity than that. A 1940 // 128-character space can take up a lot of space in the ranges array if, 1941 // for example, we only want to match every second character (eg. the lower 1942 // case characters on some Unicode pages). 1943 int binary_chop_index = (end_index + start_index) / 2; 1944 // The first test ensures that we get to the code that handles the Latin1 1945 // range with a single not-taken branch, speeding up this important 1946 // character range (even non-Latin1 charset-based text has spaces and 1947 // punctuation). 1948 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1949 end_index - start_index > (*new_start_index - start_index) * 2 && 1950 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1951 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1952 int scan_forward_for_section_border = binary_chop_index;; 1953 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1954 1955 while (scan_forward_for_section_border < end_index) { 1956 if (ranges->at(scan_forward_for_section_border) > new_border) { 1957 *new_start_index = scan_forward_for_section_border; 1958 *border = new_border; 1959 break; 1960 } 1961 scan_forward_for_section_border++; 1962 } 1963 } 1964 1965 DCHECK(*new_start_index > start_index); 1966 *new_end_index = *new_start_index - 1; 1967 if (ranges->at(*new_end_index) == *border) { 1968 (*new_end_index)--; 1969 } 1970 if (*border >= ranges->at(end_index)) { 1971 *border = ranges->at(end_index); 1972 *new_start_index = end_index; // Won't be used. 1973 *new_end_index = end_index - 1; 1974 } 1975 } 1976 1977 1978 // Gets a series of segment boundaries representing a character class. If the 1979 // character is in the range between an even and an odd boundary (counting from 1980 // start_index) then go to even_label, otherwise go to odd_label. We already 1981 // know that the character is in the range of min_char to max_char inclusive. 1982 // Either label can be NULL indicating backtracking. Either label can also be 1983 // equal to the fall_through label. 1984 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges, 1985 int start_index, int end_index, uc32 min_char, 1986 uc32 max_char, Label* fall_through, 1987 Label* even_label, Label* odd_label) { 1988 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); 1989 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); 1990 1991 int first = ranges->at(start_index); 1992 int last = ranges->at(end_index) - 1; 1993 1994 DCHECK_LT(min_char, first); 1995 1996 // Just need to test if the character is before or on-or-after 1997 // a particular character. 1998 if (start_index == end_index) { 1999 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 2000 return; 2001 } 2002 2003 // Another almost trivial case: There is one interval in the middle that is 2004 // different from the end intervals. 2005 if (start_index + 1 == end_index) { 2006 EmitDoubleBoundaryTest( 2007 masm, first, last, fall_through, even_label, odd_label); 2008 return; 2009 } 2010 2011 // It's not worth using table lookup if there are very few intervals in the 2012 // character class. 2013 if (end_index - start_index <= 6) { 2014 // It is faster to test for individual characters, so we look for those 2015 // first, then try arbitrary ranges in the second round. 2016 static int kNoCutIndex = -1; 2017 int cut = kNoCutIndex; 2018 for (int i = start_index; i < end_index; i++) { 2019 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2020 cut = i; 2021 break; 2022 } 2023 } 2024 if (cut == kNoCutIndex) cut = start_index; 2025 CutOutRange( 2026 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2027 DCHECK_GE(end_index - start_index, 2); 2028 GenerateBranches(masm, 2029 ranges, 2030 start_index + 1, 2031 end_index - 1, 2032 min_char, 2033 max_char, 2034 fall_through, 2035 even_label, 2036 odd_label); 2037 return; 2038 } 2039 2040 // If there are a lot of intervals in the regexp, then we will use tables to 2041 // determine whether the character is inside or outside the character class. 2042 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2043 2044 if ((max_char >> kBits) == (min_char >> kBits)) { 2045 EmitUseLookupTable(masm, 2046 ranges, 2047 start_index, 2048 end_index, 2049 min_char, 2050 fall_through, 2051 even_label, 2052 odd_label); 2053 return; 2054 } 2055 2056 if ((min_char >> kBits) != (first >> kBits)) { 2057 masm->CheckCharacterLT(first, odd_label); 2058 GenerateBranches(masm, 2059 ranges, 2060 start_index + 1, 2061 end_index, 2062 first, 2063 max_char, 2064 fall_through, 2065 odd_label, 2066 even_label); 2067 return; 2068 } 2069 2070 int new_start_index = 0; 2071 int new_end_index = 0; 2072 int border = 0; 2073 2074 SplitSearchSpace(ranges, 2075 start_index, 2076 end_index, 2077 &new_start_index, 2078 &new_end_index, 2079 &border); 2080 2081 Label handle_rest; 2082 Label* above = &handle_rest; 2083 if (border == last + 1) { 2084 // We didn't find any section that started after the limit, so everything 2085 // above the border is one of the terminal labels. 2086 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2087 DCHECK(new_end_index == end_index - 1); 2088 } 2089 2090 DCHECK_LE(start_index, new_end_index); 2091 DCHECK_LE(new_start_index, end_index); 2092 DCHECK_LT(start_index, new_start_index); 2093 DCHECK_LT(new_end_index, end_index); 2094 DCHECK(new_end_index + 1 == new_start_index || 2095 (new_end_index + 2 == new_start_index && 2096 border == ranges->at(new_end_index + 1))); 2097 DCHECK_LT(min_char, border - 1); 2098 DCHECK_LT(border, max_char); 2099 DCHECK_LT(ranges->at(new_end_index), border); 2100 DCHECK(border < ranges->at(new_start_index) || 2101 (border == ranges->at(new_start_index) && 2102 new_start_index == end_index && 2103 new_end_index == end_index - 1 && 2104 border == last + 1)); 2105 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2106 2107 masm->CheckCharacterGT(border - 1, above); 2108 Label dummy; 2109 GenerateBranches(masm, 2110 ranges, 2111 start_index, 2112 new_end_index, 2113 min_char, 2114 border - 1, 2115 &dummy, 2116 even_label, 2117 odd_label); 2118 if (handle_rest.is_linked()) { 2119 masm->Bind(&handle_rest); 2120 bool flip = (new_start_index & 1) != (start_index & 1); 2121 GenerateBranches(masm, 2122 ranges, 2123 new_start_index, 2124 end_index, 2125 border, 2126 max_char, 2127 &dummy, 2128 flip ? odd_label : even_label, 2129 flip ? even_label : odd_label); 2130 } 2131 } 2132 2133 2134 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2135 RegExpCharacterClass* cc, bool one_byte, 2136 Label* on_failure, int cp_offset, bool check_offset, 2137 bool preloaded, Zone* zone) { 2138 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2139 CharacterRange::Canonicalize(ranges); 2140 2141 int max_char; 2142 if (one_byte) { 2143 max_char = String::kMaxOneByteCharCode; 2144 } else { 2145 max_char = String::kMaxUtf16CodeUnit; 2146 } 2147 2148 int range_count = ranges->length(); 2149 2150 int last_valid_range = range_count - 1; 2151 while (last_valid_range >= 0) { 2152 CharacterRange& range = ranges->at(last_valid_range); 2153 if (range.from() <= max_char) { 2154 break; 2155 } 2156 last_valid_range--; 2157 } 2158 2159 if (last_valid_range < 0) { 2160 if (!cc->is_negated()) { 2161 macro_assembler->GoTo(on_failure); 2162 } 2163 if (check_offset) { 2164 macro_assembler->CheckPosition(cp_offset, on_failure); 2165 } 2166 return; 2167 } 2168 2169 if (last_valid_range == 0 && 2170 ranges->at(0).IsEverything(max_char)) { 2171 if (cc->is_negated()) { 2172 macro_assembler->GoTo(on_failure); 2173 } else { 2174 // This is a common case hit by non-anchored expressions. 2175 if (check_offset) { 2176 macro_assembler->CheckPosition(cp_offset, on_failure); 2177 } 2178 } 2179 return; 2180 } 2181 2182 if (!preloaded) { 2183 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2184 } 2185 2186 if (cc->is_standard(zone) && 2187 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2188 on_failure)) { 2189 return; 2190 } 2191 2192 2193 // A new list with ascending entries. Each entry is a code unit 2194 // where there is a boundary between code units that are part of 2195 // the class and code units that are not. Normally we insert an 2196 // entry at zero which goes to the failure label, but if there 2197 // was already one there we fall through for success on that entry. 2198 // Subsequent entries have alternating meaning (success/failure). 2199 ZoneList<int>* range_boundaries = 2200 new(zone) ZoneList<int>(last_valid_range, zone); 2201 2202 bool zeroth_entry_is_failure = !cc->is_negated(); 2203 2204 for (int i = 0; i <= last_valid_range; i++) { 2205 CharacterRange& range = ranges->at(i); 2206 if (range.from() == 0) { 2207 DCHECK_EQ(i, 0); 2208 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2209 } else { 2210 range_boundaries->Add(range.from(), zone); 2211 } 2212 range_boundaries->Add(range.to() + 1, zone); 2213 } 2214 int end_index = range_boundaries->length() - 1; 2215 if (range_boundaries->at(end_index) > max_char) { 2216 end_index--; 2217 } 2218 2219 Label fall_through; 2220 GenerateBranches(macro_assembler, 2221 range_boundaries, 2222 0, // start_index. 2223 end_index, 2224 0, // min_char. 2225 max_char, 2226 &fall_through, 2227 zeroth_entry_is_failure ? &fall_through : on_failure, 2228 zeroth_entry_is_failure ? on_failure : &fall_through); 2229 macro_assembler->Bind(&fall_through); 2230 } 2231 2232 2233 RegExpNode::~RegExpNode() { 2234 } 2235 2236 2237 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2238 Trace* trace) { 2239 // If we are generating a greedy loop then don't stop and don't reuse code. 2240 if (trace->stop_node() != NULL) { 2241 return CONTINUE; 2242 } 2243 2244 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2245 if (trace->is_trivial()) { 2246 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 2247 // If a generic version is already scheduled to be generated or we have 2248 // recursed too deeply then just generate a jump to that code. 2249 macro_assembler->GoTo(&label_); 2250 // This will queue it up for generation of a generic version if it hasn't 2251 // already been queued. 2252 compiler->AddWork(this); 2253 return DONE; 2254 } 2255 // Generate generic version of the node and bind the label for later use. 2256 macro_assembler->Bind(&label_); 2257 return CONTINUE; 2258 } 2259 2260 // We are being asked to make a non-generic version. Keep track of how many 2261 // non-generic versions we generate so as not to overdo it. 2262 trace_count_++; 2263 if (KeepRecursing(compiler) && compiler->optimize() && 2264 trace_count_ < kMaxCopiesCodeGenerated) { 2265 return CONTINUE; 2266 } 2267 2268 // If we get here code has been generated for this node too many times or 2269 // recursion is too deep. Time to switch to a generic version. The code for 2270 // generic versions above can handle deep recursion properly. 2271 bool was_limiting = compiler->limiting_recursion(); 2272 compiler->set_limiting_recursion(true); 2273 trace->Flush(compiler, this); 2274 compiler->set_limiting_recursion(was_limiting); 2275 return DONE; 2276 } 2277 2278 2279 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 2280 return !compiler->limiting_recursion() && 2281 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 2282 } 2283 2284 2285 int ActionNode::EatsAtLeast(int still_to_find, 2286 int budget, 2287 bool not_at_start) { 2288 if (budget <= 0) return 0; 2289 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2290 return on_success()->EatsAtLeast(still_to_find, 2291 budget - 1, 2292 not_at_start); 2293 } 2294 2295 2296 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2297 BoyerMooreLookahead* bm, bool not_at_start) { 2298 if (action_type_ == BEGIN_SUBMATCH) { 2299 bm->SetRest(offset); 2300 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2301 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2302 } 2303 SaveBMInfo(bm, not_at_start, offset); 2304 } 2305 2306 2307 int AssertionNode::EatsAtLeast(int still_to_find, 2308 int budget, 2309 bool not_at_start) { 2310 if (budget <= 0) return 0; 2311 // If we know we are not at the start and we are asked "how many characters 2312 // will you match if you succeed?" then we can answer anything since false 2313 // implies false. So lets just return the max answer (still_to_find) since 2314 // that won't prevent us from preloading a lot of characters for the other 2315 // branches in the node graph. 2316 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2317 return on_success()->EatsAtLeast(still_to_find, 2318 budget - 1, 2319 not_at_start); 2320 } 2321 2322 2323 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2324 BoyerMooreLookahead* bm, bool not_at_start) { 2325 // Match the behaviour of EatsAtLeast on this node. 2326 if (assertion_type() == AT_START && not_at_start) return; 2327 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2328 SaveBMInfo(bm, not_at_start, offset); 2329 } 2330 2331 2332 int BackReferenceNode::EatsAtLeast(int still_to_find, 2333 int budget, 2334 bool not_at_start) { 2335 if (read_backward()) return 0; 2336 if (budget <= 0) return 0; 2337 return on_success()->EatsAtLeast(still_to_find, 2338 budget - 1, 2339 not_at_start); 2340 } 2341 2342 2343 int TextNode::EatsAtLeast(int still_to_find, 2344 int budget, 2345 bool not_at_start) { 2346 if (read_backward()) return 0; 2347 int answer = Length(); 2348 if (answer >= still_to_find) return answer; 2349 if (budget <= 0) return answer; 2350 // We are not at start after this node so we set the last argument to 'true'. 2351 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2352 budget - 1, 2353 true); 2354 } 2355 2356 2357 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget, 2358 bool not_at_start) { 2359 if (budget <= 0) return 0; 2360 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2361 // afterwards. 2362 RegExpNode* node = alternatives_->at(1).node(); 2363 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2364 } 2365 2366 2367 void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 2368 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 2369 bool not_at_start) { 2370 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2371 // afterwards. 2372 RegExpNode* node = alternatives_->at(1).node(); 2373 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2374 } 2375 2376 2377 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2378 int budget, 2379 RegExpNode* ignore_this_node, 2380 bool not_at_start) { 2381 if (budget <= 0) return 0; 2382 int min = 100; 2383 int choice_count = alternatives_->length(); 2384 budget = (budget - 1) / choice_count; 2385 for (int i = 0; i < choice_count; i++) { 2386 RegExpNode* node = alternatives_->at(i).node(); 2387 if (node == ignore_this_node) continue; 2388 int node_eats_at_least = 2389 node->EatsAtLeast(still_to_find, budget, not_at_start); 2390 if (node_eats_at_least < min) min = node_eats_at_least; 2391 if (min == 0) return 0; 2392 } 2393 return min; 2394 } 2395 2396 2397 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2398 int budget, 2399 bool not_at_start) { 2400 return EatsAtLeastHelper(still_to_find, 2401 budget - 1, 2402 loop_node_, 2403 not_at_start); 2404 } 2405 2406 2407 int ChoiceNode::EatsAtLeast(int still_to_find, 2408 int budget, 2409 bool not_at_start) { 2410 return EatsAtLeastHelper(still_to_find, 2411 budget, 2412 NULL, 2413 not_at_start); 2414 } 2415 2416 2417 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2418 static inline uint32_t SmearBitsRight(uint32_t v) { 2419 v |= v >> 1; 2420 v |= v >> 2; 2421 v |= v >> 4; 2422 v |= v >> 8; 2423 v |= v >> 16; 2424 return v; 2425 } 2426 2427 2428 bool QuickCheckDetails::Rationalize(bool asc) { 2429 bool found_useful_op = false; 2430 uint32_t char_mask; 2431 if (asc) { 2432 char_mask = String::kMaxOneByteCharCode; 2433 } else { 2434 char_mask = String::kMaxUtf16CodeUnit; 2435 } 2436 mask_ = 0; 2437 value_ = 0; 2438 int char_shift = 0; 2439 for (int i = 0; i < characters_; i++) { 2440 Position* pos = &positions_[i]; 2441 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2442 found_useful_op = true; 2443 } 2444 mask_ |= (pos->mask & char_mask) << char_shift; 2445 value_ |= (pos->value & char_mask) << char_shift; 2446 char_shift += asc ? 8 : 16; 2447 } 2448 return found_useful_op; 2449 } 2450 2451 2452 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2453 Trace* bounds_check_trace, 2454 Trace* trace, 2455 bool preload_has_checked_bounds, 2456 Label* on_possible_success, 2457 QuickCheckDetails* details, 2458 bool fall_through_on_failure) { 2459 if (details->characters() == 0) return false; 2460 GetQuickCheckDetails( 2461 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2462 if (details->cannot_match()) return false; 2463 if (!details->Rationalize(compiler->one_byte())) return false; 2464 DCHECK(details->characters() == 1 || 2465 compiler->macro_assembler()->CanReadUnaligned()); 2466 uint32_t mask = details->mask(); 2467 uint32_t value = details->value(); 2468 2469 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2470 2471 if (trace->characters_preloaded() != details->characters()) { 2472 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 2473 // We are attempting to preload the minimum number of characters 2474 // any choice would eat, so if the bounds check fails, then none of the 2475 // choices can succeed, so we can just immediately backtrack, rather 2476 // than go to the next choice. 2477 assembler->LoadCurrentCharacter(trace->cp_offset(), 2478 bounds_check_trace->backtrack(), 2479 !preload_has_checked_bounds, 2480 details->characters()); 2481 } 2482 2483 2484 bool need_mask = true; 2485 2486 if (details->characters() == 1) { 2487 // If number of characters preloaded is 1 then we used a byte or 16 bit 2488 // load so the value is already masked down. 2489 uint32_t char_mask; 2490 if (compiler->one_byte()) { 2491 char_mask = String::kMaxOneByteCharCode; 2492 } else { 2493 char_mask = String::kMaxUtf16CodeUnit; 2494 } 2495 if ((mask & char_mask) == char_mask) need_mask = false; 2496 mask &= char_mask; 2497 } else { 2498 // For 2-character preloads in one-byte mode or 1-character preloads in 2499 // two-byte mode we also use a 16 bit load with zero extend. 2500 static const uint32_t kTwoByteMask = 0xffff; 2501 static const uint32_t kFourByteMask = 0xffffffff; 2502 if (details->characters() == 2 && compiler->one_byte()) { 2503 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2504 } else if (details->characters() == 1 && !compiler->one_byte()) { 2505 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2506 } else { 2507 if (mask == kFourByteMask) need_mask = false; 2508 } 2509 } 2510 2511 if (fall_through_on_failure) { 2512 if (need_mask) { 2513 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2514 } else { 2515 assembler->CheckCharacter(value, on_possible_success); 2516 } 2517 } else { 2518 if (need_mask) { 2519 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2520 } else { 2521 assembler->CheckNotCharacter(value, trace->backtrack()); 2522 } 2523 } 2524 return true; 2525 } 2526 2527 2528 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2529 // super-class in the .h file). 2530 // 2531 // We iterate along the text object, building up for each character a 2532 // mask and value that can be used to test for a quick failure to match. 2533 // The masks and values for the positions will be combined into a single 2534 // machine word for the current character width in order to be used in 2535 // generating a quick check. 2536 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2537 RegExpCompiler* compiler, 2538 int characters_filled_in, 2539 bool not_at_start) { 2540 // Do not collect any quick check details if the text node reads backward, 2541 // since it reads in the opposite direction than we use for quick checks. 2542 if (read_backward()) return; 2543 Isolate* isolate = compiler->macro_assembler()->isolate(); 2544 DCHECK(characters_filled_in < details->characters()); 2545 int characters = details->characters(); 2546 int char_mask; 2547 if (compiler->one_byte()) { 2548 char_mask = String::kMaxOneByteCharCode; 2549 } else { 2550 char_mask = String::kMaxUtf16CodeUnit; 2551 } 2552 for (int k = 0; k < elements()->length(); k++) { 2553 TextElement elm = elements()->at(k); 2554 if (elm.text_type() == TextElement::ATOM) { 2555 Vector<const uc16> quarks = elm.atom()->data(); 2556 for (int i = 0; i < characters && i < quarks.length(); i++) { 2557 QuickCheckDetails::Position* pos = 2558 details->positions(characters_filled_in); 2559 uc16 c = quarks[i]; 2560 if (compiler->ignore_case()) { 2561 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2562 int length = GetCaseIndependentLetters(isolate, c, 2563 compiler->one_byte(), chars); 2564 if (length == 0) { 2565 // This can happen because all case variants are non-Latin1, but we 2566 // know the input is Latin1. 2567 details->set_cannot_match(); 2568 pos->determines_perfectly = false; 2569 return; 2570 } 2571 if (length == 1) { 2572 // This letter has no case equivalents, so it's nice and simple 2573 // and the mask-compare will determine definitely whether we have 2574 // a match at this character position. 2575 pos->mask = char_mask; 2576 pos->value = c; 2577 pos->determines_perfectly = true; 2578 } else { 2579 uint32_t common_bits = char_mask; 2580 uint32_t bits = chars[0]; 2581 for (int j = 1; j < length; j++) { 2582 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2583 common_bits ^= differing_bits; 2584 bits &= common_bits; 2585 } 2586 // If length is 2 and common bits has only one zero in it then 2587 // our mask and compare instruction will determine definitely 2588 // whether we have a match at this character position. Otherwise 2589 // it can only be an approximate check. 2590 uint32_t one_zero = (common_bits | ~char_mask); 2591 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2592 pos->determines_perfectly = true; 2593 } 2594 pos->mask = common_bits; 2595 pos->value = bits; 2596 } 2597 } else { 2598 // Don't ignore case. Nice simple case where the mask-compare will 2599 // determine definitely whether we have a match at this character 2600 // position. 2601 if (c > char_mask) { 2602 details->set_cannot_match(); 2603 pos->determines_perfectly = false; 2604 return; 2605 } 2606 pos->mask = char_mask; 2607 pos->value = c; 2608 pos->determines_perfectly = true; 2609 } 2610 characters_filled_in++; 2611 DCHECK(characters_filled_in <= details->characters()); 2612 if (characters_filled_in == details->characters()) { 2613 return; 2614 } 2615 } 2616 } else { 2617 QuickCheckDetails::Position* pos = 2618 details->positions(characters_filled_in); 2619 RegExpCharacterClass* tree = elm.char_class(); 2620 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2621 if (tree->is_negated()) { 2622 // A quick check uses multi-character mask and compare. There is no 2623 // useful way to incorporate a negative char class into this scheme 2624 // so we just conservatively create a mask and value that will always 2625 // succeed. 2626 pos->mask = 0; 2627 pos->value = 0; 2628 } else { 2629 int first_range = 0; 2630 while (ranges->at(first_range).from() > char_mask) { 2631 first_range++; 2632 if (first_range == ranges->length()) { 2633 details->set_cannot_match(); 2634 pos->determines_perfectly = false; 2635 return; 2636 } 2637 } 2638 CharacterRange range = ranges->at(first_range); 2639 uc16 from = range.from(); 2640 uc16 to = range.to(); 2641 if (to > char_mask) { 2642 to = char_mask; 2643 } 2644 uint32_t differing_bits = (from ^ to); 2645 // A mask and compare is only perfect if the differing bits form a 2646 // number like 00011111 with one single block of trailing 1s. 2647 if ((differing_bits & (differing_bits + 1)) == 0 && 2648 from + differing_bits == to) { 2649 pos->determines_perfectly = true; 2650 } 2651 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2652 uint32_t bits = (from & common_bits); 2653 for (int i = first_range + 1; i < ranges->length(); i++) { 2654 CharacterRange range = ranges->at(i); 2655 uc16 from = range.from(); 2656 uc16 to = range.to(); 2657 if (from > char_mask) continue; 2658 if (to > char_mask) to = char_mask; 2659 // Here we are combining more ranges into the mask and compare 2660 // value. With each new range the mask becomes more sparse and 2661 // so the chances of a false positive rise. A character class 2662 // with multiple ranges is assumed never to be equivalent to a 2663 // mask and compare operation. 2664 pos->determines_perfectly = false; 2665 uint32_t new_common_bits = (from ^ to); 2666 new_common_bits = ~SmearBitsRight(new_common_bits); 2667 common_bits &= new_common_bits; 2668 bits &= new_common_bits; 2669 uint32_t differing_bits = (from & common_bits) ^ bits; 2670 common_bits ^= differing_bits; 2671 bits &= common_bits; 2672 } 2673 pos->mask = common_bits; 2674 pos->value = bits; 2675 } 2676 characters_filled_in++; 2677 DCHECK(characters_filled_in <= details->characters()); 2678 if (characters_filled_in == details->characters()) { 2679 return; 2680 } 2681 } 2682 } 2683 DCHECK(characters_filled_in != details->characters()); 2684 if (!details->cannot_match()) { 2685 on_success()-> GetQuickCheckDetails(details, 2686 compiler, 2687 characters_filled_in, 2688 true); 2689 } 2690 } 2691 2692 2693 void QuickCheckDetails::Clear() { 2694 for (int i = 0; i < characters_; i++) { 2695 positions_[i].mask = 0; 2696 positions_[i].value = 0; 2697 positions_[i].determines_perfectly = false; 2698 } 2699 characters_ = 0; 2700 } 2701 2702 2703 void QuickCheckDetails::Advance(int by, bool one_byte) { 2704 if (by >= characters_ || by < 0) { 2705 DCHECK_IMPLIES(by < 0, characters_ == 0); 2706 Clear(); 2707 return; 2708 } 2709 DCHECK_LE(characters_ - by, 4); 2710 DCHECK_LE(characters_, 4); 2711 for (int i = 0; i < characters_ - by; i++) { 2712 positions_[i] = positions_[by + i]; 2713 } 2714 for (int i = characters_ - by; i < characters_; i++) { 2715 positions_[i].mask = 0; 2716 positions_[i].value = 0; 2717 positions_[i].determines_perfectly = false; 2718 } 2719 characters_ -= by; 2720 // We could change mask_ and value_ here but we would never advance unless 2721 // they had already been used in a check and they won't be used again because 2722 // it would gain us nothing. So there's no point. 2723 } 2724 2725 2726 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2727 DCHECK(characters_ == other->characters_); 2728 if (other->cannot_match_) { 2729 return; 2730 } 2731 if (cannot_match_) { 2732 *this = *other; 2733 return; 2734 } 2735 for (int i = from_index; i < characters_; i++) { 2736 QuickCheckDetails::Position* pos = positions(i); 2737 QuickCheckDetails::Position* other_pos = other->positions(i); 2738 if (pos->mask != other_pos->mask || 2739 pos->value != other_pos->value || 2740 !other_pos->determines_perfectly) { 2741 // Our mask-compare operation will be approximate unless we have the 2742 // exact same operation on both sides of the alternation. 2743 pos->determines_perfectly = false; 2744 } 2745 pos->mask &= other_pos->mask; 2746 pos->value &= pos->mask; 2747 other_pos->value &= pos->mask; 2748 uc16 differing_bits = (pos->value ^ other_pos->value); 2749 pos->mask &= ~differing_bits; 2750 pos->value &= pos->mask; 2751 } 2752 } 2753 2754 2755 class VisitMarker { 2756 public: 2757 explicit VisitMarker(NodeInfo* info) : info_(info) { 2758 DCHECK(!info->visited); 2759 info->visited = true; 2760 } 2761 ~VisitMarker() { 2762 info_->visited = false; 2763 } 2764 private: 2765 NodeInfo* info_; 2766 }; 2767 2768 2769 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { 2770 if (info()->replacement_calculated) return replacement(); 2771 if (depth < 0) return this; 2772 DCHECK(!info()->visited); 2773 VisitMarker marker(info()); 2774 return FilterSuccessor(depth - 1, ignore_case); 2775 } 2776 2777 2778 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2779 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2780 if (next == NULL) return set_replacement(NULL); 2781 on_success_ = next; 2782 return set_replacement(this); 2783 } 2784 2785 2786 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2787 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2788 // TODO(dcarney): this could be a lot more efficient. 2789 return range.Contains(0x39c) || 2790 range.Contains(0x3bc) || range.Contains(0x178); 2791 } 2792 2793 2794 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2795 for (int i = 0; i < ranges->length(); i++) { 2796 // TODO(dcarney): this could be a lot more efficient. 2797 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2798 } 2799 return false; 2800 } 2801 2802 2803 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2804 if (info()->replacement_calculated) return replacement(); 2805 if (depth < 0) return this; 2806 DCHECK(!info()->visited); 2807 VisitMarker marker(info()); 2808 int element_count = elements()->length(); 2809 for (int i = 0; i < element_count; i++) { 2810 TextElement elm = elements()->at(i); 2811 if (elm.text_type() == TextElement::ATOM) { 2812 Vector<const uc16> quarks = elm.atom()->data(); 2813 for (int j = 0; j < quarks.length(); j++) { 2814 uint16_t c = quarks[j]; 2815 if (c <= String::kMaxOneByteCharCode) continue; 2816 if (!ignore_case) return set_replacement(NULL); 2817 // Here, we need to check for characters whose upper and lower cases 2818 // are outside the Latin-1 range. 2819 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2820 // Character is outside Latin-1 completely 2821 if (converted == 0) return set_replacement(NULL); 2822 // Convert quark to Latin-1 in place. 2823 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2824 copy[j] = converted; 2825 } 2826 } else { 2827 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2828 RegExpCharacterClass* cc = elm.char_class(); 2829 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2830 CharacterRange::Canonicalize(ranges); 2831 // Now they are in order so we only need to look at the first. 2832 int range_count = ranges->length(); 2833 if (cc->is_negated()) { 2834 if (range_count != 0 && 2835 ranges->at(0).from() == 0 && 2836 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2837 // This will be handled in a later filter. 2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2839 return set_replacement(NULL); 2840 } 2841 } else { 2842 if (range_count == 0 || 2843 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2844 // This will be handled in a later filter. 2845 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2846 return set_replacement(NULL); 2847 } 2848 } 2849 } 2850 } 2851 return FilterSuccessor(depth - 1, ignore_case); 2852 } 2853 2854 2855 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2856 if (info()->replacement_calculated) return replacement(); 2857 if (depth < 0) return this; 2858 if (info()->visited) return this; 2859 { 2860 VisitMarker marker(info()); 2861 2862 RegExpNode* continue_replacement = 2863 continue_node_->FilterOneByte(depth - 1, ignore_case); 2864 // If we can't continue after the loop then there is no sense in doing the 2865 // loop. 2866 if (continue_replacement == NULL) return set_replacement(NULL); 2867 } 2868 2869 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2870 } 2871 2872 2873 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2874 if (info()->replacement_calculated) return replacement(); 2875 if (depth < 0) return this; 2876 if (info()->visited) return this; 2877 VisitMarker marker(info()); 2878 int choice_count = alternatives_->length(); 2879 2880 for (int i = 0; i < choice_count; i++) { 2881 GuardedAlternative alternative = alternatives_->at(i); 2882 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2883 set_replacement(this); 2884 return this; 2885 } 2886 } 2887 2888 int surviving = 0; 2889 RegExpNode* survivor = NULL; 2890 for (int i = 0; i < choice_count; i++) { 2891 GuardedAlternative alternative = alternatives_->at(i); 2892 RegExpNode* replacement = 2893 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2894 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2895 if (replacement != NULL) { 2896 alternatives_->at(i).set_node(replacement); 2897 surviving++; 2898 survivor = replacement; 2899 } 2900 } 2901 if (surviving < 2) return set_replacement(survivor); 2902 2903 set_replacement(this); 2904 if (surviving == choice_count) { 2905 return this; 2906 } 2907 // Only some of the nodes survived the filtering. We need to rebuild the 2908 // alternatives list. 2909 ZoneList<GuardedAlternative>* new_alternatives = 2910 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2911 for (int i = 0; i < choice_count; i++) { 2912 RegExpNode* replacement = 2913 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2914 if (replacement != NULL) { 2915 alternatives_->at(i).set_node(replacement); 2916 new_alternatives->Add(alternatives_->at(i), zone()); 2917 } 2918 } 2919 alternatives_ = new_alternatives; 2920 return this; 2921 } 2922 2923 2924 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 2925 bool ignore_case) { 2926 if (info()->replacement_calculated) return replacement(); 2927 if (depth < 0) return this; 2928 if (info()->visited) return this; 2929 VisitMarker marker(info()); 2930 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2931 // afterwards. 2932 RegExpNode* node = alternatives_->at(1).node(); 2933 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2934 if (replacement == NULL) return set_replacement(NULL); 2935 alternatives_->at(1).set_node(replacement); 2936 2937 RegExpNode* neg_node = alternatives_->at(0).node(); 2938 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2939 // If the negative lookahead is always going to fail then 2940 // we don't need to check it. 2941 if (neg_replacement == NULL) return set_replacement(replacement); 2942 alternatives_->at(0).set_node(neg_replacement); 2943 return set_replacement(this); 2944 } 2945 2946 2947 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2948 RegExpCompiler* compiler, 2949 int characters_filled_in, 2950 bool not_at_start) { 2951 if (body_can_be_zero_length_ || info()->visited) return; 2952 VisitMarker marker(info()); 2953 return ChoiceNode::GetQuickCheckDetails(details, 2954 compiler, 2955 characters_filled_in, 2956 not_at_start); 2957 } 2958 2959 2960 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2961 BoyerMooreLookahead* bm, bool not_at_start) { 2962 if (body_can_be_zero_length_ || budget <= 0) { 2963 bm->SetRest(offset); 2964 SaveBMInfo(bm, not_at_start, offset); 2965 return; 2966 } 2967 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2968 SaveBMInfo(bm, not_at_start, offset); 2969 } 2970 2971 2972 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2973 RegExpCompiler* compiler, 2974 int characters_filled_in, 2975 bool not_at_start) { 2976 not_at_start = (not_at_start || not_at_start_); 2977 int choice_count = alternatives_->length(); 2978 DCHECK(choice_count > 0); 2979 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2980 compiler, 2981 characters_filled_in, 2982 not_at_start); 2983 for (int i = 1; i < choice_count; i++) { 2984 QuickCheckDetails new_details(details->characters()); 2985 RegExpNode* node = alternatives_->at(i).node(); 2986 node->GetQuickCheckDetails(&new_details, compiler, 2987 characters_filled_in, 2988 not_at_start); 2989 // Here we merge the quick match details of the two branches. 2990 details->Merge(&new_details, characters_filled_in); 2991 } 2992 } 2993 2994 2995 // Check for [0-9A-Z_a-z]. 2996 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2997 Label* word, 2998 Label* non_word, 2999 bool fall_through_on_word) { 3000 if (assembler->CheckSpecialCharacterClass( 3001 fall_through_on_word ? 'w' : 'W', 3002 fall_through_on_word ? non_word : word)) { 3003 // Optimized implementation available. 3004 return; 3005 } 3006 assembler->CheckCharacterGT('z', non_word); 3007 assembler->CheckCharacterLT('0', non_word); 3008 assembler->CheckCharacterGT('a' - 1, word); 3009 assembler->CheckCharacterLT('9' + 1, word); 3010 assembler->CheckCharacterLT('A', non_word); 3011 assembler->CheckCharacterLT('Z' + 1, word); 3012 if (fall_through_on_word) { 3013 assembler->CheckNotCharacter('_', non_word); 3014 } else { 3015 assembler->CheckCharacter('_', word); 3016 } 3017 } 3018 3019 3020 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 3021 // that matches newline or the start of input). 3022 static void EmitHat(RegExpCompiler* compiler, 3023 RegExpNode* on_success, 3024 Trace* trace) { 3025 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3026 // We will be loading the previous character into the current character 3027 // register. 3028 Trace new_trace(*trace); 3029 new_trace.InvalidateCurrentCharacter(); 3030 3031 Label ok; 3032 if (new_trace.cp_offset() == 0) { 3033 // The start of input counts as a newline in this context, so skip to 3034 // ok if we are at the start. 3035 assembler->CheckAtStart(&ok); 3036 } 3037 // We already checked that we are not at the start of input so it must be 3038 // OK to load the previous character. 3039 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3040 new_trace.backtrack(), 3041 false); 3042 if (!assembler->CheckSpecialCharacterClass('n', 3043 new_trace.backtrack())) { 3044 // Newline means \n, \r, 0x2028 or 0x2029. 3045 if (!compiler->one_byte()) { 3046 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3047 } 3048 assembler->CheckCharacter('\n', &ok); 3049 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3050 } 3051 assembler->Bind(&ok); 3052 on_success->Emit(compiler, &new_trace); 3053 } 3054 3055 3056 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3057 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3058 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3059 Isolate* isolate = assembler->isolate(); 3060 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3061 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3062 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3063 if (lookahead == NULL) { 3064 int eats_at_least = 3065 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3066 kRecursionBudget, 3067 not_at_start)); 3068 if (eats_at_least >= 1) { 3069 BoyerMooreLookahead* bm = 3070 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3071 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 3072 if (bm->at(0)->is_non_word()) 3073 next_is_word_character = Trace::FALSE_VALUE; 3074 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3075 } 3076 } else { 3077 if (lookahead->at(0)->is_non_word()) 3078 next_is_word_character = Trace::FALSE_VALUE; 3079 if (lookahead->at(0)->is_word()) 3080 next_is_word_character = Trace::TRUE_VALUE; 3081 } 3082 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3083 if (next_is_word_character == Trace::UNKNOWN) { 3084 Label before_non_word; 3085 Label before_word; 3086 if (trace->characters_preloaded() != 1) { 3087 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3088 } 3089 // Fall through on non-word. 3090 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3091 // Next character is not a word character. 3092 assembler->Bind(&before_non_word); 3093 Label ok; 3094 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3095 assembler->GoTo(&ok); 3096 3097 assembler->Bind(&before_word); 3098 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3099 assembler->Bind(&ok); 3100 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3101 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3102 } else { 3103 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 3104 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3105 } 3106 } 3107 3108 3109 void AssertionNode::BacktrackIfPrevious( 3110 RegExpCompiler* compiler, 3111 Trace* trace, 3112 AssertionNode::IfPrevious backtrack_if_previous) { 3113 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3114 Trace new_trace(*trace); 3115 new_trace.InvalidateCurrentCharacter(); 3116 3117 Label fall_through, dummy; 3118 3119 Label* non_word = backtrack_if_previous == kIsNonWord ? 3120 new_trace.backtrack() : 3121 &fall_through; 3122 Label* word = backtrack_if_previous == kIsNonWord ? 3123 &fall_through : 3124 new_trace.backtrack(); 3125 3126 if (new_trace.cp_offset() == 0) { 3127 // The start of input counts as a non-word character, so the question is 3128 // decided if we are at the start. 3129 assembler->CheckAtStart(non_word); 3130 } 3131 // We already checked that we are not at the start of input so it must be 3132 // OK to load the previous character. 3133 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3134 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3135 3136 assembler->Bind(&fall_through); 3137 on_success()->Emit(compiler, &new_trace); 3138 } 3139 3140 3141 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3142 RegExpCompiler* compiler, 3143 int filled_in, 3144 bool not_at_start) { 3145 if (assertion_type_ == AT_START && not_at_start) { 3146 details->set_cannot_match(); 3147 return; 3148 } 3149 return on_success()->GetQuickCheckDetails(details, 3150 compiler, 3151 filled_in, 3152 not_at_start); 3153 } 3154 3155 3156 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3157 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3158 switch (assertion_type_) { 3159 case AT_END: { 3160 Label ok; 3161 assembler->CheckPosition(trace->cp_offset(), &ok); 3162 assembler->GoTo(trace->backtrack()); 3163 assembler->Bind(&ok); 3164 break; 3165 } 3166 case AT_START: { 3167 if (trace->at_start() == Trace::FALSE_VALUE) { 3168 assembler->GoTo(trace->backtrack()); 3169 return; 3170 } 3171 if (trace->at_start() == Trace::UNKNOWN) { 3172 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 3173 Trace at_start_trace = *trace; 3174 at_start_trace.set_at_start(Trace::TRUE_VALUE); 3175 on_success()->Emit(compiler, &at_start_trace); 3176 return; 3177 } 3178 } 3179 break; 3180 case AFTER_NEWLINE: 3181 EmitHat(compiler, on_success(), trace); 3182 return; 3183 case AT_BOUNDARY: 3184 case AT_NON_BOUNDARY: { 3185 EmitBoundaryCheck(compiler, trace); 3186 return; 3187 } 3188 } 3189 on_success()->Emit(compiler, trace); 3190 } 3191 3192 3193 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3194 if (quick_check == NULL) return false; 3195 if (offset >= quick_check->characters()) return false; 3196 return quick_check->positions(offset)->determines_perfectly; 3197 } 3198 3199 3200 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3201 if (index > *checked_up_to) { 3202 *checked_up_to = index; 3203 } 3204 } 3205 3206 3207 // We call this repeatedly to generate code for each pass over the text node. 3208 // The passes are in increasing order of difficulty because we hope one 3209 // of the first passes will fail in which case we are saved the work of the 3210 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3211 // we will check the '%' in the first pass, the case independent 'a' in the 3212 // second pass and the character class in the last pass. 3213 // 3214 // The passes are done from right to left, so for example to test for /bar/ 3215 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3216 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3217 // bounds check most of the time. In the example we only need to check for 3218 // end-of-input when loading the putative 'r'. 3219 // 3220 // A slight complication involves the fact that the first character may already 3221 // be fetched into a register by the previous node. In this case we want to 3222 // do the test for that character first. We do this in separate passes. The 3223 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3224 // pass has been performed then subsequent passes will have true in 3225 // first_element_checked to indicate that that character does not need to be 3226 // checked again. 3227 // 3228 // In addition to all this we are passed a Trace, which can 3229 // contain an AlternativeGeneration object. In this AlternativeGeneration 3230 // object we can see details of any quick check that was already passed in 3231 // order to get to the code we are now generating. The quick check can involve 3232 // loading characters, which means we do not need to recheck the bounds 3233 // up to the limit the quick check already checked. In addition the quick 3234 // check can have involved a mask and compare operation which may simplify 3235 // or obviate the need for further checks at some character positions. 3236 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3237 TextEmitPassType pass, 3238 bool preloaded, 3239 Trace* trace, 3240 bool first_element_checked, 3241 int* checked_up_to) { 3242 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3243 Isolate* isolate = assembler->isolate(); 3244 bool one_byte = compiler->one_byte(); 3245 Label* backtrack = trace->backtrack(); 3246 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3247 int element_count = elements()->length(); 3248 int backward_offset = read_backward() ? -Length() : 0; 3249 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3250 TextElement elm = elements()->at(i); 3251 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3252 if (elm.text_type() == TextElement::ATOM) { 3253 Vector<const uc16> quarks = elm.atom()->data(); 3254 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3255 if (first_element_checked && i == 0 && j == 0) continue; 3256 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3257 EmitCharacterFunction* emit_function = NULL; 3258 switch (pass) { 3259 case NON_LATIN1_MATCH: 3260 DCHECK(one_byte); 3261 if (quarks[j] > String::kMaxOneByteCharCode) { 3262 assembler->GoTo(backtrack); 3263 return; 3264 } 3265 break; 3266 case NON_LETTER_CHARACTER_MATCH: 3267 emit_function = &EmitAtomNonLetter; 3268 break; 3269 case SIMPLE_CHARACTER_MATCH: 3270 emit_function = &EmitSimpleCharacter; 3271 break; 3272 case CASE_CHARACTER_MATCH: 3273 emit_function = &EmitAtomLetter; 3274 break; 3275 default: 3276 break; 3277 } 3278 if (emit_function != NULL) { 3279 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3280 bool bound_checked = 3281 emit_function(isolate, compiler, quarks[j], backtrack, 3282 cp_offset + j, bounds_check, preloaded); 3283 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3284 } 3285 } 3286 } else { 3287 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3288 if (pass == CHARACTER_CLASS_MATCH) { 3289 if (first_element_checked && i == 0) continue; 3290 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3291 RegExpCharacterClass* cc = elm.char_class(); 3292 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3293 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 3294 bounds_check, preloaded, zone()); 3295 UpdateBoundsCheck(cp_offset, checked_up_to); 3296 } 3297 } 3298 } 3299 } 3300 3301 3302 int TextNode::Length() { 3303 TextElement elm = elements()->last(); 3304 DCHECK(elm.cp_offset() >= 0); 3305 return elm.cp_offset() + elm.length(); 3306 } 3307 3308 3309 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3310 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3311 if (ignore_case) { 3312 return pass == SIMPLE_CHARACTER_MATCH; 3313 } else { 3314 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3315 } 3316 } 3317 3318 3319 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 3320 ZoneList<CharacterRange>* ranges, 3321 bool read_backward, 3322 RegExpNode* on_success) { 3323 DCHECK_NOT_NULL(ranges); 3324 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); 3325 elms->Add( 3326 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)), 3327 zone); 3328 return new (zone) TextNode(elms, read_backward, on_success); 3329 } 3330 3331 3332 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, 3333 CharacterRange trail, 3334 bool read_backward, 3335 RegExpNode* on_success) { 3336 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 3337 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 3338 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); 3339 elms->Add(TextElement::CharClass( 3340 new (zone) RegExpCharacterClass(lead_ranges, false)), 3341 zone); 3342 elms->Add(TextElement::CharClass( 3343 new (zone) RegExpCharacterClass(trail_ranges, false)), 3344 zone); 3345 return new (zone) TextNode(elms, read_backward, on_success); 3346 } 3347 3348 3349 // This generates the code to match a text node. A text node can contain 3350 // straight character sequences (possibly to be matched in a case-independent 3351 // way) and character classes. For efficiency we do not do this in a single 3352 // pass from left to right. Instead we pass over the text node several times, 3353 // emitting code for some character positions every time. See the comment on 3354 // TextEmitPass for details. 3355 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3356 LimitResult limit_result = LimitVersions(compiler, trace); 3357 if (limit_result == DONE) return; 3358 DCHECK(limit_result == CONTINUE); 3359 3360 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3361 compiler->SetRegExpTooBig(); 3362 return; 3363 } 3364 3365 if (compiler->one_byte()) { 3366 int dummy = 0; 3367 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3368 } 3369 3370 bool first_elt_done = false; 3371 int bound_checked_to = trace->cp_offset() - 1; 3372 bound_checked_to += trace->bound_checked_up_to(); 3373 3374 // If a character is preloaded into the current character register then 3375 // check that now. 3376 if (trace->characters_preloaded() == 1) { 3377 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3378 if (!SkipPass(pass, compiler->ignore_case())) { 3379 TextEmitPass(compiler, 3380 static_cast<TextEmitPassType>(pass), 3381 true, 3382 trace, 3383 false, 3384 &bound_checked_to); 3385 } 3386 } 3387 first_elt_done = true; 3388 } 3389 3390 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3391 if (!SkipPass(pass, compiler->ignore_case())) { 3392 TextEmitPass(compiler, 3393 static_cast<TextEmitPassType>(pass), 3394 false, 3395 trace, 3396 first_elt_done, 3397 &bound_checked_to); 3398 } 3399 } 3400 3401 Trace successor_trace(*trace); 3402 // If we advance backward, we may end up at the start. 3403 successor_trace.AdvanceCurrentPositionInTrace( 3404 read_backward() ? -Length() : Length(), compiler); 3405 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 3406 : Trace::FALSE_VALUE); 3407 RecursionCheck rc(compiler); 3408 on_success()->Emit(compiler, &successor_trace); 3409 } 3410 3411 3412 void Trace::InvalidateCurrentCharacter() { 3413 characters_preloaded_ = 0; 3414 } 3415 3416 3417 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3418 // We don't have an instruction for shifting the current character register 3419 // down or for using a shifted value for anything so lets just forget that 3420 // we preloaded any characters into it. 3421 characters_preloaded_ = 0; 3422 // Adjust the offsets of the quick check performed information. This 3423 // information is used to find out what we already determined about the 3424 // characters by means of mask and compare. 3425 quick_check_performed_.Advance(by, compiler->one_byte()); 3426 cp_offset_ += by; 3427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3428 compiler->SetRegExpTooBig(); 3429 cp_offset_ = 0; 3430 } 3431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3432 } 3433 3434 3435 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { 3436 int element_count = elements()->length(); 3437 for (int i = 0; i < element_count; i++) { 3438 TextElement elm = elements()->at(i); 3439 if (elm.text_type() == TextElement::CHAR_CLASS) { 3440 RegExpCharacterClass* cc = elm.char_class(); 3441 // None of the standard character classes is different in the case 3442 // independent case and it slows us down if we don't know that. 3443 if (cc->is_standard(zone())) continue; 3444 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3445 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); 3446 } 3447 } 3448 } 3449 3450 3451 int TextNode::GreedyLoopTextLength() { return Length(); } 3452 3453 3454 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3455 RegExpCompiler* compiler) { 3456 if (read_backward()) return NULL; 3457 if (elements()->length() != 1) return NULL; 3458 TextElement elm = elements()->at(0); 3459 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3460 RegExpCharacterClass* node = elm.char_class(); 3461 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3462 CharacterRange::Canonicalize(ranges); 3463 if (node->is_negated()) { 3464 return ranges->length() == 0 ? on_success() : NULL; 3465 } 3466 if (ranges->length() != 1) return NULL; 3467 uint32_t max_char; 3468 if (compiler->one_byte()) { 3469 max_char = String::kMaxOneByteCharCode; 3470 } else { 3471 max_char = String::kMaxUtf16CodeUnit; 3472 } 3473 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3474 } 3475 3476 3477 // Finds the fixed match length of a sequence of nodes that goes from 3478 // this alternative and back to this choice node. If there are variable 3479 // length nodes or other complications in the way then return a sentinel 3480 // value indicating that a greedy loop cannot be constructed. 3481 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3482 GuardedAlternative* alternative) { 3483 int length = 0; 3484 RegExpNode* node = alternative->node(); 3485 // Later we will generate code for all these text nodes using recursion 3486 // so we have to limit the max number. 3487 int recursion_depth = 0; 3488 while (node != this) { 3489 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3490 return kNodeIsTooComplexForGreedyLoops; 3491 } 3492 int node_length = node->GreedyLoopTextLength(); 3493 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3494 return kNodeIsTooComplexForGreedyLoops; 3495 } 3496 length += node_length; 3497 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3498 node = seq_node->on_success(); 3499 } 3500 return read_backward() ? -length : length; 3501 } 3502 3503 3504 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3505 DCHECK_NULL(loop_node_); 3506 AddAlternative(alt); 3507 loop_node_ = alt.node(); 3508 } 3509 3510 3511 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3512 DCHECK_NULL(continue_node_); 3513 AddAlternative(alt); 3514 continue_node_ = alt.node(); 3515 } 3516 3517 3518 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3519 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3520 if (trace->stop_node() == this) { 3521 // Back edge of greedy optimized loop node graph. 3522 int text_length = 3523 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3524 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops); 3525 // Update the counter-based backtracking info on the stack. This is an 3526 // optimization for greedy loops (see below). 3527 DCHECK(trace->cp_offset() == text_length); 3528 macro_assembler->AdvanceCurrentPosition(text_length); 3529 macro_assembler->GoTo(trace->loop_label()); 3530 return; 3531 } 3532 DCHECK_NULL(trace->stop_node()); 3533 if (!trace->is_trivial()) { 3534 trace->Flush(compiler, this); 3535 return; 3536 } 3537 ChoiceNode::Emit(compiler, trace); 3538 } 3539 3540 3541 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3542 int eats_at_least) { 3543 int preload_characters = Min(4, eats_at_least); 3544 if (compiler->macro_assembler()->CanReadUnaligned()) { 3545 bool one_byte = compiler->one_byte(); 3546 if (one_byte) { 3547 if (preload_characters > 4) preload_characters = 4; 3548 // We can't preload 3 characters because there is no machine instruction 3549 // to do that. We can't just load 4 because we could be reading 3550 // beyond the end of the string, which could cause a memory fault. 3551 if (preload_characters == 3) preload_characters = 2; 3552 } else { 3553 if (preload_characters > 2) preload_characters = 2; 3554 } 3555 } else { 3556 if (preload_characters > 1) preload_characters = 1; 3557 } 3558 return preload_characters; 3559 } 3560 3561 3562 // This class is used when generating the alternatives in a choice node. It 3563 // records the way the alternative is being code generated. 3564 class AlternativeGeneration: public Malloced { 3565 public: 3566 AlternativeGeneration() 3567 : possible_success(), 3568 expects_preload(false), 3569 after(), 3570 quick_check_details() { } 3571 Label possible_success; 3572 bool expects_preload; 3573 Label after; 3574 QuickCheckDetails quick_check_details; 3575 }; 3576 3577 3578 // Creates a list of AlternativeGenerations. If the list has a reasonable 3579 // size then it is on the stack, otherwise the excess is on the heap. 3580 class AlternativeGenerationList { 3581 public: 3582 AlternativeGenerationList(int count, Zone* zone) 3583 : alt_gens_(count, zone) { 3584 for (int i = 0; i < count && i < kAFew; i++) { 3585 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3586 } 3587 for (int i = kAFew; i < count; i++) { 3588 alt_gens_.Add(new AlternativeGeneration(), zone); 3589 } 3590 } 3591 ~AlternativeGenerationList() { 3592 for (int i = kAFew; i < alt_gens_.length(); i++) { 3593 delete alt_gens_[i]; 3594 alt_gens_[i] = NULL; 3595 } 3596 } 3597 3598 AlternativeGeneration* at(int i) { 3599 return alt_gens_[i]; 3600 } 3601 3602 private: 3603 static const int kAFew = 10; 3604 ZoneList<AlternativeGeneration*> alt_gens_; 3605 AlternativeGeneration a_few_alt_gens_[kAFew]; 3606 }; 3607 3608 3609 static const uc32 kRangeEndMarker = 0x110000; 3610 3611 // The '2' variant is has inclusive from and exclusive to. 3612 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3613 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3614 static const int kSpaceRanges[] = { 3615 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 3616 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 3617 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; 3618 static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3619 3620 static const int kWordRanges[] = { 3621 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; 3622 static const int kWordRangeCount = arraysize(kWordRanges); 3623 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; 3624 static const int kDigitRangeCount = arraysize(kDigitRanges); 3625 static const int kSurrogateRanges[] = { 3626 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker}; 3627 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3628 static const int kLineTerminatorRanges[] = { 3629 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; 3630 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3631 3632 void BoyerMoorePositionInfo::Set(int character) { 3633 SetInterval(Interval(character, character)); 3634 } 3635 3636 3637 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3638 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3639 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3640 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3641 surrogate_ = 3642 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3643 if (interval.to() - interval.from() >= kMapSize - 1) { 3644 if (map_count_ != kMapSize) { 3645 map_count_ = kMapSize; 3646 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3647 } 3648 return; 3649 } 3650 for (int i = interval.from(); i <= interval.to(); i++) { 3651 int mod_character = (i & kMask); 3652 if (!map_->at(mod_character)) { 3653 map_count_++; 3654 map_->at(mod_character) = true; 3655 } 3656 if (map_count_ == kMapSize) return; 3657 } 3658 } 3659 3660 3661 void BoyerMoorePositionInfo::SetAll() { 3662 s_ = w_ = d_ = kLatticeUnknown; 3663 if (map_count_ != kMapSize) { 3664 map_count_ = kMapSize; 3665 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3666 } 3667 } 3668 3669 3670 BoyerMooreLookahead::BoyerMooreLookahead( 3671 int length, RegExpCompiler* compiler, Zone* zone) 3672 : length_(length), 3673 compiler_(compiler) { 3674 if (compiler->one_byte()) { 3675 max_char_ = String::kMaxOneByteCharCode; 3676 } else { 3677 max_char_ = String::kMaxUtf16CodeUnit; 3678 } 3679 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3680 for (int i = 0; i < length; i++) { 3681 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3682 } 3683 } 3684 3685 3686 // Find the longest range of lookahead that has the fewest number of different 3687 // characters that can occur at a given position. Since we are optimizing two 3688 // different parameters at once this is a tradeoff. 3689 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3690 int biggest_points = 0; 3691 // If more than 32 characters out of 128 can occur it is unlikely that we can 3692 // be lucky enough to step forwards much of the time. 3693 const int kMaxMax = 32; 3694 for (int max_number_of_chars = 4; 3695 max_number_of_chars < kMaxMax; 3696 max_number_of_chars *= 2) { 3697 biggest_points = 3698 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3699 } 3700 if (biggest_points == 0) return false; 3701 return true; 3702 } 3703 3704 3705 // Find the highest-points range between 0 and length_ where the character 3706 // information is not too vague. 'Too vague' means that there are more than 3707 // max_number_of_chars that can occur at this position. Calculates the number 3708 // of points as the product of width-of-the-range and 3709 // probability-of-finding-one-of-the-characters, where the probability is 3710 // calculated using the frequency distribution of the sample subject string. 3711 int BoyerMooreLookahead::FindBestInterval( 3712 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3713 int biggest_points = old_biggest_points; 3714 static const int kSize = RegExpMacroAssembler::kTableSize; 3715 for (int i = 0; i < length_; ) { 3716 while (i < length_ && Count(i) > max_number_of_chars) i++; 3717 if (i == length_) break; 3718 int remembered_from = i; 3719 bool union_map[kSize]; 3720 for (int j = 0; j < kSize; j++) union_map[j] = false; 3721 while (i < length_ && Count(i) <= max_number_of_chars) { 3722 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3723 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3724 i++; 3725 } 3726 int frequency = 0; 3727 for (int j = 0; j < kSize; j++) { 3728 if (union_map[j]) { 3729 // Add 1 to the frequency to give a small per-character boost for 3730 // the cases where our sampling is not good enough and many 3731 // characters have a frequency of zero. This means the frequency 3732 // can theoretically be up to 2*kSize though we treat it mostly as 3733 // a fraction of kSize. 3734 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3735 } 3736 } 3737 // We use the probability of skipping times the distance we are skipping to 3738 // judge the effectiveness of this. Actually we have a cut-off: By 3739 // dividing by 2 we switch off the skipping if the probability of skipping 3740 // is less than 50%. This is because the multibyte mask-and-compare 3741 // skipping in quickcheck is more likely to do well on this case. 3742 bool in_quickcheck_range = 3743 ((i - remembered_from < 4) || 3744 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 3745 // Called 'probability' but it is only a rough estimate and can actually 3746 // be outside the 0-kSize range. 3747 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3748 int points = (i - remembered_from) * probability; 3749 if (points > biggest_points) { 3750 *from = remembered_from; 3751 *to = i - 1; 3752 biggest_points = points; 3753 } 3754 } 3755 return biggest_points; 3756 } 3757 3758 3759 // Take all the characters that will not prevent a successful match if they 3760 // occur in the subject string in the range between min_lookahead and 3761 // max_lookahead (inclusive) measured from the current position. If the 3762 // character at max_lookahead offset is not one of these characters, then we 3763 // can safely skip forwards by the number of characters in the range. 3764 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3765 int max_lookahead, 3766 Handle<ByteArray> boolean_skip_table) { 3767 const int kSize = RegExpMacroAssembler::kTableSize; 3768 3769 const int kSkipArrayEntry = 0; 3770 const int kDontSkipArrayEntry = 1; 3771 3772 for (int i = 0; i < kSize; i++) { 3773 boolean_skip_table->set(i, kSkipArrayEntry); 3774 } 3775 int skip = max_lookahead + 1 - min_lookahead; 3776 3777 for (int i = max_lookahead; i >= min_lookahead; i--) { 3778 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3779 for (int j = 0; j < kSize; j++) { 3780 if (map->at(j)) { 3781 boolean_skip_table->set(j, kDontSkipArrayEntry); 3782 } 3783 } 3784 } 3785 3786 return skip; 3787 } 3788 3789 3790 // See comment above on the implementation of GetSkipTable. 3791 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3792 const int kSize = RegExpMacroAssembler::kTableSize; 3793 3794 int min_lookahead = 0; 3795 int max_lookahead = 0; 3796 3797 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3798 3799 bool found_single_character = false; 3800 int single_character = 0; 3801 for (int i = max_lookahead; i >= min_lookahead; i--) { 3802 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3803 if (map->map_count() > 1 || 3804 (found_single_character && map->map_count() != 0)) { 3805 found_single_character = false; 3806 break; 3807 } 3808 for (int j = 0; j < kSize; j++) { 3809 if (map->at(j)) { 3810 found_single_character = true; 3811 single_character = j; 3812 break; 3813 } 3814 } 3815 } 3816 3817 int lookahead_width = max_lookahead + 1 - min_lookahead; 3818 3819 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3820 // The mask-compare can probably handle this better. 3821 return; 3822 } 3823 3824 if (found_single_character) { 3825 Label cont, again; 3826 masm->Bind(&again); 3827 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3828 if (max_char_ > kSize) { 3829 masm->CheckCharacterAfterAnd(single_character, 3830 RegExpMacroAssembler::kTableMask, 3831 &cont); 3832 } else { 3833 masm->CheckCharacter(single_character, &cont); 3834 } 3835 masm->AdvanceCurrentPosition(lookahead_width); 3836 masm->GoTo(&again); 3837 masm->Bind(&cont); 3838 return; 3839 } 3840 3841 Factory* factory = masm->isolate()->factory(); 3842 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3843 int skip_distance = GetSkipTable( 3844 min_lookahead, max_lookahead, boolean_skip_table); 3845 DCHECK(skip_distance != 0); 3846 3847 Label cont, again; 3848 masm->Bind(&again); 3849 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3850 masm->CheckBitInTable(boolean_skip_table, &cont); 3851 masm->AdvanceCurrentPosition(skip_distance); 3852 masm->GoTo(&again); 3853 masm->Bind(&cont); 3854 } 3855 3856 3857 /* Code generation for choice nodes. 3858 * 3859 * We generate quick checks that do a mask and compare to eliminate a 3860 * choice. If the quick check succeeds then it jumps to the continuation to 3861 * do slow checks and check subsequent nodes. If it fails (the common case) 3862 * it falls through to the next choice. 3863 * 3864 * Here is the desired flow graph. Nodes directly below each other imply 3865 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3866 * 3 doesn't have a quick check so we have to call the slow check. 3867 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3868 * regexp continuation is generated directly after the Sn node, up to the 3869 * next GoTo if we decide to reuse some already generated code. Some 3870 * nodes expect preload_characters to be preloaded into the current 3871 * character register. R nodes do this preloading. Vertices are marked 3872 * F for failures and S for success (possible success in the case of quick 3873 * nodes). L, V, < and > are used as arrow heads. 3874 * 3875 * ----------> R 3876 * | 3877 * V 3878 * Q1 -----> S1 3879 * | S / 3880 * F| / 3881 * | F/ 3882 * | / 3883 * | R 3884 * | / 3885 * V L 3886 * Q2 -----> S2 3887 * | S / 3888 * F| / 3889 * | F/ 3890 * | / 3891 * | R 3892 * | / 3893 * V L 3894 * S3 3895 * | 3896 * F| 3897 * | 3898 * R 3899 * | 3900 * backtrack V 3901 * <----------Q4 3902 * \ F | 3903 * \ |S 3904 * \ F V 3905 * \-----S4 3906 * 3907 * For greedy loops we push the current position, then generate the code that 3908 * eats the input specially in EmitGreedyLoop. The other choice (the 3909 * continuation) is generated by the normal code in EmitChoices, and steps back 3910 * in the input to the starting position when it fails to match. The loop code 3911 * looks like this (U is the unwind code that steps back in the greedy loop). 3912 * 3913 * _____ 3914 * / \ 3915 * V | 3916 * ----------> S1 | 3917 * /| | 3918 * / |S | 3919 * F/ \_____/ 3920 * / 3921 * |<----- 3922 * | \ 3923 * V |S 3924 * Q2 ---> U----->backtrack 3925 * | F / 3926 * S| / 3927 * V F / 3928 * S2--/ 3929 */ 3930 3931 GreedyLoopState::GreedyLoopState(bool not_at_start) { 3932 counter_backtrack_trace_.set_backtrack(&label_); 3933 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3934 } 3935 3936 3937 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3938 #ifdef DEBUG 3939 int choice_count = alternatives_->length(); 3940 for (int i = 0; i < choice_count - 1; i++) { 3941 GuardedAlternative alternative = alternatives_->at(i); 3942 ZoneList<Guard*>* guards = alternative.guards(); 3943 int guard_count = (guards == NULL) ? 0 : guards->length(); 3944 for (int j = 0; j < guard_count; j++) { 3945 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3946 } 3947 } 3948 #endif 3949 } 3950 3951 3952 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, 3953 Trace* current_trace, 3954 PreloadState* state) { 3955 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3956 // Save some time by looking at most one machine word ahead. 3957 state->eats_at_least_ = 3958 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, 3959 current_trace->at_start() == Trace::FALSE_VALUE); 3960 } 3961 state->preload_characters_ = 3962 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3963 3964 state->preload_is_current_ = 3965 (current_trace->characters_preloaded() == state->preload_characters_); 3966 state->preload_has_checked_bounds_ = state->preload_is_current_; 3967 } 3968 3969 3970 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3971 int choice_count = alternatives_->length(); 3972 3973 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) { 3974 alternatives_->at(0).node()->Emit(compiler, trace); 3975 return; 3976 } 3977 3978 AssertGuardsMentionRegisters(trace); 3979 3980 LimitResult limit_result = LimitVersions(compiler, trace); 3981 if (limit_result == DONE) return; 3982 DCHECK(limit_result == CONTINUE); 3983 3984 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3985 // other choice nodes we only flush if we are out of code size budget. 3986 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3987 trace->Flush(compiler, this); 3988 return; 3989 } 3990 3991 RecursionCheck rc(compiler); 3992 3993 PreloadState preload; 3994 preload.init(); 3995 GreedyLoopState greedy_loop_state(not_at_start()); 3996 3997 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 3998 AlternativeGenerationList alt_gens(choice_count, zone()); 3999 4000 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 4001 trace = EmitGreedyLoop(compiler, 4002 trace, 4003 &alt_gens, 4004 &preload, 4005 &greedy_loop_state, 4006 text_length); 4007 } else { 4008 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 4009 // match the traces produced pre-cleanup. 4010 Label second_choice; 4011 compiler->macro_assembler()->Bind(&second_choice); 4012 4013 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 4014 4015 EmitChoices(compiler, 4016 &alt_gens, 4017 0, 4018 trace, 4019 &preload); 4020 } 4021 4022 // At this point we need to generate slow checks for the alternatives where 4023 // the quick check was inlined. We can recognize these because the associated 4024 // label was bound. 4025 int new_flush_budget = trace->flush_budget() / choice_count; 4026 for (int i = 0; i < choice_count; i++) { 4027 AlternativeGeneration* alt_gen = alt_gens.at(i); 4028 Trace new_trace(*trace); 4029 // If there are actions to be flushed we have to limit how many times 4030 // they are flushed. Take the budget of the parent trace and distribute 4031 // it fairly amongst the children. 4032 if (new_trace.actions() != NULL) { 4033 new_trace.set_flush_budget(new_flush_budget); 4034 } 4035 bool next_expects_preload = 4036 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 4037 EmitOutOfLineContinuation(compiler, 4038 &new_trace, 4039 alternatives_->at(i), 4040 alt_gen, 4041 preload.preload_characters_, 4042 next_expects_preload); 4043 } 4044 } 4045 4046 4047 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, 4048 Trace* trace, 4049 AlternativeGenerationList* alt_gens, 4050 PreloadState* preload, 4051 GreedyLoopState* greedy_loop_state, 4052 int text_length) { 4053 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4054 // Here we have special handling for greedy loops containing only text nodes 4055 // and other simple nodes. These are handled by pushing the current 4056 // position on the stack and then incrementing the current position each 4057 // time around the switch. On backtrack we decrement the current position 4058 // and check it against the pushed value. This avoids pushing backtrack 4059 // information for each iteration of the loop, which could take up a lot of 4060 // space. 4061 DCHECK(trace->stop_node() == NULL); 4062 macro_assembler->PushCurrentPosition(); 4063 Label greedy_match_failed; 4064 Trace greedy_match_trace; 4065 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); 4066 greedy_match_trace.set_backtrack(&greedy_match_failed); 4067 Label loop_label; 4068 macro_assembler->Bind(&loop_label); 4069 greedy_match_trace.set_stop_node(this); 4070 greedy_match_trace.set_loop_label(&loop_label); 4071 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 4072 macro_assembler->Bind(&greedy_match_failed); 4073 4074 Label second_choice; // For use in greedy matches. 4075 macro_assembler->Bind(&second_choice); 4076 4077 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 4078 4079 EmitChoices(compiler, 4080 alt_gens, 4081 1, 4082 new_trace, 4083 preload); 4084 4085 macro_assembler->Bind(greedy_loop_state->label()); 4086 // If we have unwound to the bottom then backtrack. 4087 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4088 // Otherwise try the second priority at an earlier position. 4089 macro_assembler->AdvanceCurrentPosition(-text_length); 4090 macro_assembler->GoTo(&second_choice); 4091 return new_trace; 4092 } 4093 4094 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 4095 Trace* trace) { 4096 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 4097 if (alternatives_->length() != 2) return eats_at_least; 4098 4099 GuardedAlternative alt1 = alternatives_->at(1); 4100 if (alt1.guards() != NULL && alt1.guards()->length() != 0) { 4101 return eats_at_least; 4102 } 4103 RegExpNode* eats_anything_node = alt1.node(); 4104 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 4105 return eats_at_least; 4106 } 4107 4108 // Really we should be creating a new trace when we execute this function, 4109 // but there is no need, because the code it generates cannot backtrack, and 4110 // we always arrive here with a trivial trace (since it's the entry to a 4111 // loop. That also implies that there are no preloaded characters, which is 4112 // good, because it means we won't be violating any assumptions by 4113 // overwriting those characters with new load instructions. 4114 DCHECK(trace->is_trivial()); 4115 4116 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4117 Isolate* isolate = macro_assembler->isolate(); 4118 // At this point we know that we are at a non-greedy loop that will eat 4119 // any character one at a time. Any non-anchored regexp has such a 4120 // loop prepended to it in order to find where it starts. We look for 4121 // a pattern of the form ...abc... where we can look 6 characters ahead 4122 // and step forwards 3 if the character is not one of abc. Abc need 4123 // not be atoms, they can be any reasonably limited character class or 4124 // small alternation. 4125 BoyerMooreLookahead* bm = bm_info(false); 4126 if (bm == NULL) { 4127 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4128 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4129 kRecursionBudget, 4130 false)); 4131 if (eats_at_least >= 1) { 4132 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4133 compiler, 4134 zone()); 4135 GuardedAlternative alt0 = alternatives_->at(0); 4136 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4137 } 4138 } 4139 if (bm != NULL) { 4140 bm->EmitSkipInstructions(macro_assembler); 4141 } 4142 return eats_at_least; 4143 } 4144 4145 4146 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4147 AlternativeGenerationList* alt_gens, 4148 int first_choice, 4149 Trace* trace, 4150 PreloadState* preload) { 4151 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4152 SetUpPreLoad(compiler, trace, preload); 4153 4154 // For now we just call all choices one after the other. The idea ultimately 4155 // is to use the Dispatch table to try only the relevant ones. 4156 int choice_count = alternatives_->length(); 4157 4158 int new_flush_budget = trace->flush_budget() / choice_count; 4159 4160 for (int i = first_choice; i < choice_count; i++) { 4161 bool is_last = i == choice_count - 1; 4162 bool fall_through_on_failure = !is_last; 4163 GuardedAlternative alternative = alternatives_->at(i); 4164 AlternativeGeneration* alt_gen = alt_gens->at(i); 4165 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 4166 ZoneList<Guard*>* guards = alternative.guards(); 4167 int guard_count = (guards == NULL) ? 0 : guards->length(); 4168 Trace new_trace(*trace); 4169 new_trace.set_characters_preloaded(preload->preload_is_current_ ? 4170 preload->preload_characters_ : 4171 0); 4172 if (preload->preload_has_checked_bounds_) { 4173 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4174 } 4175 new_trace.quick_check_performed()->Clear(); 4176 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4177 if (!is_last) { 4178 new_trace.set_backtrack(&alt_gen->after); 4179 } 4180 alt_gen->expects_preload = preload->preload_is_current_; 4181 bool generate_full_check_inline = false; 4182 if (compiler->optimize() && 4183 try_to_emit_quick_check_for_alternative(i == 0) && 4184 alternative.node()->EmitQuickCheck( 4185 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 4186 &alt_gen->possible_success, &alt_gen->quick_check_details, 4187 fall_through_on_failure)) { 4188 // Quick check was generated for this choice. 4189 preload->preload_is_current_ = true; 4190 preload->preload_has_checked_bounds_ = true; 4191 // If we generated the quick check to fall through on possible success, 4192 // we now need to generate the full check inline. 4193 if (!fall_through_on_failure) { 4194 macro_assembler->Bind(&alt_gen->possible_success); 4195 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4196 new_trace.set_characters_preloaded(preload->preload_characters_); 4197 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4198 generate_full_check_inline = true; 4199 } 4200 } else if (alt_gen->quick_check_details.cannot_match()) { 4201 if (!fall_through_on_failure) { 4202 macro_assembler->GoTo(trace->backtrack()); 4203 } 4204 continue; 4205 } else { 4206 // No quick check was generated. Put the full code here. 4207 // If this is not the first choice then there could be slow checks from 4208 // previous cases that go here when they fail. There's no reason to 4209 // insist that they preload characters since the slow check we are about 4210 // to generate probably can't use it. 4211 if (i != first_choice) { 4212 alt_gen->expects_preload = false; 4213 new_trace.InvalidateCurrentCharacter(); 4214 } 4215 generate_full_check_inline = true; 4216 } 4217 if (generate_full_check_inline) { 4218 if (new_trace.actions() != NULL) { 4219 new_trace.set_flush_budget(new_flush_budget); 4220 } 4221 for (int j = 0; j < guard_count; j++) { 4222 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4223 } 4224 alternative.node()->Emit(compiler, &new_trace); 4225 preload->preload_is_current_ = false; 4226 } 4227 macro_assembler->Bind(&alt_gen->after); 4228 } 4229 } 4230 4231 4232 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4233 Trace* trace, 4234 GuardedAlternative alternative, 4235 AlternativeGeneration* alt_gen, 4236 int preload_characters, 4237 bool next_expects_preload) { 4238 if (!alt_gen->possible_success.is_linked()) return; 4239 4240 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4241 macro_assembler->Bind(&alt_gen->possible_success); 4242 Trace out_of_line_trace(*trace); 4243 out_of_line_trace.set_characters_preloaded(preload_characters); 4244 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4245 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4246 ZoneList<Guard*>* guards = alternative.guards(); 4247 int guard_count = (guards == NULL) ? 0 : guards->length(); 4248 if (next_expects_preload) { 4249 Label reload_current_char; 4250 out_of_line_trace.set_backtrack(&reload_current_char); 4251 for (int j = 0; j < guard_count; j++) { 4252 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4253 } 4254 alternative.node()->Emit(compiler, &out_of_line_trace); 4255 macro_assembler->Bind(&reload_current_char); 4256 // Reload the current character, since the next quick check expects that. 4257 // We don't need to check bounds here because we only get into this 4258 // code through a quick check which already did the checked load. 4259 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4260 NULL, 4261 false, 4262 preload_characters); 4263 macro_assembler->GoTo(&(alt_gen->after)); 4264 } else { 4265 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4266 for (int j = 0; j < guard_count; j++) { 4267 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4268 } 4269 alternative.node()->Emit(compiler, &out_of_line_trace); 4270 } 4271 } 4272 4273 4274 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4275 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4276 LimitResult limit_result = LimitVersions(compiler, trace); 4277 if (limit_result == DONE) return; 4278 DCHECK(limit_result == CONTINUE); 4279 4280 RecursionCheck rc(compiler); 4281 4282 switch (action_type_) { 4283 case STORE_POSITION: { 4284 Trace::DeferredCapture 4285 new_capture(data_.u_position_register.reg, 4286 data_.u_position_register.is_capture, 4287 trace); 4288 Trace new_trace = *trace; 4289 new_trace.add_action(&new_capture); 4290 on_success()->Emit(compiler, &new_trace); 4291 break; 4292 } 4293 case INCREMENT_REGISTER: { 4294 Trace::DeferredIncrementRegister 4295 new_increment(data_.u_increment_register.reg); 4296 Trace new_trace = *trace; 4297 new_trace.add_action(&new_increment); 4298 on_success()->Emit(compiler, &new_trace); 4299 break; 4300 } 4301 case SET_REGISTER: { 4302 Trace::DeferredSetRegister 4303 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4304 Trace new_trace = *trace; 4305 new_trace.add_action(&new_set); 4306 on_success()->Emit(compiler, &new_trace); 4307 break; 4308 } 4309 case CLEAR_CAPTURES: { 4310 Trace::DeferredClearCaptures 4311 new_capture(Interval(data_.u_clear_captures.range_from, 4312 data_.u_clear_captures.range_to)); 4313 Trace new_trace = *trace; 4314 new_trace.add_action(&new_capture); 4315 on_success()->Emit(compiler, &new_trace); 4316 break; 4317 } 4318 case BEGIN_SUBMATCH: 4319 if (!trace->is_trivial()) { 4320 trace->Flush(compiler, this); 4321 } else { 4322 assembler->WriteCurrentPositionToRegister( 4323 data_.u_submatch.current_position_register, 0); 4324 assembler->WriteStackPointerToRegister( 4325 data_.u_submatch.stack_pointer_register); 4326 on_success()->Emit(compiler, trace); 4327 } 4328 break; 4329 case EMPTY_MATCH_CHECK: { 4330 int start_pos_reg = data_.u_empty_match_check.start_register; 4331 int stored_pos = 0; 4332 int rep_reg = data_.u_empty_match_check.repetition_register; 4333 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4334 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4335 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4336 // If we know we haven't advanced and there is no minimum we 4337 // can just backtrack immediately. 4338 assembler->GoTo(trace->backtrack()); 4339 } else if (know_dist && stored_pos < trace->cp_offset()) { 4340 // If we know we've advanced we can generate the continuation 4341 // immediately. 4342 on_success()->Emit(compiler, trace); 4343 } else if (!trace->is_trivial()) { 4344 trace->Flush(compiler, this); 4345 } else { 4346 Label skip_empty_check; 4347 // If we have a minimum number of repetitions we check the current 4348 // number first and skip the empty check if it's not enough. 4349 if (has_minimum) { 4350 int limit = data_.u_empty_match_check.repetition_limit; 4351 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4352 } 4353 // If the match is empty we bail out, otherwise we fall through 4354 // to the on-success continuation. 4355 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4356 trace->backtrack()); 4357 assembler->Bind(&skip_empty_check); 4358 on_success()->Emit(compiler, trace); 4359 } 4360 break; 4361 } 4362 case POSITIVE_SUBMATCH_SUCCESS: { 4363 if (!trace->is_trivial()) { 4364 trace->Flush(compiler, this); 4365 return; 4366 } 4367 assembler->ReadCurrentPositionFromRegister( 4368 data_.u_submatch.current_position_register); 4369 assembler->ReadStackPointerFromRegister( 4370 data_.u_submatch.stack_pointer_register); 4371 int clear_register_count = data_.u_submatch.clear_register_count; 4372 if (clear_register_count == 0) { 4373 on_success()->Emit(compiler, trace); 4374 return; 4375 } 4376 int clear_registers_from = data_.u_submatch.clear_register_from; 4377 Label clear_registers_backtrack; 4378 Trace new_trace = *trace; 4379 new_trace.set_backtrack(&clear_registers_backtrack); 4380 on_success()->Emit(compiler, &new_trace); 4381 4382 assembler->Bind(&clear_registers_backtrack); 4383 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4384 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4385 4386 DCHECK(trace->backtrack() == NULL); 4387 assembler->Backtrack(); 4388 return; 4389 } 4390 default: 4391 UNREACHABLE(); 4392 } 4393 } 4394 4395 4396 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4397 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4398 if (!trace->is_trivial()) { 4399 trace->Flush(compiler, this); 4400 return; 4401 } 4402 4403 LimitResult limit_result = LimitVersions(compiler, trace); 4404 if (limit_result == DONE) return; 4405 DCHECK(limit_result == CONTINUE); 4406 4407 RecursionCheck rc(compiler); 4408 4409 DCHECK_EQ(start_reg_ + 1, end_reg_); 4410 if (compiler->ignore_case()) { 4411 assembler->CheckNotBackReferenceIgnoreCase( 4412 start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); 4413 } else { 4414 assembler->CheckNotBackReference(start_reg_, read_backward(), 4415 trace->backtrack()); 4416 } 4417 // We are going to advance backward, so we may end up at the start. 4418 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 4419 4420 // Check that the back reference does not end inside a surrogate pair. 4421 if (compiler->unicode() && !compiler->one_byte()) { 4422 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); 4423 } 4424 on_success()->Emit(compiler, trace); 4425 } 4426 4427 4428 // ------------------------------------------------------------------- 4429 // Dot/dotty output 4430 4431 4432 #ifdef DEBUG 4433 4434 4435 class DotPrinter: public NodeVisitor { 4436 public: 4437 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT 4438 : os_(os), 4439 ignore_case_(ignore_case) {} 4440 void PrintNode(const char* label, RegExpNode* node); 4441 void Visit(RegExpNode* node); 4442 void PrintAttributes(RegExpNode* from); 4443 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4444 #define DECLARE_VISIT(Type) \ 4445 virtual void Visit##Type(Type##Node* that); 4446 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4447 #undef DECLARE_VISIT 4448 private: 4449 std::ostream& os_; 4450 bool ignore_case_; 4451 }; 4452 4453 4454 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4455 os_ << "digraph G {\n graph [label=\""; 4456 for (int i = 0; label[i]; i++) { 4457 switch (label[i]) { 4458 case '\\': 4459 os_ << "\\\\"; 4460 break; 4461 case '"': 4462 os_ << "\""; 4463 break; 4464 default: 4465 os_ << label[i]; 4466 break; 4467 } 4468 } 4469 os_ << "\"];\n"; 4470 Visit(node); 4471 os_ << "}" << std::endl; 4472 } 4473 4474 4475 void DotPrinter::Visit(RegExpNode* node) { 4476 if (node->info()->visited) return; 4477 node->info()->visited = true; 4478 node->Accept(this); 4479 } 4480 4481 4482 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4483 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; 4484 Visit(on_failure); 4485 } 4486 4487 4488 class TableEntryBodyPrinter { 4489 public: 4490 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT 4491 : os_(os), 4492 choice_(choice) {} 4493 void Call(uc16 from, DispatchTable::Entry entry) { 4494 OutSet* out_set = entry.out_set(); 4495 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4496 if (out_set->Get(i)) { 4497 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n" 4498 << choice()->alternatives()->at(i).node() << ";\n"; 4499 } 4500 } 4501 } 4502 private: 4503 ChoiceNode* choice() { return choice_; } 4504 std::ostream& os_; 4505 ChoiceNode* choice_; 4506 }; 4507 4508 4509 class TableEntryHeaderPrinter { 4510 public: 4511 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT 4512 : first_(true), 4513 os_(os) {} 4514 void Call(uc16 from, DispatchTable::Entry entry) { 4515 if (first_) { 4516 first_ = false; 4517 } else { 4518 os_ << "|"; 4519 } 4520 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{"; 4521 OutSet* out_set = entry.out_set(); 4522 int priority = 0; 4523 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4524 if (out_set->Get(i)) { 4525 if (priority > 0) os_ << "|"; 4526 os_ << "<s" << from << "o" << i << "> " << priority; 4527 priority++; 4528 } 4529 } 4530 os_ << "}}"; 4531 } 4532 4533 private: 4534 bool first_; 4535 std::ostream& os_; 4536 }; 4537 4538 4539 class AttributePrinter { 4540 public: 4541 explicit AttributePrinter(std::ostream& os) // NOLINT 4542 : os_(os), 4543 first_(true) {} 4544 void PrintSeparator() { 4545 if (first_) { 4546 first_ = false; 4547 } else { 4548 os_ << "|"; 4549 } 4550 } 4551 void PrintBit(const char* name, bool value) { 4552 if (!value) return; 4553 PrintSeparator(); 4554 os_ << "{" << name << "}"; 4555 } 4556 void PrintPositive(const char* name, int value) { 4557 if (value < 0) return; 4558 PrintSeparator(); 4559 os_ << "{" << name << "|" << value << "}"; 4560 } 4561 4562 private: 4563 std::ostream& os_; 4564 bool first_; 4565 }; 4566 4567 4568 void DotPrinter::PrintAttributes(RegExpNode* that) { 4569 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " 4570 << "margin=0.1, fontsize=10, label=\"{"; 4571 AttributePrinter printer(os_); 4572 NodeInfo* info = that->info(); 4573 printer.PrintBit("NI", info->follows_newline_interest); 4574 printer.PrintBit("WI", info->follows_word_interest); 4575 printer.PrintBit("SI", info->follows_start_interest); 4576 Label* label = that->label(); 4577 if (label->is_bound()) 4578 printer.PrintPositive("@", label->pos()); 4579 os_ << "}\"];\n" 4580 << " a" << that << " -> n" << that 4581 << " [style=dashed, color=grey, arrowhead=none];\n"; 4582 } 4583 4584 4585 static const bool kPrintDispatchTable = false; 4586 void DotPrinter::VisitChoice(ChoiceNode* that) { 4587 if (kPrintDispatchTable) { 4588 os_ << " n" << that << " [shape=Mrecord, label=\""; 4589 TableEntryHeaderPrinter header_printer(os_); 4590 that->GetTable(ignore_case_)->ForEach(&header_printer); 4591 os_ << "\"]\n"; 4592 PrintAttributes(that); 4593 TableEntryBodyPrinter body_printer(os_, that); 4594 that->GetTable(ignore_case_)->ForEach(&body_printer); 4595 } else { 4596 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; 4597 for (int i = 0; i < that->alternatives()->length(); i++) { 4598 GuardedAlternative alt = that->alternatives()->at(i); 4599 os_ << " n" << that << " -> n" << alt.node(); 4600 } 4601 } 4602 for (int i = 0; i < that->alternatives()->length(); i++) { 4603 GuardedAlternative alt = that->alternatives()->at(i); 4604 alt.node()->Accept(this); 4605 } 4606 } 4607 4608 4609 void DotPrinter::VisitText(TextNode* that) { 4610 Zone* zone = that->zone(); 4611 os_ << " n" << that << " [label=\""; 4612 for (int i = 0; i < that->elements()->length(); i++) { 4613 if (i > 0) os_ << " "; 4614 TextElement elm = that->elements()->at(i); 4615 switch (elm.text_type()) { 4616 case TextElement::ATOM: { 4617 Vector<const uc16> data = elm.atom()->data(); 4618 for (int i = 0; i < data.length(); i++) { 4619 os_ << static_cast<char>(data[i]); 4620 } 4621 break; 4622 } 4623 case TextElement::CHAR_CLASS: { 4624 RegExpCharacterClass* node = elm.char_class(); 4625 os_ << "["; 4626 if (node->is_negated()) os_ << "^"; 4627 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4628 CharacterRange range = node->ranges(zone)->at(j); 4629 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); 4630 } 4631 os_ << "]"; 4632 break; 4633 } 4634 default: 4635 UNREACHABLE(); 4636 } 4637 } 4638 os_ << "\", shape=box, peripheries=2];\n"; 4639 PrintAttributes(that); 4640 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4641 Visit(that->on_success()); 4642 } 4643 4644 4645 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4646 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" 4647 << that->end_register() << "\", shape=doubleoctagon];\n"; 4648 PrintAttributes(that); 4649 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4650 Visit(that->on_success()); 4651 } 4652 4653 4654 void DotPrinter::VisitEnd(EndNode* that) { 4655 os_ << " n" << that << " [style=bold, shape=point];\n"; 4656 PrintAttributes(that); 4657 } 4658 4659 4660 void DotPrinter::VisitAssertion(AssertionNode* that) { 4661 os_ << " n" << that << " ["; 4662 switch (that->assertion_type()) { 4663 case AssertionNode::AT_END: 4664 os_ << "label=\"$\", shape=septagon"; 4665 break; 4666 case AssertionNode::AT_START: 4667 os_ << "label=\"^\", shape=septagon"; 4668 break; 4669 case AssertionNode::AT_BOUNDARY: 4670 os_ << "label=\"\\b\", shape=septagon"; 4671 break; 4672 case AssertionNode::AT_NON_BOUNDARY: 4673 os_ << "label=\"\\B\", shape=septagon"; 4674 break; 4675 case AssertionNode::AFTER_NEWLINE: 4676 os_ << "label=\"(?<=\\n)\", shape=septagon"; 4677 break; 4678 } 4679 os_ << "];\n"; 4680 PrintAttributes(that); 4681 RegExpNode* successor = that->on_success(); 4682 os_ << " n" << that << " -> n" << successor << ";\n"; 4683 Visit(successor); 4684 } 4685 4686 4687 void DotPrinter::VisitAction(ActionNode* that) { 4688 os_ << " n" << that << " ["; 4689 switch (that->action_type_) { 4690 case ActionNode::SET_REGISTER: 4691 os_ << "label=\"$" << that->data_.u_store_register.reg 4692 << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; 4693 break; 4694 case ActionNode::INCREMENT_REGISTER: 4695 os_ << "label=\"$" << that->data_.u_increment_register.reg 4696 << "++\", shape=octagon"; 4697 break; 4698 case ActionNode::STORE_POSITION: 4699 os_ << "label=\"$" << that->data_.u_position_register.reg 4700 << ":=$pos\", shape=octagon"; 4701 break; 4702 case ActionNode::BEGIN_SUBMATCH: 4703 os_ << "label=\"$" << that->data_.u_submatch.current_position_register 4704 << ":=$pos,begin\", shape=septagon"; 4705 break; 4706 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4707 os_ << "label=\"escape\", shape=septagon"; 4708 break; 4709 case ActionNode::EMPTY_MATCH_CHECK: 4710 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register 4711 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register 4712 << "<" << that->data_.u_empty_match_check.repetition_limit 4713 << "?\", shape=septagon"; 4714 break; 4715 case ActionNode::CLEAR_CAPTURES: { 4716 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from 4717 << " to $" << that->data_.u_clear_captures.range_to 4718 << "\", shape=septagon"; 4719 break; 4720 } 4721 } 4722 os_ << "];\n"; 4723 PrintAttributes(that); 4724 RegExpNode* successor = that->on_success(); 4725 os_ << " n" << that << " -> n" << successor << ";\n"; 4726 Visit(successor); 4727 } 4728 4729 4730 class DispatchTableDumper { 4731 public: 4732 explicit DispatchTableDumper(std::ostream& os) : os_(os) {} 4733 void Call(uc16 key, DispatchTable::Entry entry); 4734 private: 4735 std::ostream& os_; 4736 }; 4737 4738 4739 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4740 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {"; 4741 OutSet* set = entry.out_set(); 4742 bool first = true; 4743 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4744 if (set->Get(i)) { 4745 if (first) { 4746 first = false; 4747 } else { 4748 os_ << ", "; 4749 } 4750 os_ << i; 4751 } 4752 } 4753 os_ << "}\n"; 4754 } 4755 4756 4757 void DispatchTable::Dump() { 4758 OFStream os(stderr); 4759 DispatchTableDumper dumper(os); 4760 tree()->ForEach(&dumper); 4761 } 4762 4763 4764 void RegExpEngine::DotPrint(const char* label, 4765 RegExpNode* node, 4766 bool ignore_case) { 4767 OFStream os(stdout); 4768 DotPrinter printer(os, ignore_case); 4769 printer.PrintNode(label, node); 4770 } 4771 4772 4773 #endif // DEBUG 4774 4775 4776 // ------------------------------------------------------------------- 4777 // Tree to graph conversion 4778 4779 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4780 RegExpNode* on_success) { 4781 ZoneList<TextElement>* elms = 4782 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4783 elms->Add(TextElement::Atom(this), compiler->zone()); 4784 return new (compiler->zone()) 4785 TextNode(elms, compiler->read_backward(), on_success); 4786 } 4787 4788 4789 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4790 RegExpNode* on_success) { 4791 return new (compiler->zone()) 4792 TextNode(elements(), compiler->read_backward(), on_success); 4793 } 4794 4795 4796 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4797 const int* special_class, 4798 int length) { 4799 length--; // Remove final marker. 4800 DCHECK(special_class[length] == kRangeEndMarker); 4801 DCHECK(ranges->length() != 0); 4802 DCHECK(length != 0); 4803 DCHECK(special_class[0] != 0); 4804 if (ranges->length() != (length >> 1) + 1) { 4805 return false; 4806 } 4807 CharacterRange range = ranges->at(0); 4808 if (range.from() != 0) { 4809 return false; 4810 } 4811 for (int i = 0; i < length; i += 2) { 4812 if (special_class[i] != (range.to() + 1)) { 4813 return false; 4814 } 4815 range = ranges->at((i >> 1) + 1); 4816 if (special_class[i+1] != range.from()) { 4817 return false; 4818 } 4819 } 4820 if (range.to() != String::kMaxCodePoint) { 4821 return false; 4822 } 4823 return true; 4824 } 4825 4826 4827 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4828 const int* special_class, 4829 int length) { 4830 length--; // Remove final marker. 4831 DCHECK(special_class[length] == kRangeEndMarker); 4832 if (ranges->length() * 2 != length) { 4833 return false; 4834 } 4835 for (int i = 0; i < length; i += 2) { 4836 CharacterRange range = ranges->at(i >> 1); 4837 if (range.from() != special_class[i] || 4838 range.to() != special_class[i + 1] - 1) { 4839 return false; 4840 } 4841 } 4842 return true; 4843 } 4844 4845 4846 bool RegExpCharacterClass::is_standard(Zone* zone) { 4847 // TODO(lrn): Remove need for this function, by not throwing away information 4848 // along the way. 4849 if (is_negated_) { 4850 return false; 4851 } 4852 if (set_.is_standard()) { 4853 return true; 4854 } 4855 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4856 set_.set_standard_set_type('s'); 4857 return true; 4858 } 4859 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4860 set_.set_standard_set_type('S'); 4861 return true; 4862 } 4863 if (CompareInverseRanges(set_.ranges(zone), 4864 kLineTerminatorRanges, 4865 kLineTerminatorRangeCount)) { 4866 set_.set_standard_set_type('.'); 4867 return true; 4868 } 4869 if (CompareRanges(set_.ranges(zone), 4870 kLineTerminatorRanges, 4871 kLineTerminatorRangeCount)) { 4872 set_.set_standard_set_type('n'); 4873 return true; 4874 } 4875 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4876 set_.set_standard_set_type('w'); 4877 return true; 4878 } 4879 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4880 set_.set_standard_set_type('W'); 4881 return true; 4882 } 4883 return false; 4884 } 4885 4886 4887 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, 4888 ZoneList<CharacterRange>* base) 4889 : zone_(zone), 4890 table_(zone), 4891 bmp_(nullptr), 4892 lead_surrogates_(nullptr), 4893 trail_surrogates_(nullptr), 4894 non_bmp_(nullptr) { 4895 // The unicode range splitter categorizes given character ranges into: 4896 // - Code points from the BMP representable by one code unit. 4897 // - Code points outside the BMP that need to be split into surrogate pairs. 4898 // - Lone lead surrogates. 4899 // - Lone trail surrogates. 4900 // Lone surrogates are valid code points, even though no actual characters. 4901 // They require special matching to make sure we do not split surrogate pairs. 4902 // We use the dispatch table to accomplish this. The base range is split up 4903 // by the table by the overlay ranges, and the Call callback is used to 4904 // filter and collect ranges for each category. 4905 for (int i = 0; i < base->length(); i++) { 4906 table_.AddRange(base->at(i), kBase, zone_); 4907 } 4908 // Add overlay ranges. 4909 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1), 4910 kBmpCodePoints, zone_); 4911 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), 4912 kLeadSurrogates, zone_); 4913 table_.AddRange( 4914 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 4915 kTrailSurrogates, zone_); 4916 table_.AddRange( 4917 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1), 4918 kBmpCodePoints, zone_); 4919 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd), 4920 kNonBmpCodePoints, zone_); 4921 table_.ForEach(this); 4922 } 4923 4924 4925 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { 4926 OutSet* outset = entry.out_set(); 4927 if (!outset->Get(kBase)) return; 4928 ZoneList<CharacterRange>** target = NULL; 4929 if (outset->Get(kBmpCodePoints)) { 4930 target = &bmp_; 4931 } else if (outset->Get(kLeadSurrogates)) { 4932 target = &lead_surrogates_; 4933 } else if (outset->Get(kTrailSurrogates)) { 4934 target = &trail_surrogates_; 4935 } else { 4936 DCHECK(outset->Get(kNonBmpCodePoints)); 4937 target = &non_bmp_; 4938 } 4939 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_); 4940 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); 4941 } 4942 4943 4944 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 4945 RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 4946 ZoneList<CharacterRange>* bmp = splitter->bmp(); 4947 if (bmp == nullptr) return; 4948 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 4949 compiler->zone(), bmp, compiler->read_backward(), on_success))); 4950 } 4951 4952 4953 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 4954 RegExpNode* on_success, 4955 UnicodeRangeSplitter* splitter) { 4956 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp(); 4957 if (non_bmp == nullptr) return; 4958 DCHECK(compiler->unicode()); 4959 DCHECK(!compiler->one_byte()); 4960 Zone* zone = compiler->zone(); 4961 CharacterRange::Canonicalize(non_bmp); 4962 for (int i = 0; i < non_bmp->length(); i++) { 4963 // Match surrogate pair. 4964 // E.g. [\u10005-\u11005] becomes 4965 // \ud800[\udc05-\udfff]| 4966 // [\ud801-\ud803][\udc00-\udfff]| 4967 // \ud804[\udc00-\udc05] 4968 uc32 from = non_bmp->at(i).from(); 4969 uc32 to = non_bmp->at(i).to(); 4970 uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 4971 uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 4972 uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 4973 uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 4974 if (from_l == to_l) { 4975 // The lead surrogate is the same. 4976 result->AddAlternative( 4977 GuardedAlternative(TextNode::CreateForSurrogatePair( 4978 zone, CharacterRange::Singleton(from_l), 4979 CharacterRange::Range(from_t, to_t), compiler->read_backward(), 4980 on_success))); 4981 } else { 4982 if (from_t != kTrailSurrogateStart) { 4983 // Add [from_l][from_t-\udfff] 4984 result->AddAlternative( 4985 GuardedAlternative(TextNode::CreateForSurrogatePair( 4986 zone, CharacterRange::Singleton(from_l), 4987 CharacterRange::Range(from_t, kTrailSurrogateEnd), 4988 compiler->read_backward(), on_success))); 4989 from_l++; 4990 } 4991 if (to_t != kTrailSurrogateEnd) { 4992 // Add [to_l][\udc00-to_t] 4993 result->AddAlternative( 4994 GuardedAlternative(TextNode::CreateForSurrogatePair( 4995 zone, CharacterRange::Singleton(to_l), 4996 CharacterRange::Range(kTrailSurrogateStart, to_t), 4997 compiler->read_backward(), on_success))); 4998 to_l--; 4999 } 5000 if (from_l <= to_l) { 5001 // Add [from_l-to_l][\udc00-\udfff] 5002 result->AddAlternative( 5003 GuardedAlternative(TextNode::CreateForSurrogatePair( 5004 zone, CharacterRange::Range(from_l, to_l), 5005 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 5006 compiler->read_backward(), on_success))); 5007 } 5008 } 5009 } 5010 } 5011 5012 5013 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 5014 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 5015 ZoneList<CharacterRange>* match, RegExpNode* on_success, 5016 bool read_backward) { 5017 Zone* zone = compiler->zone(); 5018 RegExpNode* match_node = TextNode::CreateForCharacterRanges( 5019 zone, match, read_backward, on_success); 5020 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5021 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5022 RegExpLookaround::Builder lookaround(false, match_node, stack_register, 5023 position_register); 5024 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5025 zone, lookbehind, !read_backward, lookaround.on_match_success()); 5026 return lookaround.ForMatch(negative_match); 5027 } 5028 5029 5030 RegExpNode* MatchAndNegativeLookaroundInReadDirection( 5031 RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 5032 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 5033 bool read_backward) { 5034 Zone* zone = compiler->zone(); 5035 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5036 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5037 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 5038 position_register); 5039 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5040 zone, lookahead, read_backward, lookaround.on_match_success()); 5041 return TextNode::CreateForCharacterRanges( 5042 zone, match, read_backward, lookaround.ForMatch(negative_match)); 5043 } 5044 5045 5046 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5047 RegExpNode* on_success, 5048 UnicodeRangeSplitter* splitter) { 5049 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates(); 5050 if (lead_surrogates == nullptr) return; 5051 Zone* zone = compiler->zone(); 5052 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 5053 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 5054 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 5055 5056 RegExpNode* match; 5057 if (compiler->read_backward()) { 5058 // Reading backward. Assert that reading forward, there is no trail 5059 // surrogate, and then backward match the lead surrogate. 5060 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5061 compiler, trail_surrogates, lead_surrogates, on_success, true); 5062 } else { 5063 // Reading forward. Forward match the lead surrogate and assert that 5064 // no trail surrogate follows. 5065 match = MatchAndNegativeLookaroundInReadDirection( 5066 compiler, lead_surrogates, trail_surrogates, on_success, false); 5067 } 5068 result->AddAlternative(GuardedAlternative(match)); 5069 } 5070 5071 5072 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5073 RegExpNode* on_success, 5074 UnicodeRangeSplitter* splitter) { 5075 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates(); 5076 if (trail_surrogates == nullptr) return; 5077 Zone* zone = compiler->zone(); 5078 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 5079 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 5080 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 5081 5082 RegExpNode* match; 5083 if (compiler->read_backward()) { 5084 // Reading backward. Backward match the trail surrogate and assert that no 5085 // lead surrogate precedes it. 5086 match = MatchAndNegativeLookaroundInReadDirection( 5087 compiler, trail_surrogates, lead_surrogates, on_success, true); 5088 } else { 5089 // Reading forward. Assert that reading backward, there is no lead 5090 // surrogate, and then forward match the trail surrogate. 5091 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5092 compiler, lead_surrogates, trail_surrogates, on_success, false); 5093 } 5094 result->AddAlternative(GuardedAlternative(match)); 5095 } 5096 5097 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 5098 RegExpNode* on_success) { 5099 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 5100 DCHECK(!compiler->read_backward()); 5101 Zone* zone = compiler->zone(); 5102 // Advance any character. If the character happens to be a lead surrogate and 5103 // we advanced into the middle of a surrogate pair, it will work out, as 5104 // nothing will match from there. We will have to advance again, consuming 5105 // the associated trail surrogate. 5106 ZoneList<CharacterRange>* range = CharacterRange::List( 5107 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); 5108 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 5109 } 5110 5111 5112 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, 5113 ZoneList<CharacterRange>* ranges) { 5114 #ifdef V8_I18N_SUPPORT 5115 // Use ICU to compute the case fold closure over the ranges. 5116 DCHECK(compiler->unicode()); 5117 DCHECK(compiler->ignore_case()); 5118 icu::UnicodeSet set; 5119 for (int i = 0; i < ranges->length(); i++) { 5120 set.add(ranges->at(i).from(), ranges->at(i).to()); 5121 } 5122 ranges->Clear(); 5123 set.closeOver(USET_CASE_INSENSITIVE); 5124 // Full case mapping map single characters to multiple characters. 5125 // Those are represented as strings in the set. Remove them so that 5126 // we end up with only simple and common case mappings. 5127 set.removeAllStrings(); 5128 Zone* zone = compiler->zone(); 5129 for (int i = 0; i < set.getRangeCount(); i++) { 5130 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 5131 zone); 5132 } 5133 // No errors and everything we collected have been ranges. 5134 #else 5135 // Fallback if ICU is not included. 5136 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), 5137 ranges, compiler->one_byte()); 5138 #endif // V8_I18N_SUPPORT 5139 CharacterRange::Canonicalize(ranges); 5140 } 5141 5142 5143 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5144 RegExpNode* on_success) { 5145 set_.Canonicalize(); 5146 Zone* zone = compiler->zone(); 5147 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5148 if (compiler->unicode() && compiler->ignore_case()) { 5149 AddUnicodeCaseEquivalents(compiler, ranges); 5150 } 5151 if (compiler->unicode() && !compiler->one_byte()) { 5152 if (is_negated()) { 5153 ZoneList<CharacterRange>* negated = 5154 new (zone) ZoneList<CharacterRange>(2, zone); 5155 CharacterRange::Negate(ranges, negated, zone); 5156 ranges = negated; 5157 } 5158 if (ranges->length() == 0) { 5159 ranges->Add(CharacterRange::Everything(), zone); 5160 RegExpCharacterClass* fail = 5161 new (zone) RegExpCharacterClass(ranges, true); 5162 return new (zone) TextNode(fail, compiler->read_backward(), on_success); 5163 } 5164 if (standard_type() == '*') { 5165 return UnanchoredAdvance(compiler, on_success); 5166 } else { 5167 ChoiceNode* result = new (zone) ChoiceNode(2, zone); 5168 UnicodeRangeSplitter splitter(zone, ranges); 5169 AddBmpCharacters(compiler, result, on_success, &splitter); 5170 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5171 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5172 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5173 return result; 5174 } 5175 } else { 5176 return new (zone) TextNode(this, compiler->read_backward(), on_success); 5177 } 5178 } 5179 5180 5181 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5182 RegExpAtom* atom1 = (*a)->AsAtom(); 5183 RegExpAtom* atom2 = (*b)->AsAtom(); 5184 uc16 character1 = atom1->data().at(0); 5185 uc16 character2 = atom2->data().at(0); 5186 if (character1 < character2) return -1; 5187 if (character1 > character2) return 1; 5188 return 0; 5189 } 5190 5191 5192 static unibrow::uchar Canonical( 5193 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5194 unibrow::uchar c) { 5195 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 5196 int length = canonicalize->get(c, '\0', chars); 5197 DCHECK_LE(length, 1); 5198 unibrow::uchar canonical = c; 5199 if (length == 1) canonical = chars[0]; 5200 return canonical; 5201 } 5202 5203 5204 int CompareFirstCharCaseIndependent( 5205 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5206 RegExpTree* const* a, RegExpTree* const* b) { 5207 RegExpAtom* atom1 = (*a)->AsAtom(); 5208 RegExpAtom* atom2 = (*b)->AsAtom(); 5209 unibrow::uchar character1 = atom1->data().at(0); 5210 unibrow::uchar character2 = atom2->data().at(0); 5211 if (character1 == character2) return 0; 5212 if (character1 >= 'a' || character2 >= 'a') { 5213 character1 = Canonical(canonicalize, character1); 5214 character2 = Canonical(canonicalize, character2); 5215 } 5216 return static_cast<int>(character1) - static_cast<int>(character2); 5217 } 5218 5219 5220 // We can stable sort runs of atoms, since the order does not matter if they 5221 // start with different characters. 5222 // Returns true if any consecutive atoms were found. 5223 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 5224 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5225 int length = alternatives->length(); 5226 bool found_consecutive_atoms = false; 5227 for (int i = 0; i < length; i++) { 5228 while (i < length) { 5229 RegExpTree* alternative = alternatives->at(i); 5230 if (alternative->IsAtom()) break; 5231 i++; 5232 } 5233 // i is length or it is the index of an atom. 5234 if (i == length) break; 5235 int first_atom = i; 5236 i++; 5237 while (i < length) { 5238 RegExpTree* alternative = alternatives->at(i); 5239 if (!alternative->IsAtom()) break; 5240 i++; 5241 } 5242 // Sort atoms to get ones with common prefixes together. 5243 // This step is more tricky if we are in a case-independent regexp, 5244 // because it would change /is|I/ to /I|is/, and order matters when 5245 // the regexp parts don't match only disjoint starting points. To fix 5246 // this we have a version of CompareFirstChar that uses case- 5247 // independent character classes for comparison. 5248 DCHECK_LT(first_atom, alternatives->length()); 5249 DCHECK_LE(i, alternatives->length()); 5250 DCHECK_LE(first_atom, i); 5251 if (compiler->ignore_case()) { 5252 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5253 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5254 auto compare_closure = 5255 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { 5256 return CompareFirstCharCaseIndependent(canonicalize, a, b); 5257 }; 5258 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 5259 } else { 5260 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 5261 } 5262 if (i - first_atom > 1) found_consecutive_atoms = true; 5263 } 5264 return found_consecutive_atoms; 5265 } 5266 5267 5268 // Optimizes ab|ac|az to a(?:b|c|d). 5269 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 5270 Zone* zone = compiler->zone(); 5271 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5272 int length = alternatives->length(); 5273 5274 int write_posn = 0; 5275 int i = 0; 5276 while (i < length) { 5277 RegExpTree* alternative = alternatives->at(i); 5278 if (!alternative->IsAtom()) { 5279 alternatives->at(write_posn++) = alternatives->at(i); 5280 i++; 5281 continue; 5282 } 5283 RegExpAtom* atom = alternative->AsAtom(); 5284 unibrow::uchar common_prefix = atom->data().at(0); 5285 int first_with_prefix = i; 5286 int prefix_length = atom->length(); 5287 i++; 5288 while (i < length) { 5289 alternative = alternatives->at(i); 5290 if (!alternative->IsAtom()) break; 5291 atom = alternative->AsAtom(); 5292 unibrow::uchar new_prefix = atom->data().at(0); 5293 if (new_prefix != common_prefix) { 5294 if (!compiler->ignore_case()) break; 5295 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5296 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5297 new_prefix = Canonical(canonicalize, new_prefix); 5298 common_prefix = Canonical(canonicalize, common_prefix); 5299 if (new_prefix != common_prefix) break; 5300 } 5301 prefix_length = Min(prefix_length, atom->length()); 5302 i++; 5303 } 5304 if (i > first_with_prefix + 2) { 5305 // Found worthwhile run of alternatives with common prefix of at least one 5306 // character. The sorting function above did not sort on more than one 5307 // character for reasons of correctness, but there may still be a longer 5308 // common prefix if the terms were similar or presorted in the input. 5309 // Find out how long the common prefix is. 5310 int run_length = i - first_with_prefix; 5311 atom = alternatives->at(first_with_prefix)->AsAtom(); 5312 for (int j = 1; j < run_length && prefix_length > 1; j++) { 5313 RegExpAtom* old_atom = 5314 alternatives->at(j + first_with_prefix)->AsAtom(); 5315 for (int k = 1; k < prefix_length; k++) { 5316 if (atom->data().at(k) != old_atom->data().at(k)) { 5317 prefix_length = k; 5318 break; 5319 } 5320 } 5321 } 5322 RegExpAtom* prefix = 5323 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); 5324 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 5325 pair->Add(prefix, zone); 5326 ZoneList<RegExpTree*>* suffixes = 5327 new (zone) ZoneList<RegExpTree*>(run_length, zone); 5328 for (int j = 0; j < run_length; j++) { 5329 RegExpAtom* old_atom = 5330 alternatives->at(j + first_with_prefix)->AsAtom(); 5331 int len = old_atom->length(); 5332 if (len == prefix_length) { 5333 suffixes->Add(new (zone) RegExpEmpty(), zone); 5334 } else { 5335 RegExpTree* suffix = new (zone) RegExpAtom( 5336 old_atom->data().SubVector(prefix_length, old_atom->length())); 5337 suffixes->Add(suffix, zone); 5338 } 5339 } 5340 pair->Add(new (zone) RegExpDisjunction(suffixes), zone); 5341 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); 5342 } else { 5343 // Just copy any non-worthwhile alternatives. 5344 for (int j = first_with_prefix; j < i; j++) { 5345 alternatives->at(write_posn++) = alternatives->at(j); 5346 } 5347 } 5348 } 5349 alternatives->Rewind(write_posn); // Trim end of array. 5350 } 5351 5352 5353 // Optimizes b|c|z to [bcz]. 5354 void RegExpDisjunction::FixSingleCharacterDisjunctions( 5355 RegExpCompiler* compiler) { 5356 Zone* zone = compiler->zone(); 5357 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5358 int length = alternatives->length(); 5359 5360 int write_posn = 0; 5361 int i = 0; 5362 while (i < length) { 5363 RegExpTree* alternative = alternatives->at(i); 5364 if (!alternative->IsAtom()) { 5365 alternatives->at(write_posn++) = alternatives->at(i); 5366 i++; 5367 continue; 5368 } 5369 RegExpAtom* atom = alternative->AsAtom(); 5370 if (atom->length() != 1) { 5371 alternatives->at(write_posn++) = alternatives->at(i); 5372 i++; 5373 continue; 5374 } 5375 int first_in_run = i; 5376 i++; 5377 while (i < length) { 5378 alternative = alternatives->at(i); 5379 if (!alternative->IsAtom()) break; 5380 atom = alternative->AsAtom(); 5381 if (atom->length() != 1) break; 5382 i++; 5383 } 5384 if (i > first_in_run + 1) { 5385 // Found non-trivial run of single-character alternatives. 5386 int run_length = i - first_in_run; 5387 ZoneList<CharacterRange>* ranges = 5388 new (zone) ZoneList<CharacterRange>(2, zone); 5389 for (int j = 0; j < run_length; j++) { 5390 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 5391 DCHECK_EQ(old_atom->length(), 1); 5392 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 5393 } 5394 alternatives->at(write_posn++) = 5395 new (zone) RegExpCharacterClass(ranges, false); 5396 } else { 5397 // Just copy any trivial alternatives. 5398 for (int j = first_in_run; j < i; j++) { 5399 alternatives->at(write_posn++) = alternatives->at(j); 5400 } 5401 } 5402 } 5403 alternatives->Rewind(write_posn); // Trim end of array. 5404 } 5405 5406 5407 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 5408 RegExpNode* on_success) { 5409 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5410 5411 if (alternatives->length() > 2) { 5412 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 5413 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 5414 FixSingleCharacterDisjunctions(compiler); 5415 if (alternatives->length() == 1) { 5416 return alternatives->at(0)->ToNode(compiler, on_success); 5417 } 5418 } 5419 5420 int length = alternatives->length(); 5421 5422 ChoiceNode* result = 5423 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5424 for (int i = 0; i < length; i++) { 5425 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5426 on_success)); 5427 result->AddAlternative(alternative); 5428 } 5429 return result; 5430 } 5431 5432 5433 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 5434 RegExpNode* on_success) { 5435 return ToNode(min(), 5436 max(), 5437 is_greedy(), 5438 body(), 5439 compiler, 5440 on_success); 5441 } 5442 5443 5444 // Scoped object to keep track of how much we unroll quantifier loops in the 5445 // regexp graph generator. 5446 class RegExpExpansionLimiter { 5447 public: 5448 static const int kMaxExpansionFactor = 6; 5449 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 5450 : compiler_(compiler), 5451 saved_expansion_factor_(compiler->current_expansion_factor()), 5452 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 5453 DCHECK(factor > 0); 5454 if (ok_to_expand_) { 5455 if (factor > kMaxExpansionFactor) { 5456 // Avoid integer overflow of the current expansion factor. 5457 ok_to_expand_ = false; 5458 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 5459 } else { 5460 int new_factor = saved_expansion_factor_ * factor; 5461 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 5462 compiler->set_current_expansion_factor(new_factor); 5463 } 5464 } 5465 } 5466 5467 ~RegExpExpansionLimiter() { 5468 compiler_->set_current_expansion_factor(saved_expansion_factor_); 5469 } 5470 5471 bool ok_to_expand() { return ok_to_expand_; } 5472 5473 private: 5474 RegExpCompiler* compiler_; 5475 int saved_expansion_factor_; 5476 bool ok_to_expand_; 5477 5478 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 5479 }; 5480 5481 5482 RegExpNode* RegExpQuantifier::ToNode(int min, 5483 int max, 5484 bool is_greedy, 5485 RegExpTree* body, 5486 RegExpCompiler* compiler, 5487 RegExpNode* on_success, 5488 bool not_at_start) { 5489 // x{f, t} becomes this: 5490 // 5491 // (r++)<-. 5492 // | ` 5493 // | (x) 5494 // v ^ 5495 // (r=0)-->(?)---/ [if r < t] 5496 // | 5497 // [if r >= f] \----> ... 5498 // 5499 5500 // 15.10.2.5 RepeatMatcher algorithm. 5501 // The parser has already eliminated the case where max is 0. In the case 5502 // where max_match is zero the parser has removed the quantifier if min was 5503 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 5504 5505 // If we know that we cannot match zero length then things are a little 5506 // simpler since we don't need to make the special zero length match check 5507 // from step 2.1. If the min and max are small we can unroll a little in 5508 // this case. 5509 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 5510 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 5511 if (max == 0) return on_success; // This can happen due to recursion. 5512 bool body_can_be_empty = (body->min_match() == 0); 5513 int body_start_reg = RegExpCompiler::kNoRegister; 5514 Interval capture_registers = body->CaptureRegisters(); 5515 bool needs_capture_clearing = !capture_registers.is_empty(); 5516 Zone* zone = compiler->zone(); 5517 5518 if (body_can_be_empty) { 5519 body_start_reg = compiler->AllocateRegister(); 5520 } else if (compiler->optimize() && !needs_capture_clearing) { 5521 // Only unroll if there are no captures and the body can't be 5522 // empty. 5523 { 5524 RegExpExpansionLimiter limiter( 5525 compiler, min + ((max != min) ? 1 : 0)); 5526 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 5527 int new_max = (max == kInfinity) ? max : max - min; 5528 // Recurse once to get the loop or optional matches after the fixed 5529 // ones. 5530 RegExpNode* answer = ToNode( 5531 0, new_max, is_greedy, body, compiler, on_success, true); 5532 // Unroll the forced matches from 0 to min. This can cause chains of 5533 // TextNodes (which the parser does not generate). These should be 5534 // combined if it turns out they hinder good code generation. 5535 for (int i = 0; i < min; i++) { 5536 answer = body->ToNode(compiler, answer); 5537 } 5538 return answer; 5539 } 5540 } 5541 if (max <= kMaxUnrolledMaxMatches && min == 0) { 5542 DCHECK(max > 0); // Due to the 'if' above. 5543 RegExpExpansionLimiter limiter(compiler, max); 5544 if (limiter.ok_to_expand()) { 5545 // Unroll the optional matches up to max. 5546 RegExpNode* answer = on_success; 5547 for (int i = 0; i < max; i++) { 5548 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 5549 if (is_greedy) { 5550 alternation->AddAlternative( 5551 GuardedAlternative(body->ToNode(compiler, answer))); 5552 alternation->AddAlternative(GuardedAlternative(on_success)); 5553 } else { 5554 alternation->AddAlternative(GuardedAlternative(on_success)); 5555 alternation->AddAlternative( 5556 GuardedAlternative(body->ToNode(compiler, answer))); 5557 } 5558 answer = alternation; 5559 if (not_at_start && !compiler->read_backward()) { 5560 alternation->set_not_at_start(); 5561 } 5562 } 5563 return answer; 5564 } 5565 } 5566 } 5567 bool has_min = min > 0; 5568 bool has_max = max < RegExpTree::kInfinity; 5569 bool needs_counter = has_min || has_max; 5570 int reg_ctr = needs_counter 5571 ? compiler->AllocateRegister() 5572 : RegExpCompiler::kNoRegister; 5573 LoopChoiceNode* center = new (zone) 5574 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone); 5575 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 5576 RegExpNode* loop_return = needs_counter 5577 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 5578 : static_cast<RegExpNode*>(center); 5579 if (body_can_be_empty) { 5580 // If the body can be empty we need to check if it was and then 5581 // backtrack. 5582 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 5583 reg_ctr, 5584 min, 5585 loop_return); 5586 } 5587 RegExpNode* body_node = body->ToNode(compiler, loop_return); 5588 if (body_can_be_empty) { 5589 // If the body can be empty we need to store the start position 5590 // so we can bail out if it was empty. 5591 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 5592 } 5593 if (needs_capture_clearing) { 5594 // Before entering the body of this loop we need to clear captures. 5595 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 5596 } 5597 GuardedAlternative body_alt(body_node); 5598 if (has_max) { 5599 Guard* body_guard = 5600 new(zone) Guard(reg_ctr, Guard::LT, max); 5601 body_alt.AddGuard(body_guard, zone); 5602 } 5603 GuardedAlternative rest_alt(on_success); 5604 if (has_min) { 5605 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 5606 rest_alt.AddGuard(rest_guard, zone); 5607 } 5608 if (is_greedy) { 5609 center->AddLoopAlternative(body_alt); 5610 center->AddContinueAlternative(rest_alt); 5611 } else { 5612 center->AddContinueAlternative(rest_alt); 5613 center->AddLoopAlternative(body_alt); 5614 } 5615 if (needs_counter) { 5616 return ActionNode::SetRegister(reg_ctr, 0, center); 5617 } else { 5618 return center; 5619 } 5620 } 5621 5622 5623 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5624 RegExpNode* on_success) { 5625 NodeInfo info; 5626 Zone* zone = compiler->zone(); 5627 5628 switch (assertion_type()) { 5629 case START_OF_LINE: 5630 return AssertionNode::AfterNewline(on_success); 5631 case START_OF_INPUT: 5632 return AssertionNode::AtStart(on_success); 5633 case BOUNDARY: 5634 return AssertionNode::AtBoundary(on_success); 5635 case NON_BOUNDARY: 5636 return AssertionNode::AtNonBoundary(on_success); 5637 case END_OF_INPUT: 5638 return AssertionNode::AtEnd(on_success); 5639 case END_OF_LINE: { 5640 // Compile $ in multiline regexps as an alternation with a positive 5641 // lookahead in one side and an end-of-input on the other side. 5642 // We need two registers for the lookahead. 5643 int stack_pointer_register = compiler->AllocateRegister(); 5644 int position_register = compiler->AllocateRegister(); 5645 // The ChoiceNode to distinguish between a newline and end-of-input. 5646 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5647 // Create a newline atom. 5648 ZoneList<CharacterRange>* newline_ranges = 5649 new(zone) ZoneList<CharacterRange>(3, zone); 5650 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5651 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); 5652 TextNode* newline_matcher = new (zone) TextNode( 5653 newline_atom, false, ActionNode::PositiveSubmatchSuccess( 5654 stack_pointer_register, position_register, 5655 0, // No captures inside. 5656 -1, // Ignored if no captures. 5657 on_success)); 5658 // Create an end-of-input matcher. 5659 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5660 stack_pointer_register, 5661 position_register, 5662 newline_matcher); 5663 // Add the two alternatives to the ChoiceNode. 5664 GuardedAlternative eol_alternative(end_of_line); 5665 result->AddAlternative(eol_alternative); 5666 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5667 result->AddAlternative(end_alternative); 5668 return result; 5669 } 5670 default: 5671 UNREACHABLE(); 5672 } 5673 return on_success; 5674 } 5675 5676 5677 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5678 RegExpNode* on_success) { 5679 return new (compiler->zone()) 5680 BackReferenceNode(RegExpCapture::StartRegister(index()), 5681 RegExpCapture::EndRegister(index()), 5682 compiler->read_backward(), on_success); 5683 } 5684 5685 5686 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5687 RegExpNode* on_success) { 5688 return on_success; 5689 } 5690 5691 5692 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 5693 int stack_pointer_register, 5694 int position_register, 5695 int capture_register_count, 5696 int capture_register_start) 5697 : is_positive_(is_positive), 5698 on_success_(on_success), 5699 stack_pointer_register_(stack_pointer_register), 5700 position_register_(position_register) { 5701 if (is_positive_) { 5702 on_match_success_ = ActionNode::PositiveSubmatchSuccess( 5703 stack_pointer_register, position_register, capture_register_count, 5704 capture_register_start, on_success_); 5705 } else { 5706 Zone* zone = on_success_->zone(); 5707 on_match_success_ = new (zone) NegativeSubmatchSuccess( 5708 stack_pointer_register, position_register, capture_register_count, 5709 capture_register_start, zone); 5710 } 5711 } 5712 5713 5714 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 5715 if (is_positive_) { 5716 return ActionNode::BeginSubmatch(stack_pointer_register_, 5717 position_register_, match); 5718 } else { 5719 Zone* zone = on_success_->zone(); 5720 // We use a ChoiceNode to represent the negative lookaround. The first 5721 // alternative is the negative match. On success, the end node backtracks. 5722 // On failure, the second alternative is tried and leads to success. 5723 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the 5724 // first exit when calculating quick checks. 5725 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( 5726 GuardedAlternative(match), GuardedAlternative(on_success_), zone); 5727 return ActionNode::BeginSubmatch(stack_pointer_register_, 5728 position_register_, choice_node); 5729 } 5730 } 5731 5732 5733 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 5734 RegExpNode* on_success) { 5735 int stack_pointer_register = compiler->AllocateRegister(); 5736 int position_register = compiler->AllocateRegister(); 5737 5738 const int registers_per_capture = 2; 5739 const int register_of_first_capture = 2; 5740 int register_count = capture_count_ * registers_per_capture; 5741 int register_start = 5742 register_of_first_capture + capture_from_ * registers_per_capture; 5743 5744 RegExpNode* result; 5745 bool was_reading_backward = compiler->read_backward(); 5746 compiler->set_read_backward(type() == LOOKBEHIND); 5747 Builder builder(is_positive(), on_success, stack_pointer_register, 5748 position_register, register_count, register_start); 5749 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 5750 result = builder.ForMatch(match); 5751 compiler->set_read_backward(was_reading_backward); 5752 return result; 5753 } 5754 5755 5756 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5757 RegExpNode* on_success) { 5758 return ToNode(body(), index(), compiler, on_success); 5759 } 5760 5761 5762 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5763 int index, 5764 RegExpCompiler* compiler, 5765 RegExpNode* on_success) { 5766 DCHECK_NOT_NULL(body); 5767 int start_reg = RegExpCapture::StartRegister(index); 5768 int end_reg = RegExpCapture::EndRegister(index); 5769 if (compiler->read_backward()) std::swap(start_reg, end_reg); 5770 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5771 RegExpNode* body_node = body->ToNode(compiler, store_end); 5772 return ActionNode::StorePosition(start_reg, true, body_node); 5773 } 5774 5775 5776 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5777 RegExpNode* on_success) { 5778 ZoneList<RegExpTree*>* children = nodes(); 5779 RegExpNode* current = on_success; 5780 if (compiler->read_backward()) { 5781 for (int i = 0; i < children->length(); i++) { 5782 current = children->at(i)->ToNode(compiler, current); 5783 } 5784 } else { 5785 for (int i = children->length() - 1; i >= 0; i--) { 5786 current = children->at(i)->ToNode(compiler, current); 5787 } 5788 } 5789 return current; 5790 } 5791 5792 5793 static void AddClass(const int* elmv, 5794 int elmc, 5795 ZoneList<CharacterRange>* ranges, 5796 Zone* zone) { 5797 elmc--; 5798 DCHECK(elmv[elmc] == kRangeEndMarker); 5799 for (int i = 0; i < elmc; i += 2) { 5800 DCHECK(elmv[i] < elmv[i + 1]); 5801 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 5802 } 5803 } 5804 5805 5806 static void AddClassNegated(const int *elmv, 5807 int elmc, 5808 ZoneList<CharacterRange>* ranges, 5809 Zone* zone) { 5810 elmc--; 5811 DCHECK(elmv[elmc] == kRangeEndMarker); 5812 DCHECK(elmv[0] != 0x0000); 5813 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint); 5814 uc16 last = 0x0000; 5815 for (int i = 0; i < elmc; i += 2) { 5816 DCHECK(last <= elmv[i] - 1); 5817 DCHECK(elmv[i] < elmv[i + 1]); 5818 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 5819 last = elmv[i + 1]; 5820 } 5821 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); 5822 } 5823 5824 5825 void CharacterRange::AddClassEscape(uc16 type, 5826 ZoneList<CharacterRange>* ranges, 5827 Zone* zone) { 5828 switch (type) { 5829 case 's': 5830 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5831 break; 5832 case 'S': 5833 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5834 break; 5835 case 'w': 5836 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5837 break; 5838 case 'W': 5839 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5840 break; 5841 case 'd': 5842 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5843 break; 5844 case 'D': 5845 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5846 break; 5847 case '.': 5848 AddClassNegated(kLineTerminatorRanges, 5849 kLineTerminatorRangeCount, 5850 ranges, 5851 zone); 5852 break; 5853 // This is not a character range as defined by the spec but a 5854 // convenient shorthand for a character class that matches any 5855 // character. 5856 case '*': 5857 ranges->Add(CharacterRange::Everything(), zone); 5858 break; 5859 // This is the set of characters matched by the $ and ^ symbols 5860 // in multiline mode. 5861 case 'n': 5862 AddClass(kLineTerminatorRanges, 5863 kLineTerminatorRangeCount, 5864 ranges, 5865 zone); 5866 break; 5867 default: 5868 UNREACHABLE(); 5869 } 5870 } 5871 5872 5873 Vector<const int> CharacterRange::GetWordBounds() { 5874 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5875 } 5876 5877 5878 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 5879 ZoneList<CharacterRange>* ranges, 5880 bool is_one_byte) { 5881 CharacterRange::Canonicalize(ranges); 5882 int range_count = ranges->length(); 5883 for (int i = 0; i < range_count; i++) { 5884 CharacterRange range = ranges->at(i); 5885 uc32 bottom = range.from(); 5886 if (bottom > String::kMaxUtf16CodeUnit) return; 5887 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); 5888 // Nothing to be done for surrogates. 5889 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; 5890 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 5891 if (bottom > String::kMaxOneByteCharCode) return; 5892 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5893 } 5894 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5895 if (top == bottom) { 5896 // If this is a singleton we just expand the one character. 5897 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5898 for (int i = 0; i < length; i++) { 5899 uc32 chr = chars[i]; 5900 if (chr != bottom) { 5901 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5902 } 5903 } 5904 } else { 5905 // If this is a range we expand the characters block by block, expanding 5906 // contiguous subranges (blocks) one at a time. The approach is as 5907 // follows. For a given start character we look up the remainder of the 5908 // block that contains it (represented by the end point), for instance we 5909 // find 'z' if the character is 'c'. A block is characterized by the 5910 // property that all characters uncanonicalize in the same way, except 5911 // that each entry in the result is incremented by the distance from the 5912 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 5913 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 5914 // we've found the end point we look up its uncanonicalization and 5915 // produce a range for each element. For instance for [c-f] we look up 5916 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 5917 // it is not already contained in the input, so [c-f] will be skipped but 5918 // [C-F] will be added. If this range is not completely contained in a 5919 // block we do this for all the blocks covered by the range (handling 5920 // characters that is not in a block as a "singleton block"). 5921 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5922 int pos = bottom; 5923 while (pos <= top) { 5924 int length = 5925 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 5926 uc32 block_end; 5927 if (length == 0) { 5928 block_end = pos; 5929 } else { 5930 DCHECK_EQ(1, length); 5931 block_end = equivalents[0]; 5932 } 5933 int end = (block_end > top) ? top : block_end; 5934 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 5935 equivalents); 5936 for (int i = 0; i < length; i++) { 5937 uc32 c = equivalents[i]; 5938 uc32 range_from = c - (block_end - pos); 5939 uc32 range_to = c - (block_end - end); 5940 if (!(bottom <= range_from && range_to <= top)) { 5941 ranges->Add(CharacterRange::Range(range_from, range_to), zone); 5942 } 5943 } 5944 pos = end + 1; 5945 } 5946 } 5947 } 5948 } 5949 5950 5951 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5952 DCHECK_NOT_NULL(ranges); 5953 int n = ranges->length(); 5954 if (n <= 1) return true; 5955 int max = ranges->at(0).to(); 5956 for (int i = 1; i < n; i++) { 5957 CharacterRange next_range = ranges->at(i); 5958 if (next_range.from() <= max + 1) return false; 5959 max = next_range.to(); 5960 } 5961 return true; 5962 } 5963 5964 5965 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5966 if (ranges_ == NULL) { 5967 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5968 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5969 } 5970 return ranges_; 5971 } 5972 5973 5974 // Move a number of elements in a zonelist to another position 5975 // in the same list. Handles overlapping source and target areas. 5976 static void MoveRanges(ZoneList<CharacterRange>* list, 5977 int from, 5978 int to, 5979 int count) { 5980 // Ranges are potentially overlapping. 5981 if (from < to) { 5982 for (int i = count - 1; i >= 0; i--) { 5983 list->at(to + i) = list->at(from + i); 5984 } 5985 } else { 5986 for (int i = 0; i < count; i++) { 5987 list->at(to + i) = list->at(from + i); 5988 } 5989 } 5990 } 5991 5992 5993 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5994 int count, 5995 CharacterRange insert) { 5996 // Inserts a range into list[0..count[, which must be sorted 5997 // by from value and non-overlapping and non-adjacent, using at most 5998 // list[0..count] for the result. Returns the number of resulting 5999 // canonicalized ranges. Inserting a range may collapse existing ranges into 6000 // fewer ranges, so the return value can be anything in the range 1..count+1. 6001 uc32 from = insert.from(); 6002 uc32 to = insert.to(); 6003 int start_pos = 0; 6004 int end_pos = count; 6005 for (int i = count - 1; i >= 0; i--) { 6006 CharacterRange current = list->at(i); 6007 if (current.from() > to + 1) { 6008 end_pos = i; 6009 } else if (current.to() + 1 < from) { 6010 start_pos = i + 1; 6011 break; 6012 } 6013 } 6014 6015 // Inserted range overlaps, or is adjacent to, ranges at positions 6016 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 6017 // not affected by the insertion. 6018 // If start_pos == end_pos, the range must be inserted before start_pos. 6019 // if start_pos < end_pos, the entire range from start_pos to end_pos 6020 // must be merged with the insert range. 6021 6022 if (start_pos == end_pos) { 6023 // Insert between existing ranges at position start_pos. 6024 if (start_pos < count) { 6025 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 6026 } 6027 list->at(start_pos) = insert; 6028 return count + 1; 6029 } 6030 if (start_pos + 1 == end_pos) { 6031 // Replace single existing range at position start_pos. 6032 CharacterRange to_replace = list->at(start_pos); 6033 int new_from = Min(to_replace.from(), from); 6034 int new_to = Max(to_replace.to(), to); 6035 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6036 return count; 6037 } 6038 // Replace a number of existing ranges from start_pos to end_pos - 1. 6039 // Move the remaining ranges down. 6040 6041 int new_from = Min(list->at(start_pos).from(), from); 6042 int new_to = Max(list->at(end_pos - 1).to(), to); 6043 if (end_pos < count) { 6044 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 6045 } 6046 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6047 return count - (end_pos - start_pos) + 1; 6048 } 6049 6050 6051 void CharacterSet::Canonicalize() { 6052 // Special/default classes are always considered canonical. The result 6053 // of calling ranges() will be sorted. 6054 if (ranges_ == NULL) return; 6055 CharacterRange::Canonicalize(ranges_); 6056 } 6057 6058 6059 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 6060 if (character_ranges->length() <= 1) return; 6061 // Check whether ranges are already canonical (increasing, non-overlapping, 6062 // non-adjacent). 6063 int n = character_ranges->length(); 6064 int max = character_ranges->at(0).to(); 6065 int i = 1; 6066 while (i < n) { 6067 CharacterRange current = character_ranges->at(i); 6068 if (current.from() <= max + 1) { 6069 break; 6070 } 6071 max = current.to(); 6072 i++; 6073 } 6074 // Canonical until the i'th range. If that's all of them, we are done. 6075 if (i == n) return; 6076 6077 // The ranges at index i and forward are not canonicalized. Make them so by 6078 // doing the equivalent of insertion sort (inserting each into the previous 6079 // list, in order). 6080 // Notice that inserting a range can reduce the number of ranges in the 6081 // result due to combining of adjacent and overlapping ranges. 6082 int read = i; // Range to insert. 6083 int num_canonical = i; // Length of canonicalized part of list. 6084 do { 6085 num_canonical = InsertRangeInCanonicalList(character_ranges, 6086 num_canonical, 6087 character_ranges->at(read)); 6088 read++; 6089 } while (read < n); 6090 character_ranges->Rewind(num_canonical); 6091 6092 DCHECK(CharacterRange::IsCanonical(character_ranges)); 6093 } 6094 6095 6096 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 6097 ZoneList<CharacterRange>* negated_ranges, 6098 Zone* zone) { 6099 DCHECK(CharacterRange::IsCanonical(ranges)); 6100 DCHECK_EQ(0, negated_ranges->length()); 6101 int range_count = ranges->length(); 6102 uc32 from = 0; 6103 int i = 0; 6104 if (range_count > 0 && ranges->at(0).from() == 0) { 6105 from = ranges->at(0).to() + 1; 6106 i = 1; 6107 } 6108 while (i < range_count) { 6109 CharacterRange range = ranges->at(i); 6110 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 6111 from = range.to() + 1; 6112 i++; 6113 } 6114 if (from < String::kMaxCodePoint) { 6115 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), 6116 zone); 6117 } 6118 } 6119 6120 6121 // ------------------------------------------------------------------- 6122 // Splay tree 6123 6124 6125 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 6126 if (Get(value)) 6127 return this; 6128 if (successors(zone) != NULL) { 6129 for (int i = 0; i < successors(zone)->length(); i++) { 6130 OutSet* successor = successors(zone)->at(i); 6131 if (successor->Get(value)) 6132 return successor; 6133 } 6134 } else { 6135 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 6136 } 6137 OutSet* result = new(zone) OutSet(first_, remaining_); 6138 result->Set(value, zone); 6139 successors(zone)->Add(result, zone); 6140 return result; 6141 } 6142 6143 6144 void OutSet::Set(unsigned value, Zone *zone) { 6145 if (value < kFirstLimit) { 6146 first_ |= (1 << value); 6147 } else { 6148 if (remaining_ == NULL) 6149 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 6150 if (remaining_->is_empty() || !remaining_->Contains(value)) 6151 remaining_->Add(value, zone); 6152 } 6153 } 6154 6155 6156 bool OutSet::Get(unsigned value) const { 6157 if (value < kFirstLimit) { 6158 return (first_ & (1 << value)) != 0; 6159 } else if (remaining_ == NULL) { 6160 return false; 6161 } else { 6162 return remaining_->Contains(value); 6163 } 6164 } 6165 6166 6167 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 6168 6169 6170 void DispatchTable::AddRange(CharacterRange full_range, int value, 6171 Zone* zone) { 6172 CharacterRange current = full_range; 6173 if (tree()->is_empty()) { 6174 // If this is the first range we just insert into the table. 6175 ZoneSplayTree<Config>::Locator loc; 6176 bool inserted = tree()->Insert(current.from(), &loc); 6177 DCHECK(inserted); 6178 USE(inserted); 6179 loc.set_value(Entry(current.from(), current.to(), 6180 empty()->Extend(value, zone))); 6181 return; 6182 } 6183 // First see if there is a range to the left of this one that 6184 // overlaps. 6185 ZoneSplayTree<Config>::Locator loc; 6186 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 6187 Entry* entry = &loc.value(); 6188 // If we've found a range that overlaps with this one, and it 6189 // starts strictly to the left of this one, we have to fix it 6190 // because the following code only handles ranges that start on 6191 // or after the start point of the range we're adding. 6192 if (entry->from() < current.from() && entry->to() >= current.from()) { 6193 // Snap the overlapping range in half around the start point of 6194 // the range we're adding. 6195 CharacterRange left = 6196 CharacterRange::Range(entry->from(), current.from() - 1); 6197 CharacterRange right = CharacterRange::Range(current.from(), entry->to()); 6198 // The left part of the overlapping range doesn't overlap. 6199 // Truncate the whole entry to be just the left part. 6200 entry->set_to(left.to()); 6201 // The right part is the one that overlaps. We add this part 6202 // to the map and let the next step deal with merging it with 6203 // the range we're adding. 6204 ZoneSplayTree<Config>::Locator loc; 6205 bool inserted = tree()->Insert(right.from(), &loc); 6206 DCHECK(inserted); 6207 USE(inserted); 6208 loc.set_value(Entry(right.from(), 6209 right.to(), 6210 entry->out_set())); 6211 } 6212 } 6213 while (current.is_valid()) { 6214 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 6215 (loc.value().from() <= current.to()) && 6216 (loc.value().to() >= current.from())) { 6217 Entry* entry = &loc.value(); 6218 // We have overlap. If there is space between the start point of 6219 // the range we're adding and where the overlapping range starts 6220 // then we have to add a range covering just that space. 6221 if (current.from() < entry->from()) { 6222 ZoneSplayTree<Config>::Locator ins; 6223 bool inserted = tree()->Insert(current.from(), &ins); 6224 DCHECK(inserted); 6225 USE(inserted); 6226 ins.set_value(Entry(current.from(), 6227 entry->from() - 1, 6228 empty()->Extend(value, zone))); 6229 current.set_from(entry->from()); 6230 } 6231 DCHECK_EQ(current.from(), entry->from()); 6232 // If the overlapping range extends beyond the one we want to add 6233 // we have to snap the right part off and add it separately. 6234 if (entry->to() > current.to()) { 6235 ZoneSplayTree<Config>::Locator ins; 6236 bool inserted = tree()->Insert(current.to() + 1, &ins); 6237 DCHECK(inserted); 6238 USE(inserted); 6239 ins.set_value(Entry(current.to() + 1, 6240 entry->to(), 6241 entry->out_set())); 6242 entry->set_to(current.to()); 6243 } 6244 DCHECK(entry->to() <= current.to()); 6245 // The overlapping range is now completely contained by the range 6246 // we're adding so we can just update it and move the start point 6247 // of the range we're adding just past it. 6248 entry->AddValue(value, zone); 6249 DCHECK(entry->to() + 1 > current.from()); 6250 current.set_from(entry->to() + 1); 6251 } else { 6252 // There is no overlap so we can just add the range 6253 ZoneSplayTree<Config>::Locator ins; 6254 bool inserted = tree()->Insert(current.from(), &ins); 6255 DCHECK(inserted); 6256 USE(inserted); 6257 ins.set_value(Entry(current.from(), 6258 current.to(), 6259 empty()->Extend(value, zone))); 6260 break; 6261 } 6262 } 6263 } 6264 6265 6266 OutSet* DispatchTable::Get(uc32 value) { 6267 ZoneSplayTree<Config>::Locator loc; 6268 if (!tree()->FindGreatestLessThan(value, &loc)) 6269 return empty(); 6270 Entry* entry = &loc.value(); 6271 if (value <= entry->to()) 6272 return entry->out_set(); 6273 else 6274 return empty(); 6275 } 6276 6277 6278 // ------------------------------------------------------------------- 6279 // Analysis 6280 6281 6282 void Analysis::EnsureAnalyzed(RegExpNode* that) { 6283 StackLimitCheck check(isolate()); 6284 if (check.HasOverflowed()) { 6285 fail("Stack overflow"); 6286 return; 6287 } 6288 if (that->info()->been_analyzed || that->info()->being_analyzed) 6289 return; 6290 that->info()->being_analyzed = true; 6291 that->Accept(this); 6292 that->info()->being_analyzed = false; 6293 that->info()->been_analyzed = true; 6294 } 6295 6296 6297 void Analysis::VisitEnd(EndNode* that) { 6298 // nothing to do 6299 } 6300 6301 6302 void TextNode::CalculateOffsets() { 6303 int element_count = elements()->length(); 6304 // Set up the offsets of the elements relative to the start. This is a fixed 6305 // quantity since a TextNode can only contain fixed-width things. 6306 int cp_offset = 0; 6307 for (int i = 0; i < element_count; i++) { 6308 TextElement& elm = elements()->at(i); 6309 elm.set_cp_offset(cp_offset); 6310 cp_offset += elm.length(); 6311 } 6312 } 6313 6314 6315 void Analysis::VisitText(TextNode* that) { 6316 if (ignore_case()) { 6317 that->MakeCaseIndependent(isolate(), is_one_byte_); 6318 } 6319 EnsureAnalyzed(that->on_success()); 6320 if (!has_failed()) { 6321 that->CalculateOffsets(); 6322 } 6323 } 6324 6325 6326 void Analysis::VisitAction(ActionNode* that) { 6327 RegExpNode* target = that->on_success(); 6328 EnsureAnalyzed(target); 6329 if (!has_failed()) { 6330 // If the next node is interested in what it follows then this node 6331 // has to be interested too so it can pass the information on. 6332 that->info()->AddFromFollowing(target->info()); 6333 } 6334 } 6335 6336 6337 void Analysis::VisitChoice(ChoiceNode* that) { 6338 NodeInfo* info = that->info(); 6339 for (int i = 0; i < that->alternatives()->length(); i++) { 6340 RegExpNode* node = that->alternatives()->at(i).node(); 6341 EnsureAnalyzed(node); 6342 if (has_failed()) return; 6343 // Anything the following nodes need to know has to be known by 6344 // this node also, so it can pass it on. 6345 info->AddFromFollowing(node->info()); 6346 } 6347 } 6348 6349 6350 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 6351 NodeInfo* info = that->info(); 6352 for (int i = 0; i < that->alternatives()->length(); i++) { 6353 RegExpNode* node = that->alternatives()->at(i).node(); 6354 if (node != that->loop_node()) { 6355 EnsureAnalyzed(node); 6356 if (has_failed()) return; 6357 info->AddFromFollowing(node->info()); 6358 } 6359 } 6360 // Check the loop last since it may need the value of this node 6361 // to get a correct result. 6362 EnsureAnalyzed(that->loop_node()); 6363 if (!has_failed()) { 6364 info->AddFromFollowing(that->loop_node()->info()); 6365 } 6366 } 6367 6368 6369 void Analysis::VisitBackReference(BackReferenceNode* that) { 6370 EnsureAnalyzed(that->on_success()); 6371 } 6372 6373 6374 void Analysis::VisitAssertion(AssertionNode* that) { 6375 EnsureAnalyzed(that->on_success()); 6376 } 6377 6378 6379 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6380 BoyerMooreLookahead* bm, 6381 bool not_at_start) { 6382 // Working out the set of characters that a backreference can match is too 6383 // hard, so we just say that any character can match. 6384 bm->SetRest(offset); 6385 SaveBMInfo(bm, not_at_start, offset); 6386 } 6387 6388 6389 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6390 RegExpMacroAssembler::kTableSize); 6391 6392 6393 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6394 BoyerMooreLookahead* bm, bool not_at_start) { 6395 ZoneList<GuardedAlternative>* alts = alternatives(); 6396 budget = (budget - 1) / alts->length(); 6397 for (int i = 0; i < alts->length(); i++) { 6398 GuardedAlternative& alt = alts->at(i); 6399 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6400 bm->SetRest(offset); // Give up trying to fill in info. 6401 SaveBMInfo(bm, not_at_start, offset); 6402 return; 6403 } 6404 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6405 } 6406 SaveBMInfo(bm, not_at_start, offset); 6407 } 6408 6409 6410 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6411 BoyerMooreLookahead* bm, bool not_at_start) { 6412 if (initial_offset >= bm->length()) return; 6413 int offset = initial_offset; 6414 int max_char = bm->max_char(); 6415 for (int i = 0; i < elements()->length(); i++) { 6416 if (offset >= bm->length()) { 6417 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6418 return; 6419 } 6420 TextElement text = elements()->at(i); 6421 if (text.text_type() == TextElement::ATOM) { 6422 RegExpAtom* atom = text.atom(); 6423 for (int j = 0; j < atom->length(); j++, offset++) { 6424 if (offset >= bm->length()) { 6425 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6426 return; 6427 } 6428 uc16 character = atom->data()[j]; 6429 if (bm->compiler()->ignore_case()) { 6430 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6431 int length = GetCaseIndependentLetters( 6432 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 6433 chars); 6434 for (int j = 0; j < length; j++) { 6435 bm->Set(offset, chars[j]); 6436 } 6437 } else { 6438 if (character <= max_char) bm->Set(offset, character); 6439 } 6440 } 6441 } else { 6442 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6443 RegExpCharacterClass* char_class = text.char_class(); 6444 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6445 if (char_class->is_negated()) { 6446 bm->SetAll(offset); 6447 } else { 6448 for (int k = 0; k < ranges->length(); k++) { 6449 CharacterRange& range = ranges->at(k); 6450 if (range.from() > max_char) continue; 6451 int to = Min(max_char, static_cast<int>(range.to())); 6452 bm->SetInterval(offset, Interval(range.from(), to)); 6453 } 6454 } 6455 offset++; 6456 } 6457 } 6458 if (offset >= bm->length()) { 6459 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6460 return; 6461 } 6462 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6463 true); // Not at start after a text node. 6464 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6465 } 6466 6467 6468 // ------------------------------------------------------------------- 6469 // Dispatch table construction 6470 6471 6472 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6473 AddRange(CharacterRange::Everything()); 6474 } 6475 6476 6477 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 6478 node->set_being_calculated(true); 6479 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 6480 for (int i = 0; i < alternatives->length(); i++) { 6481 set_choice_index(i); 6482 alternatives->at(i).node()->Accept(this); 6483 } 6484 node->set_being_calculated(false); 6485 } 6486 6487 6488 class AddDispatchRange { 6489 public: 6490 explicit AddDispatchRange(DispatchTableConstructor* constructor) 6491 : constructor_(constructor) { } 6492 void Call(uc32 from, DispatchTable::Entry entry); 6493 private: 6494 DispatchTableConstructor* constructor_; 6495 }; 6496 6497 6498 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 6499 constructor_->AddRange(CharacterRange::Range(from, entry.to())); 6500 } 6501 6502 6503 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 6504 if (node->being_calculated()) 6505 return; 6506 DispatchTable* table = node->GetTable(ignore_case_); 6507 AddDispatchRange adder(this); 6508 table->ForEach(&adder); 6509 } 6510 6511 6512 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 6513 // TODO(160): Find the node that we refer back to and propagate its start 6514 // set back to here. For now we just accept anything. 6515 AddRange(CharacterRange::Everything()); 6516 } 6517 6518 6519 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 6520 RegExpNode* target = that->on_success(); 6521 target->Accept(this); 6522 } 6523 6524 6525 static int CompareRangeByFrom(const CharacterRange* a, 6526 const CharacterRange* b) { 6527 return Compare<uc16>(a->from(), b->from()); 6528 } 6529 6530 6531 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 6532 ranges->Sort(CompareRangeByFrom); 6533 uc16 last = 0; 6534 for (int i = 0; i < ranges->length(); i++) { 6535 CharacterRange range = ranges->at(i); 6536 if (last < range.from()) 6537 AddRange(CharacterRange::Range(last, range.from() - 1)); 6538 if (range.to() >= last) { 6539 if (range.to() == String::kMaxCodePoint) { 6540 return; 6541 } else { 6542 last = range.to() + 1; 6543 } 6544 } 6545 } 6546 AddRange(CharacterRange::Range(last, String::kMaxCodePoint)); 6547 } 6548 6549 6550 void DispatchTableConstructor::VisitText(TextNode* that) { 6551 TextElement elm = that->elements()->at(0); 6552 switch (elm.text_type()) { 6553 case TextElement::ATOM: { 6554 uc16 c = elm.atom()->data()[0]; 6555 AddRange(CharacterRange::Range(c, c)); 6556 break; 6557 } 6558 case TextElement::CHAR_CLASS: { 6559 RegExpCharacterClass* tree = elm.char_class(); 6560 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 6561 if (tree->is_negated()) { 6562 AddInverse(ranges); 6563 } else { 6564 for (int i = 0; i < ranges->length(); i++) 6565 AddRange(ranges->at(i)); 6566 } 6567 break; 6568 } 6569 default: { 6570 UNIMPLEMENTED(); 6571 } 6572 } 6573 } 6574 6575 6576 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6577 RegExpNode* target = that->on_success(); 6578 target->Accept(this); 6579 } 6580 6581 6582 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, 6583 RegExpNode* on_success) { 6584 // If the regexp matching starts within a surrogate pair, step back 6585 // to the lead surrogate and start matching from there. 6586 DCHECK(!compiler->read_backward()); 6587 Zone* zone = compiler->zone(); 6588 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 6589 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 6590 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 6591 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 6592 6593 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); 6594 6595 int stack_register = compiler->UnicodeLookaroundStackRegister(); 6596 int position_register = compiler->UnicodeLookaroundPositionRegister(); 6597 RegExpNode* step_back = TextNode::CreateForCharacterRanges( 6598 zone, lead_surrogates, true, on_success); 6599 RegExpLookaround::Builder builder(true, step_back, stack_register, 6600 position_register); 6601 RegExpNode* match_trail = TextNode::CreateForCharacterRanges( 6602 zone, trail_surrogates, false, builder.on_match_success()); 6603 6604 optional_step_back->AddAlternative( 6605 GuardedAlternative(builder.ForMatch(match_trail))); 6606 optional_step_back->AddAlternative(GuardedAlternative(on_success)); 6607 6608 return optional_step_back; 6609 } 6610 6611 6612 RegExpEngine::CompilationResult RegExpEngine::Compile( 6613 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6614 JSRegExp::Flags flags, Handle<String> pattern, 6615 Handle<String> sample_subject, bool is_one_byte) { 6616 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6617 return IrregexpRegExpTooBig(isolate); 6618 } 6619 bool ignore_case = flags & JSRegExp::kIgnoreCase; 6620 bool is_sticky = flags & JSRegExp::kSticky; 6621 bool is_global = flags & JSRegExp::kGlobal; 6622 bool is_unicode = flags & JSRegExp::kUnicode; 6623 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 6624 is_one_byte); 6625 6626 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6627 6628 // Sample some characters from the middle of the string. 6629 static const int kSampleSize = 128; 6630 6631 sample_subject = String::Flatten(sample_subject); 6632 int chars_sampled = 0; 6633 int half_way = (sample_subject->length() - kSampleSize) / 2; 6634 for (int i = Max(0, half_way); 6635 i < sample_subject->length() && chars_sampled < kSampleSize; 6636 i++, chars_sampled++) { 6637 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6638 } 6639 6640 // Wrap the body of the regexp in capture #0. 6641 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6642 0, 6643 &compiler, 6644 compiler.accept()); 6645 RegExpNode* node = captured_body; 6646 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6647 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6648 int max_length = data->tree->max_match(); 6649 if (!is_start_anchored && !is_sticky) { 6650 // Add a .*? at the beginning, outside the body capture, unless 6651 // this expression is anchored at the beginning or sticky. 6652 RegExpNode* loop_node = RegExpQuantifier::ToNode( 6653 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), 6654 &compiler, captured_body, data->contains_anchor); 6655 6656 if (data->contains_anchor) { 6657 // Unroll loop once, to take care of the case that might start 6658 // at the start of input. 6659 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6660 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6661 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6662 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6663 node = first_step_node; 6664 } else { 6665 node = loop_node; 6666 } 6667 } 6668 if (is_one_byte) { 6669 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6670 // Do it again to propagate the new nodes to places where they were not 6671 // put because they had not been calculated yet. 6672 if (node != NULL) { 6673 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6674 } 6675 } else if (compiler.unicode() && (is_global || is_sticky)) { 6676 node = OptionallyStepBackToLeadSurrogate(&compiler, node); 6677 } 6678 6679 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6680 data->node = node; 6681 Analysis analysis(isolate, flags, is_one_byte); 6682 analysis.EnsureAnalyzed(node); 6683 if (analysis.has_failed()) { 6684 const char* error_message = analysis.error_message(); 6685 return CompilationResult(isolate, error_message); 6686 } 6687 6688 // Create the correct assembler for the architecture. 6689 #ifndef V8_INTERPRETED_REGEXP 6690 // Native regexp implementation. 6691 6692 NativeRegExpMacroAssembler::Mode mode = 6693 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 6694 : NativeRegExpMacroAssembler::UC16; 6695 6696 #if V8_TARGET_ARCH_IA32 6697 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode, 6698 (data->capture_count + 1) * 2); 6699 #elif V8_TARGET_ARCH_X64 6700 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode, 6701 (data->capture_count + 1) * 2); 6702 #elif V8_TARGET_ARCH_ARM 6703 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode, 6704 (data->capture_count + 1) * 2); 6705 #elif V8_TARGET_ARCH_ARM64 6706 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode, 6707 (data->capture_count + 1) * 2); 6708 #elif V8_TARGET_ARCH_S390 6709 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode, 6710 (data->capture_count + 1) * 2); 6711 #elif V8_TARGET_ARCH_PPC 6712 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode, 6713 (data->capture_count + 1) * 2); 6714 #elif V8_TARGET_ARCH_MIPS 6715 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6716 (data->capture_count + 1) * 2); 6717 #elif V8_TARGET_ARCH_MIPS64 6718 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6719 (data->capture_count + 1) * 2); 6720 #elif V8_TARGET_ARCH_X87 6721 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode, 6722 (data->capture_count + 1) * 2); 6723 #else 6724 #error "Unsupported architecture" 6725 #endif 6726 6727 #else // V8_INTERPRETED_REGEXP 6728 // Interpreted regexp implementation. 6729 EmbeddedVector<byte, 1024> codes; 6730 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone); 6731 #endif // V8_INTERPRETED_REGEXP 6732 6733 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern)); 6734 6735 // Inserted here, instead of in Assembler, because it depends on information 6736 // in the AST that isn't replicated in the Node structure. 6737 static const int kMaxBacksearchLimit = 1024; 6738 if (is_end_anchored && !is_start_anchored && !is_sticky && 6739 max_length < kMaxBacksearchLimit) { 6740 macro_assembler.SetCurrentPositionFromEnd(max_length); 6741 } 6742 6743 if (is_global) { 6744 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; 6745 if (data->tree->min_match() > 0) { 6746 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; 6747 } else if (is_unicode) { 6748 mode = RegExpMacroAssembler::GLOBAL_UNICODE; 6749 } 6750 macro_assembler.set_global_mode(mode); 6751 } 6752 6753 return compiler.Assemble(¯o_assembler, 6754 node, 6755 data->capture_count, 6756 pattern); 6757 } 6758 6759 6760 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) { 6761 Heap* heap = pattern->GetHeap(); 6762 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6763 if (heap->isolate()->total_regexp_code_generated() > 6764 RegExpImpl::kRegExpCompiledLimit && 6765 heap->CommittedMemoryExecutable() > 6766 RegExpImpl::kRegExpExecutableMemoryLimit) { 6767 too_much = true; 6768 } 6769 return too_much; 6770 } 6771 6772 6773 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string, 6774 Object* key_pattern, 6775 FixedArray** last_match_cache, 6776 ResultsCacheType type) { 6777 FixedArray* cache; 6778 if (!key_string->IsInternalizedString()) return Smi::kZero; 6779 if (type == STRING_SPLIT_SUBSTRINGS) { 6780 DCHECK(key_pattern->IsString()); 6781 if (!key_pattern->IsInternalizedString()) return Smi::kZero; 6782 cache = heap->string_split_cache(); 6783 } else { 6784 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6785 DCHECK(key_pattern->IsFixedArray()); 6786 cache = heap->regexp_multiple_cache(); 6787 } 6788 6789 uint32_t hash = key_string->Hash(); 6790 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6791 ~(kArrayEntriesPerCacheEntry - 1)); 6792 if (cache->get(index + kStringOffset) != key_string || 6793 cache->get(index + kPatternOffset) != key_pattern) { 6794 index = 6795 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6796 if (cache->get(index + kStringOffset) != key_string || 6797 cache->get(index + kPatternOffset) != key_pattern) { 6798 return Smi::kZero; 6799 } 6800 } 6801 6802 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset)); 6803 return cache->get(index + kArrayOffset); 6804 } 6805 6806 6807 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, 6808 Handle<Object> key_pattern, 6809 Handle<FixedArray> value_array, 6810 Handle<FixedArray> last_match_cache, 6811 ResultsCacheType type) { 6812 Factory* factory = isolate->factory(); 6813 Handle<FixedArray> cache; 6814 if (!key_string->IsInternalizedString()) return; 6815 if (type == STRING_SPLIT_SUBSTRINGS) { 6816 DCHECK(key_pattern->IsString()); 6817 if (!key_pattern->IsInternalizedString()) return; 6818 cache = factory->string_split_cache(); 6819 } else { 6820 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6821 DCHECK(key_pattern->IsFixedArray()); 6822 cache = factory->regexp_multiple_cache(); 6823 } 6824 6825 uint32_t hash = key_string->Hash(); 6826 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6827 ~(kArrayEntriesPerCacheEntry - 1)); 6828 if (cache->get(index + kStringOffset) == Smi::kZero) { 6829 cache->set(index + kStringOffset, *key_string); 6830 cache->set(index + kPatternOffset, *key_pattern); 6831 cache->set(index + kArrayOffset, *value_array); 6832 cache->set(index + kLastMatchOffset, *last_match_cache); 6833 } else { 6834 uint32_t index2 = 6835 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6836 if (cache->get(index2 + kStringOffset) == Smi::kZero) { 6837 cache->set(index2 + kStringOffset, *key_string); 6838 cache->set(index2 + kPatternOffset, *key_pattern); 6839 cache->set(index2 + kArrayOffset, *value_array); 6840 cache->set(index2 + kLastMatchOffset, *last_match_cache); 6841 } else { 6842 cache->set(index2 + kStringOffset, Smi::kZero); 6843 cache->set(index2 + kPatternOffset, Smi::kZero); 6844 cache->set(index2 + kArrayOffset, Smi::kZero); 6845 cache->set(index2 + kLastMatchOffset, Smi::kZero); 6846 cache->set(index + kStringOffset, *key_string); 6847 cache->set(index + kPatternOffset, *key_pattern); 6848 cache->set(index + kArrayOffset, *value_array); 6849 cache->set(index + kLastMatchOffset, *last_match_cache); 6850 } 6851 } 6852 // If the array is a reasonably short list of substrings, convert it into a 6853 // list of internalized strings. 6854 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { 6855 for (int i = 0; i < value_array->length(); i++) { 6856 Handle<String> str(String::cast(value_array->get(i)), isolate); 6857 Handle<String> internalized_str = factory->InternalizeString(str); 6858 value_array->set(i, *internalized_str); 6859 } 6860 } 6861 // Convert backing store to a copy-on-write array. 6862 value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map()); 6863 } 6864 6865 6866 void RegExpResultsCache::Clear(FixedArray* cache) { 6867 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6868 cache->set(i, Smi::kZero); 6869 } 6870 } 6871 6872 } // namespace internal 6873 } // namespace v8 6874