1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/jsregexp.h" 6 7 #include <memory> 8 9 #include "src/base/platform/platform.h" 10 #include "src/compilation-cache.h" 11 #include "src/elements.h" 12 #include "src/execution.h" 13 #include "src/factory.h" 14 #include "src/isolate-inl.h" 15 #include "src/messages.h" 16 #include "src/ostreams.h" 17 #include "src/regexp/interpreter-irregexp.h" 18 #include "src/regexp/jsregexp-inl.h" 19 #include "src/regexp/regexp-macro-assembler-irregexp.h" 20 #include "src/regexp/regexp-macro-assembler-tracer.h" 21 #include "src/regexp/regexp-macro-assembler.h" 22 #include "src/regexp/regexp-parser.h" 23 #include "src/regexp/regexp-stack.h" 24 #include "src/runtime/runtime.h" 25 #include "src/splay-tree-inl.h" 26 #include "src/string-search.h" 27 #include "src/unicode-decoder.h" 28 29 #ifdef V8_I18N_SUPPORT 30 #include "unicode/uset.h" 31 #include "unicode/utypes.h" 32 #endif // V8_I18N_SUPPORT 33 34 #ifndef V8_INTERPRETED_REGEXP 35 #if V8_TARGET_ARCH_IA32 36 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" 37 #elif V8_TARGET_ARCH_X64 38 #include "src/regexp/x64/regexp-macro-assembler-x64.h" 39 #elif V8_TARGET_ARCH_ARM64 40 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" 41 #elif V8_TARGET_ARCH_ARM 42 #include "src/regexp/arm/regexp-macro-assembler-arm.h" 43 #elif V8_TARGET_ARCH_PPC 44 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h" 45 #elif V8_TARGET_ARCH_S390 46 #include "src/regexp/s390/regexp-macro-assembler-s390.h" 47 #elif V8_TARGET_ARCH_MIPS 48 #include "src/regexp/mips/regexp-macro-assembler-mips.h" 49 #elif V8_TARGET_ARCH_MIPS64 50 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h" 51 #elif V8_TARGET_ARCH_X87 52 #include "src/regexp/x87/regexp-macro-assembler-x87.h" 53 #else 54 #error Unsupported target architecture. 55 #endif 56 #endif 57 58 59 namespace v8 { 60 namespace internal { 61 62 MUST_USE_RESULT 63 static inline MaybeHandle<Object> ThrowRegExpException( 64 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) { 65 Isolate* isolate = re->GetIsolate(); 66 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, 67 pattern, error_text), 68 Object); 69 } 70 71 72 inline void ThrowRegExpException(Handle<JSRegExp> re, 73 Handle<String> error_text) { 74 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text)); 75 } 76 77 78 ContainedInLattice AddRange(ContainedInLattice containment, 79 const int* ranges, 80 int ranges_length, 81 Interval new_range) { 82 DCHECK((ranges_length & 1) == 1); 83 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1); 84 if (containment == kLatticeUnknown) return containment; 85 bool inside = false; 86 int last = 0; 87 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 88 // Consider the range from last to ranges[i]. 89 // We haven't got to the new range yet. 90 if (ranges[i] <= new_range.from()) continue; 91 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 92 // inclusive, but the values in ranges are not. 93 if (last <= new_range.from() && new_range.to() < ranges[i]) { 94 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 95 } 96 return kLatticeUnknown; 97 } 98 return containment; 99 } 100 101 102 // More makes code generation slower, less makes V8 benchmark score lower. 103 const int kMaxLookaheadForBoyerMoore = 8; 104 // In a 3-character pattern you can maximally step forwards 3 characters 105 // at a time, which is not always enough to pay for the extra logic. 106 const int kPatternTooShortForBoyerMoore = 2; 107 108 109 // Identifies the sort of regexps where the regexp engine is faster 110 // than the code used for atom matches. 111 static bool HasFewDifferentCharacters(Handle<String> pattern) { 112 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 113 if (length <= kPatternTooShortForBoyerMoore) return false; 114 const int kMod = 128; 115 bool character_found[kMod]; 116 int different = 0; 117 memset(&character_found[0], 0, sizeof(character_found)); 118 for (int i = 0; i < length; i++) { 119 int ch = (pattern->Get(i) & (kMod - 1)); 120 if (!character_found[ch]) { 121 character_found[ch] = true; 122 different++; 123 // We declare a regexp low-alphabet if it has at least 3 times as many 124 // characters as it has different characters. 125 if (different * 3 > length) return false; 126 } 127 } 128 return true; 129 } 130 131 132 // Generic RegExp methods. Dispatches to implementation specific methods. 133 134 135 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 136 Handle<String> pattern, 137 JSRegExp::Flags flags) { 138 Isolate* isolate = re->GetIsolate(); 139 Zone zone(isolate->allocator(), ZONE_NAME); 140 CompilationCache* compilation_cache = isolate->compilation_cache(); 141 MaybeHandle<FixedArray> maybe_cached = 142 compilation_cache->LookupRegExp(pattern, flags); 143 Handle<FixedArray> cached; 144 if (maybe_cached.ToHandle(&cached)) { 145 re->set_data(*cached); 146 return re; 147 } 148 pattern = String::Flatten(pattern); 149 PostponeInterruptsScope postpone(isolate); 150 RegExpCompileData parse_result; 151 FlatStringReader reader(isolate, pattern); 152 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags, 153 &parse_result)) { 154 // Throw an exception if we fail to parse the pattern. 155 return ThrowRegExpException(re, pattern, parse_result.error); 156 } 157 158 bool has_been_compiled = false; 159 160 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && 161 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) { 162 // Parse-tree is a single atom that is equal to the pattern. 163 AtomCompile(re, pattern, flags, pattern); 164 has_been_compiled = true; 165 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) && 166 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) { 167 RegExpAtom* atom = parse_result.tree->AsAtom(); 168 Vector<const uc16> atom_pattern = atom->data(); 169 Handle<String> atom_string; 170 ASSIGN_RETURN_ON_EXCEPTION( 171 isolate, atom_string, 172 isolate->factory()->NewStringFromTwoByte(atom_pattern), 173 Object); 174 if (!HasFewDifferentCharacters(atom_string)) { 175 AtomCompile(re, pattern, flags, atom_string); 176 has_been_compiled = true; 177 } 178 } 179 if (!has_been_compiled) { 180 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 181 } 182 DCHECK(re->data()->IsFixedArray()); 183 // Compilation succeeded so the data is set on the regexp 184 // and we can store it in the cache. 185 Handle<FixedArray> data(FixedArray::cast(re->data())); 186 compilation_cache->PutRegExp(pattern, flags, data); 187 188 return re; 189 } 190 191 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 192 Handle<String> subject, int index, 193 Handle<RegExpMatchInfo> last_match_info) { 194 switch (regexp->TypeTag()) { 195 case JSRegExp::ATOM: 196 return AtomExec(regexp, subject, index, last_match_info); 197 case JSRegExp::IRREGEXP: { 198 return IrregexpExec(regexp, subject, index, last_match_info); 199 } 200 default: 201 UNREACHABLE(); 202 return MaybeHandle<Object>(); 203 } 204 } 205 206 207 // RegExp Atom implementation: Simple string search using indexOf. 208 209 210 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 211 Handle<String> pattern, 212 JSRegExp::Flags flags, 213 Handle<String> match_pattern) { 214 re->GetIsolate()->factory()->SetRegExpAtomData(re, 215 JSRegExp::ATOM, 216 pattern, 217 flags, 218 match_pattern); 219 } 220 221 static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info, 222 String* subject, int from, int to) { 223 SealHandleScope shs(last_match_info->GetIsolate()); 224 last_match_info->SetNumberOfCaptureRegisters(2); 225 last_match_info->SetLastSubject(subject); 226 last_match_info->SetLastInput(subject); 227 last_match_info->SetCapture(0, from); 228 last_match_info->SetCapture(1, to); 229 } 230 231 232 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 233 Handle<String> subject, 234 int index, 235 int32_t* output, 236 int output_size) { 237 Isolate* isolate = regexp->GetIsolate(); 238 239 DCHECK(0 <= index); 240 DCHECK(index <= subject->length()); 241 242 subject = String::Flatten(subject); 243 DisallowHeapAllocation no_gc; // ensure vectors stay valid 244 245 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 246 int needle_len = needle->length(); 247 DCHECK(needle->IsFlat()); 248 DCHECK_LT(0, needle_len); 249 250 if (index + needle_len > subject->length()) { 251 return RegExpImpl::RE_FAILURE; 252 } 253 254 for (int i = 0; i < output_size; i += 2) { 255 String::FlatContent needle_content = needle->GetFlatContent(); 256 String::FlatContent subject_content = subject->GetFlatContent(); 257 DCHECK(needle_content.IsFlat()); 258 DCHECK(subject_content.IsFlat()); 259 // dispatch on type of strings 260 index = 261 (needle_content.IsOneByte() 262 ? (subject_content.IsOneByte() 263 ? SearchString(isolate, subject_content.ToOneByteVector(), 264 needle_content.ToOneByteVector(), index) 265 : SearchString(isolate, subject_content.ToUC16Vector(), 266 needle_content.ToOneByteVector(), index)) 267 : (subject_content.IsOneByte() 268 ? SearchString(isolate, subject_content.ToOneByteVector(), 269 needle_content.ToUC16Vector(), index) 270 : SearchString(isolate, subject_content.ToUC16Vector(), 271 needle_content.ToUC16Vector(), index))); 272 if (index == -1) { 273 return i / 2; // Return number of matches. 274 } else { 275 output[i] = index; 276 output[i+1] = index + needle_len; 277 index += needle_len; 278 } 279 } 280 return output_size / 2; 281 } 282 283 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject, 284 int index, 285 Handle<RegExpMatchInfo> last_match_info) { 286 Isolate* isolate = re->GetIsolate(); 287 288 static const int kNumRegisters = 2; 289 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 290 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 291 292 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 293 294 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 295 296 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS); 297 SealHandleScope shs(isolate); 298 SetAtomLastCapture(last_match_info, *subject, output_registers[0], 299 output_registers[1]); 300 return last_match_info; 301 } 302 303 304 // Irregexp implementation. 305 306 // Ensures that the regexp object contains a compiled version of the 307 // source for either one-byte or two-byte subject strings. 308 // If the compiled version doesn't already exist, it is compiled 309 // from the source pattern. 310 // If compilation fails, an exception is thrown and this function 311 // returns false. 312 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, 313 Handle<String> sample_subject, 314 bool is_one_byte) { 315 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); 316 #ifdef V8_INTERPRETED_REGEXP 317 if (compiled_code->IsByteArray()) return true; 318 #else // V8_INTERPRETED_REGEXP (RegExp native code) 319 if (compiled_code->IsCode()) return true; 320 #endif 321 // We could potentially have marked this as flushable, but have kept 322 // a saved version if we did not flush it yet. 323 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 324 if (saved_code->IsCode()) { 325 // Reinstate the code in the original place. 326 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); 327 DCHECK(compiled_code->IsSmi()); 328 return true; 329 } 330 return CompileIrregexp(re, sample_subject, is_one_byte); 331 } 332 333 334 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 335 Handle<String> sample_subject, 336 bool is_one_byte) { 337 // Compile the RegExp. 338 Isolate* isolate = re->GetIsolate(); 339 Zone zone(isolate->allocator(), ZONE_NAME); 340 PostponeInterruptsScope postpone(isolate); 341 // If we had a compilation error the last time this is saved at the 342 // saved code index. 343 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); 344 // When arriving here entry can only be a smi, either representing an 345 // uncompiled regexp, a previous compilation error, or code that has 346 // been flushed. 347 DCHECK(entry->IsSmi()); 348 int entry_value = Smi::cast(entry)->value(); 349 DCHECK(entry_value == JSRegExp::kUninitializedValue || 350 entry_value == JSRegExp::kCompilationErrorValue || 351 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 352 353 if (entry_value == JSRegExp::kCompilationErrorValue) { 354 // A previous compilation failed and threw an error which we store in 355 // the saved code index (we store the error message, not the actual 356 // error). Recreate the error object and throw it. 357 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 358 DCHECK(error_string->IsString()); 359 Handle<String> error_message(String::cast(error_string)); 360 ThrowRegExpException(re, error_message); 361 return false; 362 } 363 364 JSRegExp::Flags flags = re->GetFlags(); 365 366 Handle<String> pattern(re->Pattern()); 367 pattern = String::Flatten(pattern); 368 RegExpCompileData compile_data; 369 FlatStringReader reader(isolate, pattern); 370 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, 371 &compile_data)) { 372 // Throw an exception if we fail to parse the pattern. 373 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 374 USE(ThrowRegExpException(re, pattern, compile_data.error)); 375 return false; 376 } 377 RegExpEngine::CompilationResult result = 378 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern, 379 sample_subject, is_one_byte); 380 if (result.error_message != NULL) { 381 // Unable to compile regexp. 382 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 383 CStrVector(result.error_message)).ToHandleChecked(); 384 ThrowRegExpException(re, error_message); 385 return false; 386 } 387 388 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 389 data->set(JSRegExp::code_index(is_one_byte), result.code); 390 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map); 391 int register_max = IrregexpMaxRegisterCount(*data); 392 if (result.num_registers > register_max) { 393 SetIrregexpMaxRegisterCount(*data, result.num_registers); 394 } 395 396 return true; 397 } 398 399 400 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 401 return Smi::cast( 402 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 403 } 404 405 406 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 407 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 408 } 409 410 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re, 411 Handle<FixedArray> value) { 412 if (value.is_null()) { 413 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero); 414 } else { 415 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value); 416 } 417 } 418 419 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 420 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 421 } 422 423 424 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 425 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 426 } 427 428 429 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { 430 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); 431 } 432 433 434 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { 435 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); 436 } 437 438 439 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 440 Handle<String> pattern, 441 JSRegExp::Flags flags, 442 int capture_count) { 443 // Initialize compiled code entries to null. 444 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 445 JSRegExp::IRREGEXP, 446 pattern, 447 flags, 448 capture_count); 449 } 450 451 452 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 453 Handle<String> subject) { 454 subject = String::Flatten(subject); 455 456 // Check representation of the underlying storage. 457 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 458 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; 459 460 #ifdef V8_INTERPRETED_REGEXP 461 // Byte-code regexp needs space allocated for all its registers. 462 // The result captures are copied to the start of the registers array 463 // if the match succeeds. This way those registers are not clobbered 464 // when we set the last match info from last successful match. 465 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 466 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 467 #else // V8_INTERPRETED_REGEXP 468 // Native regexp only needs room to output captures. Registers are handled 469 // internally. 470 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 471 #endif // V8_INTERPRETED_REGEXP 472 } 473 474 475 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 476 Handle<String> subject, 477 int index, 478 int32_t* output, 479 int output_size) { 480 Isolate* isolate = regexp->GetIsolate(); 481 482 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 483 484 DCHECK(index >= 0); 485 DCHECK(index <= subject->length()); 486 DCHECK(subject->IsFlat()); 487 488 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 489 490 #ifndef V8_INTERPRETED_REGEXP 491 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 492 do { 493 EnsureCompiledIrregexp(regexp, subject, is_one_byte); 494 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); 495 // The stack is used to allocate registers for the compiled regexp code. 496 // This means that in case of failure, the output registers array is left 497 // untouched and contains the capture results from the previous successful 498 // match. We can use that to set the last match info lazily. 499 NativeRegExpMacroAssembler::Result res = 500 NativeRegExpMacroAssembler::Match(code, 501 subject, 502 output, 503 output_size, 504 index, 505 isolate); 506 if (res != NativeRegExpMacroAssembler::RETRY) { 507 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 508 isolate->has_pending_exception()); 509 STATIC_ASSERT( 510 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 511 STATIC_ASSERT( 512 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 513 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 514 == RE_EXCEPTION); 515 return static_cast<IrregexpResult>(res); 516 } 517 // If result is RETRY, the string has changed representation, and we 518 // must restart from scratch. 519 // In this case, it means we must make sure we are prepared to handle 520 // the, potentially, different subject (the string can switch between 521 // being internal and external, and even between being Latin1 and UC16, 522 // but the characters are always the same). 523 IrregexpPrepare(regexp, subject); 524 is_one_byte = subject->IsOneByteRepresentationUnderneath(); 525 } while (true); 526 UNREACHABLE(); 527 return RE_EXCEPTION; 528 #else // V8_INTERPRETED_REGEXP 529 530 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); 531 // We must have done EnsureCompiledIrregexp, so we can get the number of 532 // registers. 533 int number_of_capture_registers = 534 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 535 int32_t* raw_output = &output[number_of_capture_registers]; 536 // We do not touch the actual capture result registers until we know there 537 // has been a match so that we can use those capture results to set the 538 // last match info. 539 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 540 raw_output[i] = -1; 541 } 542 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), 543 isolate); 544 545 IrregexpResult result = IrregexpInterpreter::Match(isolate, 546 byte_codes, 547 subject, 548 raw_output, 549 index); 550 if (result == RE_SUCCESS) { 551 // Copy capture results to the start of the registers array. 552 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 553 } 554 if (result == RE_EXCEPTION) { 555 DCHECK(!isolate->has_pending_exception()); 556 isolate->StackOverflow(); 557 } 558 return result; 559 #endif // V8_INTERPRETED_REGEXP 560 } 561 562 MaybeHandle<Object> RegExpImpl::IrregexpExec( 563 Handle<JSRegExp> regexp, Handle<String> subject, int previous_index, 564 Handle<RegExpMatchInfo> last_match_info) { 565 Isolate* isolate = regexp->GetIsolate(); 566 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 567 568 // Prepare space for the return values. 569 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 570 if (FLAG_trace_regexp_bytecodes) { 571 String* pattern = regexp->Pattern(); 572 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 573 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 574 } 575 #endif 576 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 577 if (required_registers < 0) { 578 // Compiling failed with an exception. 579 DCHECK(isolate->has_pending_exception()); 580 return MaybeHandle<Object>(); 581 } 582 583 int32_t* output_registers = NULL; 584 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 585 output_registers = NewArray<int32_t>(required_registers); 586 } 587 std::unique_ptr<int32_t[]> auto_release(output_registers); 588 if (output_registers == NULL) { 589 output_registers = isolate->jsregexp_static_offsets_vector(); 590 } 591 592 int res = RegExpImpl::IrregexpExecRaw( 593 regexp, subject, previous_index, output_registers, required_registers); 594 if (res == RE_SUCCESS) { 595 int capture_count = 596 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 597 return SetLastMatchInfo( 598 last_match_info, subject, capture_count, output_registers); 599 } 600 if (res == RE_EXCEPTION) { 601 DCHECK(isolate->has_pending_exception()); 602 return MaybeHandle<Object>(); 603 } 604 DCHECK(res == RE_FAILURE); 605 return isolate->factory()->null_value(); 606 } 607 608 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo( 609 Handle<RegExpMatchInfo> last_match_info, Handle<String> subject, 610 int capture_count, int32_t* match) { 611 // This is the only place where match infos can grow. If, after executing the 612 // regexp, RegExpExecStub finds that the match info is too small, it restarts 613 // execution in RegExpImpl::Exec, which finally grows the match info right 614 // here. 615 616 int capture_register_count = (capture_count + 1) * 2; 617 Handle<RegExpMatchInfo> result = 618 RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count); 619 result->SetNumberOfCaptureRegisters(capture_register_count); 620 621 if (*result != *last_match_info) { 622 // The match info has been reallocated, update the corresponding reference 623 // on the native context. 624 Isolate* isolate = last_match_info->GetIsolate(); 625 if (*last_match_info == *isolate->regexp_last_match_info()) { 626 isolate->native_context()->set_regexp_last_match_info(*result); 627 } else if (*last_match_info == *isolate->regexp_internal_match_info()) { 628 isolate->native_context()->set_regexp_internal_match_info(*result); 629 } 630 } 631 632 DisallowHeapAllocation no_allocation; 633 if (match != NULL) { 634 for (int i = 0; i < capture_register_count; i += 2) { 635 result->SetCapture(i, match[i]); 636 result->SetCapture(i + 1, match[i + 1]); 637 } 638 } 639 result->SetLastSubject(*subject); 640 result->SetLastInput(*subject); 641 return result; 642 } 643 644 645 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 646 Handle<String> subject, 647 Isolate* isolate) 648 : register_array_(NULL), 649 register_array_size_(0), 650 regexp_(regexp), 651 subject_(subject) { 652 #ifdef V8_INTERPRETED_REGEXP 653 bool interpreted = true; 654 #else 655 bool interpreted = false; 656 #endif // V8_INTERPRETED_REGEXP 657 658 if (regexp_->TypeTag() == JSRegExp::ATOM) { 659 static const int kAtomRegistersPerMatch = 2; 660 registers_per_match_ = kAtomRegistersPerMatch; 661 // There is no distinction between interpreted and native for atom regexps. 662 interpreted = false; 663 } else { 664 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 665 if (registers_per_match_ < 0) { 666 num_matches_ = -1; // Signal exception. 667 return; 668 } 669 } 670 671 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal); 672 if (!interpreted) { 673 register_array_size_ = 674 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 675 max_matches_ = register_array_size_ / registers_per_match_; 676 } else { 677 // Global loop in interpreted regexp is not implemented. We choose 678 // the size of the offsets vector so that it can only store one match. 679 register_array_size_ = registers_per_match_; 680 max_matches_ = 1; 681 } 682 683 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 684 register_array_ = NewArray<int32_t>(register_array_size_); 685 } else { 686 register_array_ = isolate->jsregexp_static_offsets_vector(); 687 } 688 689 // Set state so that fetching the results the first time triggers a call 690 // to the compiled regexp. 691 current_match_index_ = max_matches_ - 1; 692 num_matches_ = max_matches_; 693 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture. 694 DCHECK_GE(register_array_size_, registers_per_match_); 695 int32_t* last_match = 696 ®ister_array_[current_match_index_ * registers_per_match_]; 697 last_match[0] = -1; 698 last_match[1] = 0; 699 } 700 701 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { 702 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 && 703 last_index + 1 < subject_->length() && 704 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && 705 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { 706 // Advance over the surrogate pair. 707 return last_index + 2; 708 } 709 return last_index + 1; 710 } 711 712 // ------------------------------------------------------------------- 713 // Implementation of the Irregexp regular expression engine. 714 // 715 // The Irregexp regular expression engine is intended to be a complete 716 // implementation of ECMAScript regular expressions. It generates either 717 // bytecodes or native code. 718 719 // The Irregexp regexp engine is structured in three steps. 720 // 1) The parser generates an abstract syntax tree. See ast.cc. 721 // 2) From the AST a node network is created. The nodes are all 722 // subclasses of RegExpNode. The nodes represent states when 723 // executing a regular expression. Several optimizations are 724 // performed on the node network. 725 // 3) From the nodes we generate either byte codes or native code 726 // that can actually execute the regular expression (perform 727 // the search). The code generation step is described in more 728 // detail below. 729 730 // Code generation. 731 // 732 // The nodes are divided into four main categories. 733 // * Choice nodes 734 // These represent places where the regular expression can 735 // match in more than one way. For example on entry to an 736 // alternation (foo|bar) or a repetition (*, +, ? or {}). 737 // * Action nodes 738 // These represent places where some action should be 739 // performed. Examples include recording the current position 740 // in the input string to a register (in order to implement 741 // captures) or other actions on register for example in order 742 // to implement the counters needed for {} repetitions. 743 // * Matching nodes 744 // These attempt to match some element part of the input string. 745 // Examples of elements include character classes, plain strings 746 // or back references. 747 // * End nodes 748 // These are used to implement the actions required on finding 749 // a successful match or failing to find a match. 750 // 751 // The code generated (whether as byte codes or native code) maintains 752 // some state as it runs. This consists of the following elements: 753 // 754 // * The capture registers. Used for string captures. 755 // * Other registers. Used for counters etc. 756 // * The current position. 757 // * The stack of backtracking information. Used when a matching node 758 // fails to find a match and needs to try an alternative. 759 // 760 // Conceptual regular expression execution model: 761 // 762 // There is a simple conceptual model of regular expression execution 763 // which will be presented first. The actual code generated is a more 764 // efficient simulation of the simple conceptual model: 765 // 766 // * Choice nodes are implemented as follows: 767 // For each choice except the last { 768 // push current position 769 // push backtrack code location 770 // <generate code to test for choice> 771 // backtrack code location: 772 // pop current position 773 // } 774 // <generate code to test for last choice> 775 // 776 // * Actions nodes are generated as follows 777 // <push affected registers on backtrack stack> 778 // <generate code to perform action> 779 // push backtrack code location 780 // <generate code to test for following nodes> 781 // backtrack code location: 782 // <pop affected registers to restore their state> 783 // <pop backtrack location from stack and go to it> 784 // 785 // * Matching nodes are generated as follows: 786 // if input string matches at current position 787 // update current position 788 // <generate code to test for following nodes> 789 // else 790 // <pop backtrack location from stack and go to it> 791 // 792 // Thus it can be seen that the current position is saved and restored 793 // by the choice nodes, whereas the registers are saved and restored by 794 // by the action nodes that manipulate them. 795 // 796 // The other interesting aspect of this model is that nodes are generated 797 // at the point where they are needed by a recursive call to Emit(). If 798 // the node has already been code generated then the Emit() call will 799 // generate a jump to the previously generated code instead. In order to 800 // limit recursion it is possible for the Emit() function to put the node 801 // on a work list for later generation and instead generate a jump. The 802 // destination of the jump is resolved later when the code is generated. 803 // 804 // Actual regular expression code generation. 805 // 806 // Code generation is actually more complicated than the above. In order 807 // to improve the efficiency of the generated code some optimizations are 808 // performed 809 // 810 // * Choice nodes have 1-character lookahead. 811 // A choice node looks at the following character and eliminates some of 812 // the choices immediately based on that character. This is not yet 813 // implemented. 814 // * Simple greedy loops store reduced backtracking information. 815 // A quantifier like /.*foo/m will greedily match the whole input. It will 816 // then need to backtrack to a point where it can match "foo". The naive 817 // implementation of this would push each character position onto the 818 // backtracking stack, then pop them off one by one. This would use space 819 // proportional to the length of the input string. However since the "." 820 // can only match in one way and always has a constant length (in this case 821 // of 1) it suffices to store the current position on the top of the stack 822 // once. Matching now becomes merely incrementing the current position and 823 // backtracking becomes decrementing the current position and checking the 824 // result against the stored current position. This is faster and saves 825 // space. 826 // * The current state is virtualized. 827 // This is used to defer expensive operations until it is clear that they 828 // are needed and to generate code for a node more than once, allowing 829 // specialized an efficient versions of the code to be created. This is 830 // explained in the section below. 831 // 832 // Execution state virtualization. 833 // 834 // Instead of emitting code, nodes that manipulate the state can record their 835 // manipulation in an object called the Trace. The Trace object can record a 836 // current position offset, an optional backtrack code location on the top of 837 // the virtualized backtrack stack and some register changes. When a node is 838 // to be emitted it can flush the Trace or update it. Flushing the Trace 839 // will emit code to bring the actual state into line with the virtual state. 840 // Avoiding flushing the state can postpone some work (e.g. updates of capture 841 // registers). Postponing work can save time when executing the regular 842 // expression since it may be found that the work never has to be done as a 843 // failure to match can occur. In addition it is much faster to jump to a 844 // known backtrack code location than it is to pop an unknown backtrack 845 // location from the stack and jump there. 846 // 847 // The virtual state found in the Trace affects code generation. For example 848 // the virtual state contains the difference between the actual current 849 // position and the virtual current position, and matching code needs to use 850 // this offset to attempt a match in the correct location of the input 851 // string. Therefore code generated for a non-trivial trace is specialized 852 // to that trace. The code generator therefore has the ability to generate 853 // code for each node several times. In order to limit the size of the 854 // generated code there is an arbitrary limit on how many specialized sets of 855 // code may be generated for a given node. If the limit is reached, the 856 // trace is flushed and a generic version of the code for a node is emitted. 857 // This is subsequently used for that node. The code emitted for non-generic 858 // trace is not recorded in the node and so it cannot currently be reused in 859 // the event that code generation is requested for an identical trace. 860 861 862 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 863 UNREACHABLE(); 864 } 865 866 867 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 868 text->AddElement(TextElement::Atom(this), zone); 869 } 870 871 872 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 873 text->AddElement(TextElement::CharClass(this), zone); 874 } 875 876 877 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 878 for (int i = 0; i < elements()->length(); i++) 879 text->AddElement(elements()->at(i), zone); 880 } 881 882 883 TextElement TextElement::Atom(RegExpAtom* atom) { 884 return TextElement(ATOM, atom); 885 } 886 887 888 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 889 return TextElement(CHAR_CLASS, char_class); 890 } 891 892 893 int TextElement::length() const { 894 switch (text_type()) { 895 case ATOM: 896 return atom()->length(); 897 898 case CHAR_CLASS: 899 return 1; 900 } 901 UNREACHABLE(); 902 return 0; 903 } 904 905 906 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 907 if (table_ == NULL) { 908 table_ = new(zone()) DispatchTable(zone()); 909 DispatchTableConstructor cons(table_, ignore_case, zone()); 910 cons.BuildTable(this); 911 } 912 return table_; 913 } 914 915 916 class FrequencyCollator { 917 public: 918 FrequencyCollator() : total_samples_(0) { 919 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 920 frequencies_[i] = CharacterFrequency(i); 921 } 922 } 923 924 void CountCharacter(int character) { 925 int index = (character & RegExpMacroAssembler::kTableMask); 926 frequencies_[index].Increment(); 927 total_samples_++; 928 } 929 930 // Does not measure in percent, but rather per-128 (the table size from the 931 // regexp macro assembler). 932 int Frequency(int in_character) { 933 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 934 if (total_samples_ < 1) return 1; // Division by zero. 935 int freq_in_per128 = 936 (frequencies_[in_character].counter() * 128) / total_samples_; 937 return freq_in_per128; 938 } 939 940 private: 941 class CharacterFrequency { 942 public: 943 CharacterFrequency() : counter_(0), character_(-1) { } 944 explicit CharacterFrequency(int character) 945 : counter_(0), character_(character) { } 946 947 void Increment() { counter_++; } 948 int counter() { return counter_; } 949 int character() { return character_; } 950 951 private: 952 int counter_; 953 int character_; 954 }; 955 956 957 private: 958 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 959 int total_samples_; 960 }; 961 962 963 class RegExpCompiler { 964 public: 965 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 966 JSRegExp::Flags flags, bool is_one_byte); 967 968 int AllocateRegister() { 969 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 970 reg_exp_too_big_ = true; 971 return next_register_; 972 } 973 return next_register_++; 974 } 975 976 // Lookarounds to match lone surrogates for unicode character class matches 977 // are never nested. We can therefore reuse registers. 978 int UnicodeLookaroundStackRegister() { 979 if (unicode_lookaround_stack_register_ == kNoRegister) { 980 unicode_lookaround_stack_register_ = AllocateRegister(); 981 } 982 return unicode_lookaround_stack_register_; 983 } 984 985 int UnicodeLookaroundPositionRegister() { 986 if (unicode_lookaround_position_register_ == kNoRegister) { 987 unicode_lookaround_position_register_ = AllocateRegister(); 988 } 989 return unicode_lookaround_position_register_; 990 } 991 992 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 993 RegExpNode* start, 994 int capture_count, 995 Handle<String> pattern); 996 997 inline void AddWork(RegExpNode* node) { 998 if (!node->on_work_list() && !node->label()->is_bound()) { 999 node->set_on_work_list(true); 1000 work_list_->Add(node); 1001 } 1002 } 1003 1004 static const int kImplementationOffset = 0; 1005 static const int kNumberOfRegistersOffset = 0; 1006 static const int kCodeOffset = 1; 1007 1008 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1009 EndNode* accept() { return accept_; } 1010 1011 static const int kMaxRecursion = 100; 1012 inline int recursion_depth() { return recursion_depth_; } 1013 inline void IncrementRecursionDepth() { recursion_depth_++; } 1014 inline void DecrementRecursionDepth() { recursion_depth_--; } 1015 1016 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1017 1018 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } 1019 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } 1020 inline bool one_byte() { return one_byte_; } 1021 inline bool optimize() { return optimize_; } 1022 inline void set_optimize(bool value) { optimize_ = value; } 1023 inline bool limiting_recursion() { return limiting_recursion_; } 1024 inline void set_limiting_recursion(bool value) { 1025 limiting_recursion_ = value; 1026 } 1027 bool read_backward() { return read_backward_; } 1028 void set_read_backward(bool value) { read_backward_ = value; } 1029 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1030 1031 int current_expansion_factor() { return current_expansion_factor_; } 1032 void set_current_expansion_factor(int value) { 1033 current_expansion_factor_ = value; 1034 } 1035 1036 Isolate* isolate() const { return isolate_; } 1037 Zone* zone() const { return zone_; } 1038 1039 static const int kNoRegister = -1; 1040 1041 private: 1042 EndNode* accept_; 1043 int next_register_; 1044 int unicode_lookaround_stack_register_; 1045 int unicode_lookaround_position_register_; 1046 List<RegExpNode*>* work_list_; 1047 int recursion_depth_; 1048 RegExpMacroAssembler* macro_assembler_; 1049 JSRegExp::Flags flags_; 1050 bool one_byte_; 1051 bool reg_exp_too_big_; 1052 bool limiting_recursion_; 1053 bool optimize_; 1054 bool read_backward_; 1055 int current_expansion_factor_; 1056 FrequencyCollator frequency_collator_; 1057 Isolate* isolate_; 1058 Zone* zone_; 1059 }; 1060 1061 1062 class RecursionCheck { 1063 public: 1064 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1065 compiler->IncrementRecursionDepth(); 1066 } 1067 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1068 private: 1069 RegExpCompiler* compiler_; 1070 }; 1071 1072 1073 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1074 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1075 } 1076 1077 1078 // Attempts to compile the regexp using an Irregexp code generator. Returns 1079 // a fixed array or a null handle depending on whether it succeeded. 1080 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 1081 JSRegExp::Flags flags, bool one_byte) 1082 : next_register_(2 * (capture_count + 1)), 1083 unicode_lookaround_stack_register_(kNoRegister), 1084 unicode_lookaround_position_register_(kNoRegister), 1085 work_list_(NULL), 1086 recursion_depth_(0), 1087 flags_(flags), 1088 one_byte_(one_byte), 1089 reg_exp_too_big_(false), 1090 limiting_recursion_(false), 1091 optimize_(FLAG_regexp_optimization), 1092 read_backward_(false), 1093 current_expansion_factor_(1), 1094 frequency_collator_(), 1095 isolate_(isolate), 1096 zone_(zone) { 1097 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1098 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1099 } 1100 1101 1102 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1103 RegExpMacroAssembler* macro_assembler, 1104 RegExpNode* start, 1105 int capture_count, 1106 Handle<String> pattern) { 1107 Heap* heap = pattern->GetHeap(); 1108 1109 #ifdef DEBUG 1110 if (FLAG_trace_regexp_assembler) 1111 macro_assembler_ = 1112 new RegExpMacroAssemblerTracer(isolate(), macro_assembler); 1113 else 1114 #endif 1115 macro_assembler_ = macro_assembler; 1116 1117 List <RegExpNode*> work_list(0); 1118 work_list_ = &work_list; 1119 Label fail; 1120 macro_assembler_->PushBacktrack(&fail); 1121 Trace new_trace; 1122 start->Emit(this, &new_trace); 1123 macro_assembler_->Bind(&fail); 1124 macro_assembler_->Fail(); 1125 while (!work_list.is_empty()) { 1126 RegExpNode* node = work_list.RemoveLast(); 1127 node->set_on_work_list(false); 1128 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 1129 } 1130 if (reg_exp_too_big_) { 1131 macro_assembler_->AbortedCodeGeneration(); 1132 return IrregexpRegExpTooBig(isolate_); 1133 } 1134 1135 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1136 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1137 work_list_ = NULL; 1138 #ifdef ENABLE_DISASSEMBLER 1139 if (FLAG_print_code) { 1140 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer()); 1141 OFStream os(trace_scope.file()); 1142 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os); 1143 } 1144 #endif 1145 #ifdef DEBUG 1146 if (FLAG_trace_regexp_assembler) { 1147 delete macro_assembler_; 1148 } 1149 #endif 1150 return RegExpEngine::CompilationResult(*code, next_register_); 1151 } 1152 1153 1154 bool Trace::DeferredAction::Mentions(int that) { 1155 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1156 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1157 return range.Contains(that); 1158 } else { 1159 return reg() == that; 1160 } 1161 } 1162 1163 1164 bool Trace::mentions_reg(int reg) { 1165 for (DeferredAction* action = actions_; 1166 action != NULL; 1167 action = action->next()) { 1168 if (action->Mentions(reg)) 1169 return true; 1170 } 1171 return false; 1172 } 1173 1174 1175 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1176 DCHECK_EQ(0, *cp_offset); 1177 for (DeferredAction* action = actions_; 1178 action != NULL; 1179 action = action->next()) { 1180 if (action->Mentions(reg)) { 1181 if (action->action_type() == ActionNode::STORE_POSITION) { 1182 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1183 return true; 1184 } else { 1185 return false; 1186 } 1187 } 1188 } 1189 return false; 1190 } 1191 1192 1193 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1194 Zone* zone) { 1195 int max_register = RegExpCompiler::kNoRegister; 1196 for (DeferredAction* action = actions_; 1197 action != NULL; 1198 action = action->next()) { 1199 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1200 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1201 for (int i = range.from(); i <= range.to(); i++) 1202 affected_registers->Set(i, zone); 1203 if (range.to() > max_register) max_register = range.to(); 1204 } else { 1205 affected_registers->Set(action->reg(), zone); 1206 if (action->reg() > max_register) max_register = action->reg(); 1207 } 1208 } 1209 return max_register; 1210 } 1211 1212 1213 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1214 int max_register, 1215 const OutSet& registers_to_pop, 1216 const OutSet& registers_to_clear) { 1217 for (int reg = max_register; reg >= 0; reg--) { 1218 if (registers_to_pop.Get(reg)) { 1219 assembler->PopRegister(reg); 1220 } else if (registers_to_clear.Get(reg)) { 1221 int clear_to = reg; 1222 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1223 reg--; 1224 } 1225 assembler->ClearRegisters(reg, clear_to); 1226 } 1227 } 1228 } 1229 1230 1231 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1232 int max_register, 1233 const OutSet& affected_registers, 1234 OutSet* registers_to_pop, 1235 OutSet* registers_to_clear, 1236 Zone* zone) { 1237 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1238 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1239 1240 // Count pushes performed to force a stack limit check occasionally. 1241 int pushes = 0; 1242 1243 for (int reg = 0; reg <= max_register; reg++) { 1244 if (!affected_registers.Get(reg)) { 1245 continue; 1246 } 1247 1248 // The chronologically first deferred action in the trace 1249 // is used to infer the action needed to restore a register 1250 // to its previous state (or not, if it's safe to ignore it). 1251 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1252 DeferredActionUndoType undo_action = IGNORE; 1253 1254 int value = 0; 1255 bool absolute = false; 1256 bool clear = false; 1257 static const int kNoStore = kMinInt; 1258 int store_position = kNoStore; 1259 // This is a little tricky because we are scanning the actions in reverse 1260 // historical order (newest first). 1261 for (DeferredAction* action = actions_; 1262 action != NULL; 1263 action = action->next()) { 1264 if (action->Mentions(reg)) { 1265 switch (action->action_type()) { 1266 case ActionNode::SET_REGISTER: { 1267 Trace::DeferredSetRegister* psr = 1268 static_cast<Trace::DeferredSetRegister*>(action); 1269 if (!absolute) { 1270 value += psr->value(); 1271 absolute = true; 1272 } 1273 // SET_REGISTER is currently only used for newly introduced loop 1274 // counters. They can have a significant previous value if they 1275 // occour in a loop. TODO(lrn): Propagate this information, so 1276 // we can set undo_action to IGNORE if we know there is no value to 1277 // restore. 1278 undo_action = RESTORE; 1279 DCHECK_EQ(store_position, kNoStore); 1280 DCHECK(!clear); 1281 break; 1282 } 1283 case ActionNode::INCREMENT_REGISTER: 1284 if (!absolute) { 1285 value++; 1286 } 1287 DCHECK_EQ(store_position, kNoStore); 1288 DCHECK(!clear); 1289 undo_action = RESTORE; 1290 break; 1291 case ActionNode::STORE_POSITION: { 1292 Trace::DeferredCapture* pc = 1293 static_cast<Trace::DeferredCapture*>(action); 1294 if (!clear && store_position == kNoStore) { 1295 store_position = pc->cp_offset(); 1296 } 1297 1298 // For captures we know that stores and clears alternate. 1299 // Other register, are never cleared, and if the occur 1300 // inside a loop, they might be assigned more than once. 1301 if (reg <= 1) { 1302 // Registers zero and one, aka "capture zero", is 1303 // always set correctly if we succeed. There is no 1304 // need to undo a setting on backtrack, because we 1305 // will set it again or fail. 1306 undo_action = IGNORE; 1307 } else { 1308 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1309 } 1310 DCHECK(!absolute); 1311 DCHECK_EQ(value, 0); 1312 break; 1313 } 1314 case ActionNode::CLEAR_CAPTURES: { 1315 // Since we're scanning in reverse order, if we've already 1316 // set the position we have to ignore historically earlier 1317 // clearing operations. 1318 if (store_position == kNoStore) { 1319 clear = true; 1320 } 1321 undo_action = RESTORE; 1322 DCHECK(!absolute); 1323 DCHECK_EQ(value, 0); 1324 break; 1325 } 1326 default: 1327 UNREACHABLE(); 1328 break; 1329 } 1330 } 1331 } 1332 // Prepare for the undo-action (e.g., push if it's going to be popped). 1333 if (undo_action == RESTORE) { 1334 pushes++; 1335 RegExpMacroAssembler::StackCheckFlag stack_check = 1336 RegExpMacroAssembler::kNoStackLimitCheck; 1337 if (pushes == push_limit) { 1338 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1339 pushes = 0; 1340 } 1341 1342 assembler->PushRegister(reg, stack_check); 1343 registers_to_pop->Set(reg, zone); 1344 } else if (undo_action == CLEAR) { 1345 registers_to_clear->Set(reg, zone); 1346 } 1347 // Perform the chronologically last action (or accumulated increment) 1348 // for the register. 1349 if (store_position != kNoStore) { 1350 assembler->WriteCurrentPositionToRegister(reg, store_position); 1351 } else if (clear) { 1352 assembler->ClearRegisters(reg, reg); 1353 } else if (absolute) { 1354 assembler->SetRegister(reg, value); 1355 } else if (value != 0) { 1356 assembler->AdvanceRegister(reg, value); 1357 } 1358 } 1359 } 1360 1361 1362 // This is called as we come into a loop choice node and some other tricky 1363 // nodes. It normalizes the state of the code generator to ensure we can 1364 // generate generic code. 1365 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1366 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1367 1368 DCHECK(!is_trivial()); 1369 1370 if (actions_ == NULL && backtrack() == NULL) { 1371 // Here we just have some deferred cp advances to fix and we are back to 1372 // a normal situation. We may also have to forget some information gained 1373 // through a quick check that was already performed. 1374 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1375 // Create a new trivial state and generate the node with that. 1376 Trace new_state; 1377 successor->Emit(compiler, &new_state); 1378 return; 1379 } 1380 1381 // Generate deferred actions here along with code to undo them again. 1382 OutSet affected_registers; 1383 1384 if (backtrack() != NULL) { 1385 // Here we have a concrete backtrack location. These are set up by choice 1386 // nodes and so they indicate that we have a deferred save of the current 1387 // position which we may need to emit here. 1388 assembler->PushCurrentPosition(); 1389 } 1390 1391 int max_register = FindAffectedRegisters(&affected_registers, 1392 compiler->zone()); 1393 OutSet registers_to_pop; 1394 OutSet registers_to_clear; 1395 PerformDeferredActions(assembler, 1396 max_register, 1397 affected_registers, 1398 ®isters_to_pop, 1399 ®isters_to_clear, 1400 compiler->zone()); 1401 if (cp_offset_ != 0) { 1402 assembler->AdvanceCurrentPosition(cp_offset_); 1403 } 1404 1405 // Create a new trivial state and generate the node with that. 1406 Label undo; 1407 assembler->PushBacktrack(&undo); 1408 if (successor->KeepRecursing(compiler)) { 1409 Trace new_state; 1410 successor->Emit(compiler, &new_state); 1411 } else { 1412 compiler->AddWork(successor); 1413 assembler->GoTo(successor->label()); 1414 } 1415 1416 // On backtrack we need to restore state. 1417 assembler->Bind(&undo); 1418 RestoreAffectedRegisters(assembler, 1419 max_register, 1420 registers_to_pop, 1421 registers_to_clear); 1422 if (backtrack() == NULL) { 1423 assembler->Backtrack(); 1424 } else { 1425 assembler->PopCurrentPosition(); 1426 assembler->GoTo(backtrack()); 1427 } 1428 } 1429 1430 1431 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1432 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1433 1434 // Omit flushing the trace. We discard the entire stack frame anyway. 1435 1436 if (!label()->is_bound()) { 1437 // We are completely independent of the trace, since we ignore it, 1438 // so this code can be used as the generic version. 1439 assembler->Bind(label()); 1440 } 1441 1442 // Throw away everything on the backtrack stack since the start 1443 // of the negative submatch and restore the character position. 1444 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1445 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1446 if (clear_capture_count_ > 0) { 1447 // Clear any captures that might have been performed during the success 1448 // of the body of the negative look-ahead. 1449 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1450 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1451 } 1452 // Now that we have unwound the stack we find at the top of the stack the 1453 // backtrack that the BeginSubmatch node got. 1454 assembler->Backtrack(); 1455 } 1456 1457 1458 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1459 if (!trace->is_trivial()) { 1460 trace->Flush(compiler, this); 1461 return; 1462 } 1463 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1464 if (!label()->is_bound()) { 1465 assembler->Bind(label()); 1466 } 1467 switch (action_) { 1468 case ACCEPT: 1469 assembler->Succeed(); 1470 return; 1471 case BACKTRACK: 1472 assembler->GoTo(trace->backtrack()); 1473 return; 1474 case NEGATIVE_SUBMATCH_SUCCESS: 1475 // This case is handled in a different virtual method. 1476 UNREACHABLE(); 1477 } 1478 UNIMPLEMENTED(); 1479 } 1480 1481 1482 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1483 if (guards_ == NULL) 1484 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1485 guards_->Add(guard, zone); 1486 } 1487 1488 1489 ActionNode* ActionNode::SetRegister(int reg, 1490 int val, 1491 RegExpNode* on_success) { 1492 ActionNode* result = 1493 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1494 result->data_.u_store_register.reg = reg; 1495 result->data_.u_store_register.value = val; 1496 return result; 1497 } 1498 1499 1500 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1501 ActionNode* result = 1502 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1503 result->data_.u_increment_register.reg = reg; 1504 return result; 1505 } 1506 1507 1508 ActionNode* ActionNode::StorePosition(int reg, 1509 bool is_capture, 1510 RegExpNode* on_success) { 1511 ActionNode* result = 1512 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1513 result->data_.u_position_register.reg = reg; 1514 result->data_.u_position_register.is_capture = is_capture; 1515 return result; 1516 } 1517 1518 1519 ActionNode* ActionNode::ClearCaptures(Interval range, 1520 RegExpNode* on_success) { 1521 ActionNode* result = 1522 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1523 result->data_.u_clear_captures.range_from = range.from(); 1524 result->data_.u_clear_captures.range_to = range.to(); 1525 return result; 1526 } 1527 1528 1529 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1530 int position_reg, 1531 RegExpNode* on_success) { 1532 ActionNode* result = 1533 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1534 result->data_.u_submatch.stack_pointer_register = stack_reg; 1535 result->data_.u_submatch.current_position_register = position_reg; 1536 return result; 1537 } 1538 1539 1540 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1541 int position_reg, 1542 int clear_register_count, 1543 int clear_register_from, 1544 RegExpNode* on_success) { 1545 ActionNode* result = 1546 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1547 result->data_.u_submatch.stack_pointer_register = stack_reg; 1548 result->data_.u_submatch.current_position_register = position_reg; 1549 result->data_.u_submatch.clear_register_count = clear_register_count; 1550 result->data_.u_submatch.clear_register_from = clear_register_from; 1551 return result; 1552 } 1553 1554 1555 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1556 int repetition_register, 1557 int repetition_limit, 1558 RegExpNode* on_success) { 1559 ActionNode* result = 1560 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1561 result->data_.u_empty_match_check.start_register = start_register; 1562 result->data_.u_empty_match_check.repetition_register = repetition_register; 1563 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1564 return result; 1565 } 1566 1567 1568 #define DEFINE_ACCEPT(Type) \ 1569 void Type##Node::Accept(NodeVisitor* visitor) { \ 1570 visitor->Visit##Type(this); \ 1571 } 1572 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1573 #undef DEFINE_ACCEPT 1574 1575 1576 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1577 visitor->VisitLoopChoice(this); 1578 } 1579 1580 1581 // ------------------------------------------------------------------- 1582 // Emit code. 1583 1584 1585 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1586 Guard* guard, 1587 Trace* trace) { 1588 switch (guard->op()) { 1589 case Guard::LT: 1590 DCHECK(!trace->mentions_reg(guard->reg())); 1591 macro_assembler->IfRegisterGE(guard->reg(), 1592 guard->value(), 1593 trace->backtrack()); 1594 break; 1595 case Guard::GEQ: 1596 DCHECK(!trace->mentions_reg(guard->reg())); 1597 macro_assembler->IfRegisterLT(guard->reg(), 1598 guard->value(), 1599 trace->backtrack()); 1600 break; 1601 } 1602 } 1603 1604 1605 // Returns the number of characters in the equivalence class, omitting those 1606 // that cannot occur in the source string because it is Latin1. 1607 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1608 bool one_byte_subject, 1609 unibrow::uchar* letters) { 1610 int length = 1611 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1612 // Unibrow returns 0 or 1 for characters where case independence is 1613 // trivial. 1614 if (length == 0) { 1615 letters[0] = character; 1616 length = 1; 1617 } 1618 1619 if (one_byte_subject) { 1620 int new_length = 0; 1621 for (int i = 0; i < length; i++) { 1622 if (letters[i] <= String::kMaxOneByteCharCode) { 1623 letters[new_length++] = letters[i]; 1624 } 1625 } 1626 length = new_length; 1627 } 1628 1629 return length; 1630 } 1631 1632 1633 static inline bool EmitSimpleCharacter(Isolate* isolate, 1634 RegExpCompiler* compiler, 1635 uc16 c, 1636 Label* on_failure, 1637 int cp_offset, 1638 bool check, 1639 bool preloaded) { 1640 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1641 bool bound_checked = false; 1642 if (!preloaded) { 1643 assembler->LoadCurrentCharacter( 1644 cp_offset, 1645 on_failure, 1646 check); 1647 bound_checked = true; 1648 } 1649 assembler->CheckNotCharacter(c, on_failure); 1650 return bound_checked; 1651 } 1652 1653 1654 // Only emits non-letters (things that don't have case). Only used for case 1655 // independent matches. 1656 static inline bool EmitAtomNonLetter(Isolate* isolate, 1657 RegExpCompiler* compiler, 1658 uc16 c, 1659 Label* on_failure, 1660 int cp_offset, 1661 bool check, 1662 bool preloaded) { 1663 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1664 bool one_byte = compiler->one_byte(); 1665 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1666 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1667 if (length < 1) { 1668 // This can't match. Must be an one-byte subject and a non-one-byte 1669 // character. We do not need to do anything since the one-byte pass 1670 // already handled this. 1671 return false; // Bounds not checked. 1672 } 1673 bool checked = false; 1674 // We handle the length > 1 case in a later pass. 1675 if (length == 1) { 1676 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1677 // Can't match - see above. 1678 return false; // Bounds not checked. 1679 } 1680 if (!preloaded) { 1681 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1682 checked = check; 1683 } 1684 macro_assembler->CheckNotCharacter(c, on_failure); 1685 } 1686 return checked; 1687 } 1688 1689 1690 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1691 bool one_byte, uc16 c1, uc16 c2, 1692 Label* on_failure) { 1693 uc16 char_mask; 1694 if (one_byte) { 1695 char_mask = String::kMaxOneByteCharCode; 1696 } else { 1697 char_mask = String::kMaxUtf16CodeUnit; 1698 } 1699 uc16 exor = c1 ^ c2; 1700 // Check whether exor has only one bit set. 1701 if (((exor - 1) & exor) == 0) { 1702 // If c1 and c2 differ only by one bit. 1703 // Ecma262UnCanonicalize always gives the highest number last. 1704 DCHECK(c2 > c1); 1705 uc16 mask = char_mask ^ exor; 1706 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1707 return true; 1708 } 1709 DCHECK(c2 > c1); 1710 uc16 diff = c2 - c1; 1711 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1712 // If the characters differ by 2^n but don't differ by one bit then 1713 // subtract the difference from the found character, then do the or 1714 // trick. We avoid the theoretical case where negative numbers are 1715 // involved in order to simplify code generation. 1716 uc16 mask = char_mask ^ diff; 1717 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1718 diff, 1719 mask, 1720 on_failure); 1721 return true; 1722 } 1723 return false; 1724 } 1725 1726 1727 typedef bool EmitCharacterFunction(Isolate* isolate, 1728 RegExpCompiler* compiler, 1729 uc16 c, 1730 Label* on_failure, 1731 int cp_offset, 1732 bool check, 1733 bool preloaded); 1734 1735 // Only emits letters (things that have case). Only used for case independent 1736 // matches. 1737 static inline bool EmitAtomLetter(Isolate* isolate, 1738 RegExpCompiler* compiler, 1739 uc16 c, 1740 Label* on_failure, 1741 int cp_offset, 1742 bool check, 1743 bool preloaded) { 1744 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1745 bool one_byte = compiler->one_byte(); 1746 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1747 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1748 if (length <= 1) return false; 1749 // We may not need to check against the end of the input string 1750 // if this character lies before a character that matched. 1751 if (!preloaded) { 1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1753 } 1754 Label ok; 1755 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1756 switch (length) { 1757 case 2: { 1758 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1759 chars[1], on_failure)) { 1760 } else { 1761 macro_assembler->CheckCharacter(chars[0], &ok); 1762 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1763 macro_assembler->Bind(&ok); 1764 } 1765 break; 1766 } 1767 case 4: 1768 macro_assembler->CheckCharacter(chars[3], &ok); 1769 // Fall through! 1770 case 3: 1771 macro_assembler->CheckCharacter(chars[0], &ok); 1772 macro_assembler->CheckCharacter(chars[1], &ok); 1773 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1774 macro_assembler->Bind(&ok); 1775 break; 1776 default: 1777 UNREACHABLE(); 1778 break; 1779 } 1780 return true; 1781 } 1782 1783 1784 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1785 int border, 1786 Label* fall_through, 1787 Label* above_or_equal, 1788 Label* below) { 1789 if (below != fall_through) { 1790 masm->CheckCharacterLT(border, below); 1791 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1792 } else { 1793 masm->CheckCharacterGT(border - 1, above_or_equal); 1794 } 1795 } 1796 1797 1798 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1799 int first, 1800 int last, 1801 Label* fall_through, 1802 Label* in_range, 1803 Label* out_of_range) { 1804 if (in_range == fall_through) { 1805 if (first == last) { 1806 masm->CheckNotCharacter(first, out_of_range); 1807 } else { 1808 masm->CheckCharacterNotInRange(first, last, out_of_range); 1809 } 1810 } else { 1811 if (first == last) { 1812 masm->CheckCharacter(first, in_range); 1813 } else { 1814 masm->CheckCharacterInRange(first, last, in_range); 1815 } 1816 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1817 } 1818 } 1819 1820 1821 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1822 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1823 static void EmitUseLookupTable( 1824 RegExpMacroAssembler* masm, 1825 ZoneList<int>* ranges, 1826 int start_index, 1827 int end_index, 1828 int min_char, 1829 Label* fall_through, 1830 Label* even_label, 1831 Label* odd_label) { 1832 static const int kSize = RegExpMacroAssembler::kTableSize; 1833 static const int kMask = RegExpMacroAssembler::kTableMask; 1834 1835 int base = (min_char & ~kMask); 1836 USE(base); 1837 1838 // Assert that everything is on one kTableSize page. 1839 for (int i = start_index; i <= end_index; i++) { 1840 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1841 } 1842 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1843 1844 char templ[kSize]; 1845 Label* on_bit_set; 1846 Label* on_bit_clear; 1847 int bit; 1848 if (even_label == fall_through) { 1849 on_bit_set = odd_label; 1850 on_bit_clear = even_label; 1851 bit = 1; 1852 } else { 1853 on_bit_set = even_label; 1854 on_bit_clear = odd_label; 1855 bit = 0; 1856 } 1857 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1858 templ[i] = bit; 1859 } 1860 int j = 0; 1861 bit ^= 1; 1862 for (int i = start_index; i < end_index; i++) { 1863 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1864 templ[j] = bit; 1865 } 1866 bit ^= 1; 1867 } 1868 for (int i = j; i < kSize; i++) { 1869 templ[i] = bit; 1870 } 1871 Factory* factory = masm->isolate()->factory(); 1872 // TODO(erikcorry): Cache these. 1873 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1874 for (int i = 0; i < kSize; i++) { 1875 ba->set(i, templ[i]); 1876 } 1877 masm->CheckBitInTable(ba, on_bit_set); 1878 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1879 } 1880 1881 1882 static void CutOutRange(RegExpMacroAssembler* masm, 1883 ZoneList<int>* ranges, 1884 int start_index, 1885 int end_index, 1886 int cut_index, 1887 Label* even_label, 1888 Label* odd_label) { 1889 bool odd = (((cut_index - start_index) & 1) == 1); 1890 Label* in_range_label = odd ? odd_label : even_label; 1891 Label dummy; 1892 EmitDoubleBoundaryTest(masm, 1893 ranges->at(cut_index), 1894 ranges->at(cut_index + 1) - 1, 1895 &dummy, 1896 in_range_label, 1897 &dummy); 1898 DCHECK(!dummy.is_linked()); 1899 // Cut out the single range by rewriting the array. This creates a new 1900 // range that is a merger of the two ranges on either side of the one we 1901 // are cutting out. The oddity of the labels is preserved. 1902 for (int j = cut_index; j > start_index; j--) { 1903 ranges->at(j) = ranges->at(j - 1); 1904 } 1905 for (int j = cut_index + 1; j < end_index; j++) { 1906 ranges->at(j) = ranges->at(j + 1); 1907 } 1908 } 1909 1910 1911 // Unicode case. Split the search space into kSize spaces that are handled 1912 // with recursion. 1913 static void SplitSearchSpace(ZoneList<int>* ranges, 1914 int start_index, 1915 int end_index, 1916 int* new_start_index, 1917 int* new_end_index, 1918 int* border) { 1919 static const int kSize = RegExpMacroAssembler::kTableSize; 1920 static const int kMask = RegExpMacroAssembler::kTableMask; 1921 1922 int first = ranges->at(start_index); 1923 int last = ranges->at(end_index) - 1; 1924 1925 *new_start_index = start_index; 1926 *border = (ranges->at(start_index) & ~kMask) + kSize; 1927 while (*new_start_index < end_index) { 1928 if (ranges->at(*new_start_index) > *border) break; 1929 (*new_start_index)++; 1930 } 1931 // new_start_index is the index of the first edge that is beyond the 1932 // current kSize space. 1933 1934 // For very large search spaces we do a binary chop search of the non-Latin1 1935 // space instead of just going to the end of the current kSize space. The 1936 // heuristics are complicated a little by the fact that any 128-character 1937 // encoding space can be quickly tested with a table lookup, so we don't 1938 // wish to do binary chop search at a smaller granularity than that. A 1939 // 128-character space can take up a lot of space in the ranges array if, 1940 // for example, we only want to match every second character (eg. the lower 1941 // case characters on some Unicode pages). 1942 int binary_chop_index = (end_index + start_index) / 2; 1943 // The first test ensures that we get to the code that handles the Latin1 1944 // range with a single not-taken branch, speeding up this important 1945 // character range (even non-Latin1 charset-based text has spaces and 1946 // punctuation). 1947 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1948 end_index - start_index > (*new_start_index - start_index) * 2 && 1949 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1950 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1951 int scan_forward_for_section_border = binary_chop_index;; 1952 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1953 1954 while (scan_forward_for_section_border < end_index) { 1955 if (ranges->at(scan_forward_for_section_border) > new_border) { 1956 *new_start_index = scan_forward_for_section_border; 1957 *border = new_border; 1958 break; 1959 } 1960 scan_forward_for_section_border++; 1961 } 1962 } 1963 1964 DCHECK(*new_start_index > start_index); 1965 *new_end_index = *new_start_index - 1; 1966 if (ranges->at(*new_end_index) == *border) { 1967 (*new_end_index)--; 1968 } 1969 if (*border >= ranges->at(end_index)) { 1970 *border = ranges->at(end_index); 1971 *new_start_index = end_index; // Won't be used. 1972 *new_end_index = end_index - 1; 1973 } 1974 } 1975 1976 1977 // Gets a series of segment boundaries representing a character class. If the 1978 // character is in the range between an even and an odd boundary (counting from 1979 // start_index) then go to even_label, otherwise go to odd_label. We already 1980 // know that the character is in the range of min_char to max_char inclusive. 1981 // Either label can be NULL indicating backtracking. Either label can also be 1982 // equal to the fall_through label. 1983 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges, 1984 int start_index, int end_index, uc32 min_char, 1985 uc32 max_char, Label* fall_through, 1986 Label* even_label, Label* odd_label) { 1987 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); 1988 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); 1989 1990 int first = ranges->at(start_index); 1991 int last = ranges->at(end_index) - 1; 1992 1993 DCHECK_LT(min_char, first); 1994 1995 // Just need to test if the character is before or on-or-after 1996 // a particular character. 1997 if (start_index == end_index) { 1998 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1999 return; 2000 } 2001 2002 // Another almost trivial case: There is one interval in the middle that is 2003 // different from the end intervals. 2004 if (start_index + 1 == end_index) { 2005 EmitDoubleBoundaryTest( 2006 masm, first, last, fall_through, even_label, odd_label); 2007 return; 2008 } 2009 2010 // It's not worth using table lookup if there are very few intervals in the 2011 // character class. 2012 if (end_index - start_index <= 6) { 2013 // It is faster to test for individual characters, so we look for those 2014 // first, then try arbitrary ranges in the second round. 2015 static int kNoCutIndex = -1; 2016 int cut = kNoCutIndex; 2017 for (int i = start_index; i < end_index; i++) { 2018 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2019 cut = i; 2020 break; 2021 } 2022 } 2023 if (cut == kNoCutIndex) cut = start_index; 2024 CutOutRange( 2025 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2026 DCHECK_GE(end_index - start_index, 2); 2027 GenerateBranches(masm, 2028 ranges, 2029 start_index + 1, 2030 end_index - 1, 2031 min_char, 2032 max_char, 2033 fall_through, 2034 even_label, 2035 odd_label); 2036 return; 2037 } 2038 2039 // If there are a lot of intervals in the regexp, then we will use tables to 2040 // determine whether the character is inside or outside the character class. 2041 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2042 2043 if ((max_char >> kBits) == (min_char >> kBits)) { 2044 EmitUseLookupTable(masm, 2045 ranges, 2046 start_index, 2047 end_index, 2048 min_char, 2049 fall_through, 2050 even_label, 2051 odd_label); 2052 return; 2053 } 2054 2055 if ((min_char >> kBits) != (first >> kBits)) { 2056 masm->CheckCharacterLT(first, odd_label); 2057 GenerateBranches(masm, 2058 ranges, 2059 start_index + 1, 2060 end_index, 2061 first, 2062 max_char, 2063 fall_through, 2064 odd_label, 2065 even_label); 2066 return; 2067 } 2068 2069 int new_start_index = 0; 2070 int new_end_index = 0; 2071 int border = 0; 2072 2073 SplitSearchSpace(ranges, 2074 start_index, 2075 end_index, 2076 &new_start_index, 2077 &new_end_index, 2078 &border); 2079 2080 Label handle_rest; 2081 Label* above = &handle_rest; 2082 if (border == last + 1) { 2083 // We didn't find any section that started after the limit, so everything 2084 // above the border is one of the terminal labels. 2085 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2086 DCHECK(new_end_index == end_index - 1); 2087 } 2088 2089 DCHECK_LE(start_index, new_end_index); 2090 DCHECK_LE(new_start_index, end_index); 2091 DCHECK_LT(start_index, new_start_index); 2092 DCHECK_LT(new_end_index, end_index); 2093 DCHECK(new_end_index + 1 == new_start_index || 2094 (new_end_index + 2 == new_start_index && 2095 border == ranges->at(new_end_index + 1))); 2096 DCHECK_LT(min_char, border - 1); 2097 DCHECK_LT(border, max_char); 2098 DCHECK_LT(ranges->at(new_end_index), border); 2099 DCHECK(border < ranges->at(new_start_index) || 2100 (border == ranges->at(new_start_index) && 2101 new_start_index == end_index && 2102 new_end_index == end_index - 1 && 2103 border == last + 1)); 2104 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2105 2106 masm->CheckCharacterGT(border - 1, above); 2107 Label dummy; 2108 GenerateBranches(masm, 2109 ranges, 2110 start_index, 2111 new_end_index, 2112 min_char, 2113 border - 1, 2114 &dummy, 2115 even_label, 2116 odd_label); 2117 if (handle_rest.is_linked()) { 2118 masm->Bind(&handle_rest); 2119 bool flip = (new_start_index & 1) != (start_index & 1); 2120 GenerateBranches(masm, 2121 ranges, 2122 new_start_index, 2123 end_index, 2124 border, 2125 max_char, 2126 &dummy, 2127 flip ? odd_label : even_label, 2128 flip ? even_label : odd_label); 2129 } 2130 } 2131 2132 2133 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2134 RegExpCharacterClass* cc, bool one_byte, 2135 Label* on_failure, int cp_offset, bool check_offset, 2136 bool preloaded, Zone* zone) { 2137 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2138 CharacterRange::Canonicalize(ranges); 2139 2140 int max_char; 2141 if (one_byte) { 2142 max_char = String::kMaxOneByteCharCode; 2143 } else { 2144 max_char = String::kMaxUtf16CodeUnit; 2145 } 2146 2147 int range_count = ranges->length(); 2148 2149 int last_valid_range = range_count - 1; 2150 while (last_valid_range >= 0) { 2151 CharacterRange& range = ranges->at(last_valid_range); 2152 if (range.from() <= max_char) { 2153 break; 2154 } 2155 last_valid_range--; 2156 } 2157 2158 if (last_valid_range < 0) { 2159 if (!cc->is_negated()) { 2160 macro_assembler->GoTo(on_failure); 2161 } 2162 if (check_offset) { 2163 macro_assembler->CheckPosition(cp_offset, on_failure); 2164 } 2165 return; 2166 } 2167 2168 if (last_valid_range == 0 && 2169 ranges->at(0).IsEverything(max_char)) { 2170 if (cc->is_negated()) { 2171 macro_assembler->GoTo(on_failure); 2172 } else { 2173 // This is a common case hit by non-anchored expressions. 2174 if (check_offset) { 2175 macro_assembler->CheckPosition(cp_offset, on_failure); 2176 } 2177 } 2178 return; 2179 } 2180 2181 if (!preloaded) { 2182 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2183 } 2184 2185 if (cc->is_standard(zone) && 2186 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2187 on_failure)) { 2188 return; 2189 } 2190 2191 2192 // A new list with ascending entries. Each entry is a code unit 2193 // where there is a boundary between code units that are part of 2194 // the class and code units that are not. Normally we insert an 2195 // entry at zero which goes to the failure label, but if there 2196 // was already one there we fall through for success on that entry. 2197 // Subsequent entries have alternating meaning (success/failure). 2198 ZoneList<int>* range_boundaries = 2199 new(zone) ZoneList<int>(last_valid_range, zone); 2200 2201 bool zeroth_entry_is_failure = !cc->is_negated(); 2202 2203 for (int i = 0; i <= last_valid_range; i++) { 2204 CharacterRange& range = ranges->at(i); 2205 if (range.from() == 0) { 2206 DCHECK_EQ(i, 0); 2207 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2208 } else { 2209 range_boundaries->Add(range.from(), zone); 2210 } 2211 range_boundaries->Add(range.to() + 1, zone); 2212 } 2213 int end_index = range_boundaries->length() - 1; 2214 if (range_boundaries->at(end_index) > max_char) { 2215 end_index--; 2216 } 2217 2218 Label fall_through; 2219 GenerateBranches(macro_assembler, 2220 range_boundaries, 2221 0, // start_index. 2222 end_index, 2223 0, // min_char. 2224 max_char, 2225 &fall_through, 2226 zeroth_entry_is_failure ? &fall_through : on_failure, 2227 zeroth_entry_is_failure ? on_failure : &fall_through); 2228 macro_assembler->Bind(&fall_through); 2229 } 2230 2231 2232 RegExpNode::~RegExpNode() { 2233 } 2234 2235 2236 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2237 Trace* trace) { 2238 // If we are generating a greedy loop then don't stop and don't reuse code. 2239 if (trace->stop_node() != NULL) { 2240 return CONTINUE; 2241 } 2242 2243 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2244 if (trace->is_trivial()) { 2245 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 2246 // If a generic version is already scheduled to be generated or we have 2247 // recursed too deeply then just generate a jump to that code. 2248 macro_assembler->GoTo(&label_); 2249 // This will queue it up for generation of a generic version if it hasn't 2250 // already been queued. 2251 compiler->AddWork(this); 2252 return DONE; 2253 } 2254 // Generate generic version of the node and bind the label for later use. 2255 macro_assembler->Bind(&label_); 2256 return CONTINUE; 2257 } 2258 2259 // We are being asked to make a non-generic version. Keep track of how many 2260 // non-generic versions we generate so as not to overdo it. 2261 trace_count_++; 2262 if (KeepRecursing(compiler) && compiler->optimize() && 2263 trace_count_ < kMaxCopiesCodeGenerated) { 2264 return CONTINUE; 2265 } 2266 2267 // If we get here code has been generated for this node too many times or 2268 // recursion is too deep. Time to switch to a generic version. The code for 2269 // generic versions above can handle deep recursion properly. 2270 bool was_limiting = compiler->limiting_recursion(); 2271 compiler->set_limiting_recursion(true); 2272 trace->Flush(compiler, this); 2273 compiler->set_limiting_recursion(was_limiting); 2274 return DONE; 2275 } 2276 2277 2278 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 2279 return !compiler->limiting_recursion() && 2280 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 2281 } 2282 2283 2284 int ActionNode::EatsAtLeast(int still_to_find, 2285 int budget, 2286 bool not_at_start) { 2287 if (budget <= 0) return 0; 2288 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2289 return on_success()->EatsAtLeast(still_to_find, 2290 budget - 1, 2291 not_at_start); 2292 } 2293 2294 2295 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2296 BoyerMooreLookahead* bm, bool not_at_start) { 2297 if (action_type_ == BEGIN_SUBMATCH) { 2298 bm->SetRest(offset); 2299 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2300 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2301 } 2302 SaveBMInfo(bm, not_at_start, offset); 2303 } 2304 2305 2306 int AssertionNode::EatsAtLeast(int still_to_find, 2307 int budget, 2308 bool not_at_start) { 2309 if (budget <= 0) return 0; 2310 // If we know we are not at the start and we are asked "how many characters 2311 // will you match if you succeed?" then we can answer anything since false 2312 // implies false. So lets just return the max answer (still_to_find) since 2313 // that won't prevent us from preloading a lot of characters for the other 2314 // branches in the node graph. 2315 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2316 return on_success()->EatsAtLeast(still_to_find, 2317 budget - 1, 2318 not_at_start); 2319 } 2320 2321 2322 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2323 BoyerMooreLookahead* bm, bool not_at_start) { 2324 // Match the behaviour of EatsAtLeast on this node. 2325 if (assertion_type() == AT_START && not_at_start) return; 2326 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2327 SaveBMInfo(bm, not_at_start, offset); 2328 } 2329 2330 2331 int BackReferenceNode::EatsAtLeast(int still_to_find, 2332 int budget, 2333 bool not_at_start) { 2334 if (read_backward()) return 0; 2335 if (budget <= 0) return 0; 2336 return on_success()->EatsAtLeast(still_to_find, 2337 budget - 1, 2338 not_at_start); 2339 } 2340 2341 2342 int TextNode::EatsAtLeast(int still_to_find, 2343 int budget, 2344 bool not_at_start) { 2345 if (read_backward()) return 0; 2346 int answer = Length(); 2347 if (answer >= still_to_find) return answer; 2348 if (budget <= 0) return answer; 2349 // We are not at start after this node so we set the last argument to 'true'. 2350 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2351 budget - 1, 2352 true); 2353 } 2354 2355 2356 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget, 2357 bool not_at_start) { 2358 if (budget <= 0) return 0; 2359 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2360 // afterwards. 2361 RegExpNode* node = alternatives_->at(1).node(); 2362 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2363 } 2364 2365 2366 void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 2367 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 2368 bool not_at_start) { 2369 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2370 // afterwards. 2371 RegExpNode* node = alternatives_->at(1).node(); 2372 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2373 } 2374 2375 2376 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2377 int budget, 2378 RegExpNode* ignore_this_node, 2379 bool not_at_start) { 2380 if (budget <= 0) return 0; 2381 int min = 100; 2382 int choice_count = alternatives_->length(); 2383 budget = (budget - 1) / choice_count; 2384 for (int i = 0; i < choice_count; i++) { 2385 RegExpNode* node = alternatives_->at(i).node(); 2386 if (node == ignore_this_node) continue; 2387 int node_eats_at_least = 2388 node->EatsAtLeast(still_to_find, budget, not_at_start); 2389 if (node_eats_at_least < min) min = node_eats_at_least; 2390 if (min == 0) return 0; 2391 } 2392 return min; 2393 } 2394 2395 2396 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2397 int budget, 2398 bool not_at_start) { 2399 return EatsAtLeastHelper(still_to_find, 2400 budget - 1, 2401 loop_node_, 2402 not_at_start); 2403 } 2404 2405 2406 int ChoiceNode::EatsAtLeast(int still_to_find, 2407 int budget, 2408 bool not_at_start) { 2409 return EatsAtLeastHelper(still_to_find, 2410 budget, 2411 NULL, 2412 not_at_start); 2413 } 2414 2415 2416 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2417 static inline uint32_t SmearBitsRight(uint32_t v) { 2418 v |= v >> 1; 2419 v |= v >> 2; 2420 v |= v >> 4; 2421 v |= v >> 8; 2422 v |= v >> 16; 2423 return v; 2424 } 2425 2426 2427 bool QuickCheckDetails::Rationalize(bool asc) { 2428 bool found_useful_op = false; 2429 uint32_t char_mask; 2430 if (asc) { 2431 char_mask = String::kMaxOneByteCharCode; 2432 } else { 2433 char_mask = String::kMaxUtf16CodeUnit; 2434 } 2435 mask_ = 0; 2436 value_ = 0; 2437 int char_shift = 0; 2438 for (int i = 0; i < characters_; i++) { 2439 Position* pos = &positions_[i]; 2440 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2441 found_useful_op = true; 2442 } 2443 mask_ |= (pos->mask & char_mask) << char_shift; 2444 value_ |= (pos->value & char_mask) << char_shift; 2445 char_shift += asc ? 8 : 16; 2446 } 2447 return found_useful_op; 2448 } 2449 2450 2451 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2452 Trace* bounds_check_trace, 2453 Trace* trace, 2454 bool preload_has_checked_bounds, 2455 Label* on_possible_success, 2456 QuickCheckDetails* details, 2457 bool fall_through_on_failure) { 2458 if (details->characters() == 0) return false; 2459 GetQuickCheckDetails( 2460 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2461 if (details->cannot_match()) return false; 2462 if (!details->Rationalize(compiler->one_byte())) return false; 2463 DCHECK(details->characters() == 1 || 2464 compiler->macro_assembler()->CanReadUnaligned()); 2465 uint32_t mask = details->mask(); 2466 uint32_t value = details->value(); 2467 2468 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2469 2470 if (trace->characters_preloaded() != details->characters()) { 2471 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 2472 // We are attempting to preload the minimum number of characters 2473 // any choice would eat, so if the bounds check fails, then none of the 2474 // choices can succeed, so we can just immediately backtrack, rather 2475 // than go to the next choice. 2476 assembler->LoadCurrentCharacter(trace->cp_offset(), 2477 bounds_check_trace->backtrack(), 2478 !preload_has_checked_bounds, 2479 details->characters()); 2480 } 2481 2482 2483 bool need_mask = true; 2484 2485 if (details->characters() == 1) { 2486 // If number of characters preloaded is 1 then we used a byte or 16 bit 2487 // load so the value is already masked down. 2488 uint32_t char_mask; 2489 if (compiler->one_byte()) { 2490 char_mask = String::kMaxOneByteCharCode; 2491 } else { 2492 char_mask = String::kMaxUtf16CodeUnit; 2493 } 2494 if ((mask & char_mask) == char_mask) need_mask = false; 2495 mask &= char_mask; 2496 } else { 2497 // For 2-character preloads in one-byte mode or 1-character preloads in 2498 // two-byte mode we also use a 16 bit load with zero extend. 2499 static const uint32_t kTwoByteMask = 0xffff; 2500 static const uint32_t kFourByteMask = 0xffffffff; 2501 if (details->characters() == 2 && compiler->one_byte()) { 2502 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2503 } else if (details->characters() == 1 && !compiler->one_byte()) { 2504 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2505 } else { 2506 if (mask == kFourByteMask) need_mask = false; 2507 } 2508 } 2509 2510 if (fall_through_on_failure) { 2511 if (need_mask) { 2512 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2513 } else { 2514 assembler->CheckCharacter(value, on_possible_success); 2515 } 2516 } else { 2517 if (need_mask) { 2518 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2519 } else { 2520 assembler->CheckNotCharacter(value, trace->backtrack()); 2521 } 2522 } 2523 return true; 2524 } 2525 2526 2527 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2528 // super-class in the .h file). 2529 // 2530 // We iterate along the text object, building up for each character a 2531 // mask and value that can be used to test for a quick failure to match. 2532 // The masks and values for the positions will be combined into a single 2533 // machine word for the current character width in order to be used in 2534 // generating a quick check. 2535 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2536 RegExpCompiler* compiler, 2537 int characters_filled_in, 2538 bool not_at_start) { 2539 // Do not collect any quick check details if the text node reads backward, 2540 // since it reads in the opposite direction than we use for quick checks. 2541 if (read_backward()) return; 2542 Isolate* isolate = compiler->macro_assembler()->isolate(); 2543 DCHECK(characters_filled_in < details->characters()); 2544 int characters = details->characters(); 2545 int char_mask; 2546 if (compiler->one_byte()) { 2547 char_mask = String::kMaxOneByteCharCode; 2548 } else { 2549 char_mask = String::kMaxUtf16CodeUnit; 2550 } 2551 for (int k = 0; k < elements()->length(); k++) { 2552 TextElement elm = elements()->at(k); 2553 if (elm.text_type() == TextElement::ATOM) { 2554 Vector<const uc16> quarks = elm.atom()->data(); 2555 for (int i = 0; i < characters && i < quarks.length(); i++) { 2556 QuickCheckDetails::Position* pos = 2557 details->positions(characters_filled_in); 2558 uc16 c = quarks[i]; 2559 if (compiler->ignore_case()) { 2560 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2561 int length = GetCaseIndependentLetters(isolate, c, 2562 compiler->one_byte(), chars); 2563 if (length == 0) { 2564 // This can happen because all case variants are non-Latin1, but we 2565 // know the input is Latin1. 2566 details->set_cannot_match(); 2567 pos->determines_perfectly = false; 2568 return; 2569 } 2570 if (length == 1) { 2571 // This letter has no case equivalents, so it's nice and simple 2572 // and the mask-compare will determine definitely whether we have 2573 // a match at this character position. 2574 pos->mask = char_mask; 2575 pos->value = c; 2576 pos->determines_perfectly = true; 2577 } else { 2578 uint32_t common_bits = char_mask; 2579 uint32_t bits = chars[0]; 2580 for (int j = 1; j < length; j++) { 2581 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2582 common_bits ^= differing_bits; 2583 bits &= common_bits; 2584 } 2585 // If length is 2 and common bits has only one zero in it then 2586 // our mask and compare instruction will determine definitely 2587 // whether we have a match at this character position. Otherwise 2588 // it can only be an approximate check. 2589 uint32_t one_zero = (common_bits | ~char_mask); 2590 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2591 pos->determines_perfectly = true; 2592 } 2593 pos->mask = common_bits; 2594 pos->value = bits; 2595 } 2596 } else { 2597 // Don't ignore case. Nice simple case where the mask-compare will 2598 // determine definitely whether we have a match at this character 2599 // position. 2600 if (c > char_mask) { 2601 details->set_cannot_match(); 2602 pos->determines_perfectly = false; 2603 return; 2604 } 2605 pos->mask = char_mask; 2606 pos->value = c; 2607 pos->determines_perfectly = true; 2608 } 2609 characters_filled_in++; 2610 DCHECK(characters_filled_in <= details->characters()); 2611 if (characters_filled_in == details->characters()) { 2612 return; 2613 } 2614 } 2615 } else { 2616 QuickCheckDetails::Position* pos = 2617 details->positions(characters_filled_in); 2618 RegExpCharacterClass* tree = elm.char_class(); 2619 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2620 if (tree->is_negated()) { 2621 // A quick check uses multi-character mask and compare. There is no 2622 // useful way to incorporate a negative char class into this scheme 2623 // so we just conservatively create a mask and value that will always 2624 // succeed. 2625 pos->mask = 0; 2626 pos->value = 0; 2627 } else { 2628 int first_range = 0; 2629 while (ranges->at(first_range).from() > char_mask) { 2630 first_range++; 2631 if (first_range == ranges->length()) { 2632 details->set_cannot_match(); 2633 pos->determines_perfectly = false; 2634 return; 2635 } 2636 } 2637 CharacterRange range = ranges->at(first_range); 2638 uc16 from = range.from(); 2639 uc16 to = range.to(); 2640 if (to > char_mask) { 2641 to = char_mask; 2642 } 2643 uint32_t differing_bits = (from ^ to); 2644 // A mask and compare is only perfect if the differing bits form a 2645 // number like 00011111 with one single block of trailing 1s. 2646 if ((differing_bits & (differing_bits + 1)) == 0 && 2647 from + differing_bits == to) { 2648 pos->determines_perfectly = true; 2649 } 2650 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2651 uint32_t bits = (from & common_bits); 2652 for (int i = first_range + 1; i < ranges->length(); i++) { 2653 CharacterRange range = ranges->at(i); 2654 uc16 from = range.from(); 2655 uc16 to = range.to(); 2656 if (from > char_mask) continue; 2657 if (to > char_mask) to = char_mask; 2658 // Here we are combining more ranges into the mask and compare 2659 // value. With each new range the mask becomes more sparse and 2660 // so the chances of a false positive rise. A character class 2661 // with multiple ranges is assumed never to be equivalent to a 2662 // mask and compare operation. 2663 pos->determines_perfectly = false; 2664 uint32_t new_common_bits = (from ^ to); 2665 new_common_bits = ~SmearBitsRight(new_common_bits); 2666 common_bits &= new_common_bits; 2667 bits &= new_common_bits; 2668 uint32_t differing_bits = (from & common_bits) ^ bits; 2669 common_bits ^= differing_bits; 2670 bits &= common_bits; 2671 } 2672 pos->mask = common_bits; 2673 pos->value = bits; 2674 } 2675 characters_filled_in++; 2676 DCHECK(characters_filled_in <= details->characters()); 2677 if (characters_filled_in == details->characters()) { 2678 return; 2679 } 2680 } 2681 } 2682 DCHECK(characters_filled_in != details->characters()); 2683 if (!details->cannot_match()) { 2684 on_success()-> GetQuickCheckDetails(details, 2685 compiler, 2686 characters_filled_in, 2687 true); 2688 } 2689 } 2690 2691 2692 void QuickCheckDetails::Clear() { 2693 for (int i = 0; i < characters_; i++) { 2694 positions_[i].mask = 0; 2695 positions_[i].value = 0; 2696 positions_[i].determines_perfectly = false; 2697 } 2698 characters_ = 0; 2699 } 2700 2701 2702 void QuickCheckDetails::Advance(int by, bool one_byte) { 2703 if (by >= characters_ || by < 0) { 2704 DCHECK_IMPLIES(by < 0, characters_ == 0); 2705 Clear(); 2706 return; 2707 } 2708 DCHECK_LE(characters_ - by, 4); 2709 DCHECK_LE(characters_, 4); 2710 for (int i = 0; i < characters_ - by; i++) { 2711 positions_[i] = positions_[by + i]; 2712 } 2713 for (int i = characters_ - by; i < characters_; i++) { 2714 positions_[i].mask = 0; 2715 positions_[i].value = 0; 2716 positions_[i].determines_perfectly = false; 2717 } 2718 characters_ -= by; 2719 // We could change mask_ and value_ here but we would never advance unless 2720 // they had already been used in a check and they won't be used again because 2721 // it would gain us nothing. So there's no point. 2722 } 2723 2724 2725 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2726 DCHECK(characters_ == other->characters_); 2727 if (other->cannot_match_) { 2728 return; 2729 } 2730 if (cannot_match_) { 2731 *this = *other; 2732 return; 2733 } 2734 for (int i = from_index; i < characters_; i++) { 2735 QuickCheckDetails::Position* pos = positions(i); 2736 QuickCheckDetails::Position* other_pos = other->positions(i); 2737 if (pos->mask != other_pos->mask || 2738 pos->value != other_pos->value || 2739 !other_pos->determines_perfectly) { 2740 // Our mask-compare operation will be approximate unless we have the 2741 // exact same operation on both sides of the alternation. 2742 pos->determines_perfectly = false; 2743 } 2744 pos->mask &= other_pos->mask; 2745 pos->value &= pos->mask; 2746 other_pos->value &= pos->mask; 2747 uc16 differing_bits = (pos->value ^ other_pos->value); 2748 pos->mask &= ~differing_bits; 2749 pos->value &= pos->mask; 2750 } 2751 } 2752 2753 2754 class VisitMarker { 2755 public: 2756 explicit VisitMarker(NodeInfo* info) : info_(info) { 2757 DCHECK(!info->visited); 2758 info->visited = true; 2759 } 2760 ~VisitMarker() { 2761 info_->visited = false; 2762 } 2763 private: 2764 NodeInfo* info_; 2765 }; 2766 2767 2768 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { 2769 if (info()->replacement_calculated) return replacement(); 2770 if (depth < 0) return this; 2771 DCHECK(!info()->visited); 2772 VisitMarker marker(info()); 2773 return FilterSuccessor(depth - 1, ignore_case); 2774 } 2775 2776 2777 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2778 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2779 if (next == NULL) return set_replacement(NULL); 2780 on_success_ = next; 2781 return set_replacement(this); 2782 } 2783 2784 2785 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2786 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2787 // TODO(dcarney): this could be a lot more efficient. 2788 return range.Contains(0x39c) || 2789 range.Contains(0x3bc) || range.Contains(0x178); 2790 } 2791 2792 2793 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2794 for (int i = 0; i < ranges->length(); i++) { 2795 // TODO(dcarney): this could be a lot more efficient. 2796 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2797 } 2798 return false; 2799 } 2800 2801 2802 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2803 if (info()->replacement_calculated) return replacement(); 2804 if (depth < 0) return this; 2805 DCHECK(!info()->visited); 2806 VisitMarker marker(info()); 2807 int element_count = elements()->length(); 2808 for (int i = 0; i < element_count; i++) { 2809 TextElement elm = elements()->at(i); 2810 if (elm.text_type() == TextElement::ATOM) { 2811 Vector<const uc16> quarks = elm.atom()->data(); 2812 for (int j = 0; j < quarks.length(); j++) { 2813 uint16_t c = quarks[j]; 2814 if (c <= String::kMaxOneByteCharCode) continue; 2815 if (!ignore_case) return set_replacement(NULL); 2816 // Here, we need to check for characters whose upper and lower cases 2817 // are outside the Latin-1 range. 2818 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2819 // Character is outside Latin-1 completely 2820 if (converted == 0) return set_replacement(NULL); 2821 // Convert quark to Latin-1 in place. 2822 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2823 copy[j] = converted; 2824 } 2825 } else { 2826 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2827 RegExpCharacterClass* cc = elm.char_class(); 2828 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2829 CharacterRange::Canonicalize(ranges); 2830 // Now they are in order so we only need to look at the first. 2831 int range_count = ranges->length(); 2832 if (cc->is_negated()) { 2833 if (range_count != 0 && 2834 ranges->at(0).from() == 0 && 2835 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2836 // This will be handled in a later filter. 2837 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2838 return set_replacement(NULL); 2839 } 2840 } else { 2841 if (range_count == 0 || 2842 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2843 // This will be handled in a later filter. 2844 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2845 return set_replacement(NULL); 2846 } 2847 } 2848 } 2849 } 2850 return FilterSuccessor(depth - 1, ignore_case); 2851 } 2852 2853 2854 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2855 if (info()->replacement_calculated) return replacement(); 2856 if (depth < 0) return this; 2857 if (info()->visited) return this; 2858 { 2859 VisitMarker marker(info()); 2860 2861 RegExpNode* continue_replacement = 2862 continue_node_->FilterOneByte(depth - 1, ignore_case); 2863 // If we can't continue after the loop then there is no sense in doing the 2864 // loop. 2865 if (continue_replacement == NULL) return set_replacement(NULL); 2866 } 2867 2868 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2869 } 2870 2871 2872 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2873 if (info()->replacement_calculated) return replacement(); 2874 if (depth < 0) return this; 2875 if (info()->visited) return this; 2876 VisitMarker marker(info()); 2877 int choice_count = alternatives_->length(); 2878 2879 for (int i = 0; i < choice_count; i++) { 2880 GuardedAlternative alternative = alternatives_->at(i); 2881 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2882 set_replacement(this); 2883 return this; 2884 } 2885 } 2886 2887 int surviving = 0; 2888 RegExpNode* survivor = NULL; 2889 for (int i = 0; i < choice_count; i++) { 2890 GuardedAlternative alternative = alternatives_->at(i); 2891 RegExpNode* replacement = 2892 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2893 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2894 if (replacement != NULL) { 2895 alternatives_->at(i).set_node(replacement); 2896 surviving++; 2897 survivor = replacement; 2898 } 2899 } 2900 if (surviving < 2) return set_replacement(survivor); 2901 2902 set_replacement(this); 2903 if (surviving == choice_count) { 2904 return this; 2905 } 2906 // Only some of the nodes survived the filtering. We need to rebuild the 2907 // alternatives list. 2908 ZoneList<GuardedAlternative>* new_alternatives = 2909 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2910 for (int i = 0; i < choice_count; i++) { 2911 RegExpNode* replacement = 2912 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2913 if (replacement != NULL) { 2914 alternatives_->at(i).set_node(replacement); 2915 new_alternatives->Add(alternatives_->at(i), zone()); 2916 } 2917 } 2918 alternatives_ = new_alternatives; 2919 return this; 2920 } 2921 2922 2923 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 2924 bool ignore_case) { 2925 if (info()->replacement_calculated) return replacement(); 2926 if (depth < 0) return this; 2927 if (info()->visited) return this; 2928 VisitMarker marker(info()); 2929 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2930 // afterwards. 2931 RegExpNode* node = alternatives_->at(1).node(); 2932 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2933 if (replacement == NULL) return set_replacement(NULL); 2934 alternatives_->at(1).set_node(replacement); 2935 2936 RegExpNode* neg_node = alternatives_->at(0).node(); 2937 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2938 // If the negative lookahead is always going to fail then 2939 // we don't need to check it. 2940 if (neg_replacement == NULL) return set_replacement(replacement); 2941 alternatives_->at(0).set_node(neg_replacement); 2942 return set_replacement(this); 2943 } 2944 2945 2946 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2947 RegExpCompiler* compiler, 2948 int characters_filled_in, 2949 bool not_at_start) { 2950 if (body_can_be_zero_length_ || info()->visited) return; 2951 VisitMarker marker(info()); 2952 return ChoiceNode::GetQuickCheckDetails(details, 2953 compiler, 2954 characters_filled_in, 2955 not_at_start); 2956 } 2957 2958 2959 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2960 BoyerMooreLookahead* bm, bool not_at_start) { 2961 if (body_can_be_zero_length_ || budget <= 0) { 2962 bm->SetRest(offset); 2963 SaveBMInfo(bm, not_at_start, offset); 2964 return; 2965 } 2966 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2967 SaveBMInfo(bm, not_at_start, offset); 2968 } 2969 2970 2971 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2972 RegExpCompiler* compiler, 2973 int characters_filled_in, 2974 bool not_at_start) { 2975 not_at_start = (not_at_start || not_at_start_); 2976 int choice_count = alternatives_->length(); 2977 DCHECK(choice_count > 0); 2978 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2979 compiler, 2980 characters_filled_in, 2981 not_at_start); 2982 for (int i = 1; i < choice_count; i++) { 2983 QuickCheckDetails new_details(details->characters()); 2984 RegExpNode* node = alternatives_->at(i).node(); 2985 node->GetQuickCheckDetails(&new_details, compiler, 2986 characters_filled_in, 2987 not_at_start); 2988 // Here we merge the quick match details of the two branches. 2989 details->Merge(&new_details, characters_filled_in); 2990 } 2991 } 2992 2993 2994 // Check for [0-9A-Z_a-z]. 2995 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2996 Label* word, 2997 Label* non_word, 2998 bool fall_through_on_word) { 2999 if (assembler->CheckSpecialCharacterClass( 3000 fall_through_on_word ? 'w' : 'W', 3001 fall_through_on_word ? non_word : word)) { 3002 // Optimized implementation available. 3003 return; 3004 } 3005 assembler->CheckCharacterGT('z', non_word); 3006 assembler->CheckCharacterLT('0', non_word); 3007 assembler->CheckCharacterGT('a' - 1, word); 3008 assembler->CheckCharacterLT('9' + 1, word); 3009 assembler->CheckCharacterLT('A', non_word); 3010 assembler->CheckCharacterLT('Z' + 1, word); 3011 if (fall_through_on_word) { 3012 assembler->CheckNotCharacter('_', non_word); 3013 } else { 3014 assembler->CheckCharacter('_', word); 3015 } 3016 } 3017 3018 3019 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 3020 // that matches newline or the start of input). 3021 static void EmitHat(RegExpCompiler* compiler, 3022 RegExpNode* on_success, 3023 Trace* trace) { 3024 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3025 // We will be loading the previous character into the current character 3026 // register. 3027 Trace new_trace(*trace); 3028 new_trace.InvalidateCurrentCharacter(); 3029 3030 Label ok; 3031 if (new_trace.cp_offset() == 0) { 3032 // The start of input counts as a newline in this context, so skip to 3033 // ok if we are at the start. 3034 assembler->CheckAtStart(&ok); 3035 } 3036 // We already checked that we are not at the start of input so it must be 3037 // OK to load the previous character. 3038 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3039 new_trace.backtrack(), 3040 false); 3041 if (!assembler->CheckSpecialCharacterClass('n', 3042 new_trace.backtrack())) { 3043 // Newline means \n, \r, 0x2028 or 0x2029. 3044 if (!compiler->one_byte()) { 3045 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3046 } 3047 assembler->CheckCharacter('\n', &ok); 3048 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3049 } 3050 assembler->Bind(&ok); 3051 on_success->Emit(compiler, &new_trace); 3052 } 3053 3054 3055 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3056 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3057 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3058 Isolate* isolate = assembler->isolate(); 3059 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3060 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3061 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3062 if (lookahead == NULL) { 3063 int eats_at_least = 3064 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3065 kRecursionBudget, 3066 not_at_start)); 3067 if (eats_at_least >= 1) { 3068 BoyerMooreLookahead* bm = 3069 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3070 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 3071 if (bm->at(0)->is_non_word()) 3072 next_is_word_character = Trace::FALSE_VALUE; 3073 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3074 } 3075 } else { 3076 if (lookahead->at(0)->is_non_word()) 3077 next_is_word_character = Trace::FALSE_VALUE; 3078 if (lookahead->at(0)->is_word()) 3079 next_is_word_character = Trace::TRUE_VALUE; 3080 } 3081 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3082 if (next_is_word_character == Trace::UNKNOWN) { 3083 Label before_non_word; 3084 Label before_word; 3085 if (trace->characters_preloaded() != 1) { 3086 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3087 } 3088 // Fall through on non-word. 3089 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3090 // Next character is not a word character. 3091 assembler->Bind(&before_non_word); 3092 Label ok; 3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3094 assembler->GoTo(&ok); 3095 3096 assembler->Bind(&before_word); 3097 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3098 assembler->Bind(&ok); 3099 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3100 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3101 } else { 3102 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 3103 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3104 } 3105 } 3106 3107 3108 void AssertionNode::BacktrackIfPrevious( 3109 RegExpCompiler* compiler, 3110 Trace* trace, 3111 AssertionNode::IfPrevious backtrack_if_previous) { 3112 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3113 Trace new_trace(*trace); 3114 new_trace.InvalidateCurrentCharacter(); 3115 3116 Label fall_through, dummy; 3117 3118 Label* non_word = backtrack_if_previous == kIsNonWord ? 3119 new_trace.backtrack() : 3120 &fall_through; 3121 Label* word = backtrack_if_previous == kIsNonWord ? 3122 &fall_through : 3123 new_trace.backtrack(); 3124 3125 if (new_trace.cp_offset() == 0) { 3126 // The start of input counts as a non-word character, so the question is 3127 // decided if we are at the start. 3128 assembler->CheckAtStart(non_word); 3129 } 3130 // We already checked that we are not at the start of input so it must be 3131 // OK to load the previous character. 3132 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3133 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3134 3135 assembler->Bind(&fall_through); 3136 on_success()->Emit(compiler, &new_trace); 3137 } 3138 3139 3140 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3141 RegExpCompiler* compiler, 3142 int filled_in, 3143 bool not_at_start) { 3144 if (assertion_type_ == AT_START && not_at_start) { 3145 details->set_cannot_match(); 3146 return; 3147 } 3148 return on_success()->GetQuickCheckDetails(details, 3149 compiler, 3150 filled_in, 3151 not_at_start); 3152 } 3153 3154 3155 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3156 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3157 switch (assertion_type_) { 3158 case AT_END: { 3159 Label ok; 3160 assembler->CheckPosition(trace->cp_offset(), &ok); 3161 assembler->GoTo(trace->backtrack()); 3162 assembler->Bind(&ok); 3163 break; 3164 } 3165 case AT_START: { 3166 if (trace->at_start() == Trace::FALSE_VALUE) { 3167 assembler->GoTo(trace->backtrack()); 3168 return; 3169 } 3170 if (trace->at_start() == Trace::UNKNOWN) { 3171 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 3172 Trace at_start_trace = *trace; 3173 at_start_trace.set_at_start(Trace::TRUE_VALUE); 3174 on_success()->Emit(compiler, &at_start_trace); 3175 return; 3176 } 3177 } 3178 break; 3179 case AFTER_NEWLINE: 3180 EmitHat(compiler, on_success(), trace); 3181 return; 3182 case AT_BOUNDARY: 3183 case AT_NON_BOUNDARY: { 3184 EmitBoundaryCheck(compiler, trace); 3185 return; 3186 } 3187 } 3188 on_success()->Emit(compiler, trace); 3189 } 3190 3191 3192 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3193 if (quick_check == NULL) return false; 3194 if (offset >= quick_check->characters()) return false; 3195 return quick_check->positions(offset)->determines_perfectly; 3196 } 3197 3198 3199 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3200 if (index > *checked_up_to) { 3201 *checked_up_to = index; 3202 } 3203 } 3204 3205 3206 // We call this repeatedly to generate code for each pass over the text node. 3207 // The passes are in increasing order of difficulty because we hope one 3208 // of the first passes will fail in which case we are saved the work of the 3209 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3210 // we will check the '%' in the first pass, the case independent 'a' in the 3211 // second pass and the character class in the last pass. 3212 // 3213 // The passes are done from right to left, so for example to test for /bar/ 3214 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3215 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3216 // bounds check most of the time. In the example we only need to check for 3217 // end-of-input when loading the putative 'r'. 3218 // 3219 // A slight complication involves the fact that the first character may already 3220 // be fetched into a register by the previous node. In this case we want to 3221 // do the test for that character first. We do this in separate passes. The 3222 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3223 // pass has been performed then subsequent passes will have true in 3224 // first_element_checked to indicate that that character does not need to be 3225 // checked again. 3226 // 3227 // In addition to all this we are passed a Trace, which can 3228 // contain an AlternativeGeneration object. In this AlternativeGeneration 3229 // object we can see details of any quick check that was already passed in 3230 // order to get to the code we are now generating. The quick check can involve 3231 // loading characters, which means we do not need to recheck the bounds 3232 // up to the limit the quick check already checked. In addition the quick 3233 // check can have involved a mask and compare operation which may simplify 3234 // or obviate the need for further checks at some character positions. 3235 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3236 TextEmitPassType pass, 3237 bool preloaded, 3238 Trace* trace, 3239 bool first_element_checked, 3240 int* checked_up_to) { 3241 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3242 Isolate* isolate = assembler->isolate(); 3243 bool one_byte = compiler->one_byte(); 3244 Label* backtrack = trace->backtrack(); 3245 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3246 int element_count = elements()->length(); 3247 int backward_offset = read_backward() ? -Length() : 0; 3248 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3249 TextElement elm = elements()->at(i); 3250 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3251 if (elm.text_type() == TextElement::ATOM) { 3252 Vector<const uc16> quarks = elm.atom()->data(); 3253 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3254 if (first_element_checked && i == 0 && j == 0) continue; 3255 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3256 EmitCharacterFunction* emit_function = NULL; 3257 switch (pass) { 3258 case NON_LATIN1_MATCH: 3259 DCHECK(one_byte); 3260 if (quarks[j] > String::kMaxOneByteCharCode) { 3261 assembler->GoTo(backtrack); 3262 return; 3263 } 3264 break; 3265 case NON_LETTER_CHARACTER_MATCH: 3266 emit_function = &EmitAtomNonLetter; 3267 break; 3268 case SIMPLE_CHARACTER_MATCH: 3269 emit_function = &EmitSimpleCharacter; 3270 break; 3271 case CASE_CHARACTER_MATCH: 3272 emit_function = &EmitAtomLetter; 3273 break; 3274 default: 3275 break; 3276 } 3277 if (emit_function != NULL) { 3278 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3279 bool bound_checked = 3280 emit_function(isolate, compiler, quarks[j], backtrack, 3281 cp_offset + j, bounds_check, preloaded); 3282 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3283 } 3284 } 3285 } else { 3286 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3287 if (pass == CHARACTER_CLASS_MATCH) { 3288 if (first_element_checked && i == 0) continue; 3289 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3290 RegExpCharacterClass* cc = elm.char_class(); 3291 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3292 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 3293 bounds_check, preloaded, zone()); 3294 UpdateBoundsCheck(cp_offset, checked_up_to); 3295 } 3296 } 3297 } 3298 } 3299 3300 3301 int TextNode::Length() { 3302 TextElement elm = elements()->last(); 3303 DCHECK(elm.cp_offset() >= 0); 3304 return elm.cp_offset() + elm.length(); 3305 } 3306 3307 3308 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3309 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3310 if (ignore_case) { 3311 return pass == SIMPLE_CHARACTER_MATCH; 3312 } else { 3313 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3314 } 3315 } 3316 3317 3318 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 3319 ZoneList<CharacterRange>* ranges, 3320 bool read_backward, 3321 RegExpNode* on_success) { 3322 DCHECK_NOT_NULL(ranges); 3323 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); 3324 elms->Add( 3325 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)), 3326 zone); 3327 return new (zone) TextNode(elms, read_backward, on_success); 3328 } 3329 3330 3331 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, 3332 CharacterRange trail, 3333 bool read_backward, 3334 RegExpNode* on_success) { 3335 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 3336 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 3337 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); 3338 elms->Add(TextElement::CharClass( 3339 new (zone) RegExpCharacterClass(lead_ranges, false)), 3340 zone); 3341 elms->Add(TextElement::CharClass( 3342 new (zone) RegExpCharacterClass(trail_ranges, false)), 3343 zone); 3344 return new (zone) TextNode(elms, read_backward, on_success); 3345 } 3346 3347 3348 // This generates the code to match a text node. A text node can contain 3349 // straight character sequences (possibly to be matched in a case-independent 3350 // way) and character classes. For efficiency we do not do this in a single 3351 // pass from left to right. Instead we pass over the text node several times, 3352 // emitting code for some character positions every time. See the comment on 3353 // TextEmitPass for details. 3354 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3355 LimitResult limit_result = LimitVersions(compiler, trace); 3356 if (limit_result == DONE) return; 3357 DCHECK(limit_result == CONTINUE); 3358 3359 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3360 compiler->SetRegExpTooBig(); 3361 return; 3362 } 3363 3364 if (compiler->one_byte()) { 3365 int dummy = 0; 3366 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3367 } 3368 3369 bool first_elt_done = false; 3370 int bound_checked_to = trace->cp_offset() - 1; 3371 bound_checked_to += trace->bound_checked_up_to(); 3372 3373 // If a character is preloaded into the current character register then 3374 // check that now. 3375 if (trace->characters_preloaded() == 1) { 3376 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3377 if (!SkipPass(pass, compiler->ignore_case())) { 3378 TextEmitPass(compiler, 3379 static_cast<TextEmitPassType>(pass), 3380 true, 3381 trace, 3382 false, 3383 &bound_checked_to); 3384 } 3385 } 3386 first_elt_done = true; 3387 } 3388 3389 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3390 if (!SkipPass(pass, compiler->ignore_case())) { 3391 TextEmitPass(compiler, 3392 static_cast<TextEmitPassType>(pass), 3393 false, 3394 trace, 3395 first_elt_done, 3396 &bound_checked_to); 3397 } 3398 } 3399 3400 Trace successor_trace(*trace); 3401 // If we advance backward, we may end up at the start. 3402 successor_trace.AdvanceCurrentPositionInTrace( 3403 read_backward() ? -Length() : Length(), compiler); 3404 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 3405 : Trace::FALSE_VALUE); 3406 RecursionCheck rc(compiler); 3407 on_success()->Emit(compiler, &successor_trace); 3408 } 3409 3410 3411 void Trace::InvalidateCurrentCharacter() { 3412 characters_preloaded_ = 0; 3413 } 3414 3415 3416 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3417 // We don't have an instruction for shifting the current character register 3418 // down or for using a shifted value for anything so lets just forget that 3419 // we preloaded any characters into it. 3420 characters_preloaded_ = 0; 3421 // Adjust the offsets of the quick check performed information. This 3422 // information is used to find out what we already determined about the 3423 // characters by means of mask and compare. 3424 quick_check_performed_.Advance(by, compiler->one_byte()); 3425 cp_offset_ += by; 3426 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3427 compiler->SetRegExpTooBig(); 3428 cp_offset_ = 0; 3429 } 3430 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3431 } 3432 3433 3434 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { 3435 int element_count = elements()->length(); 3436 for (int i = 0; i < element_count; i++) { 3437 TextElement elm = elements()->at(i); 3438 if (elm.text_type() == TextElement::CHAR_CLASS) { 3439 RegExpCharacterClass* cc = elm.char_class(); 3440 // None of the standard character classes is different in the case 3441 // independent case and it slows us down if we don't know that. 3442 if (cc->is_standard(zone())) continue; 3443 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3444 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); 3445 } 3446 } 3447 } 3448 3449 3450 int TextNode::GreedyLoopTextLength() { return Length(); } 3451 3452 3453 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3454 RegExpCompiler* compiler) { 3455 if (read_backward()) return NULL; 3456 if (elements()->length() != 1) return NULL; 3457 TextElement elm = elements()->at(0); 3458 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3459 RegExpCharacterClass* node = elm.char_class(); 3460 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3461 CharacterRange::Canonicalize(ranges); 3462 if (node->is_negated()) { 3463 return ranges->length() == 0 ? on_success() : NULL; 3464 } 3465 if (ranges->length() != 1) return NULL; 3466 uint32_t max_char; 3467 if (compiler->one_byte()) { 3468 max_char = String::kMaxOneByteCharCode; 3469 } else { 3470 max_char = String::kMaxUtf16CodeUnit; 3471 } 3472 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3473 } 3474 3475 3476 // Finds the fixed match length of a sequence of nodes that goes from 3477 // this alternative and back to this choice node. If there are variable 3478 // length nodes or other complications in the way then return a sentinel 3479 // value indicating that a greedy loop cannot be constructed. 3480 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3481 GuardedAlternative* alternative) { 3482 int length = 0; 3483 RegExpNode* node = alternative->node(); 3484 // Later we will generate code for all these text nodes using recursion 3485 // so we have to limit the max number. 3486 int recursion_depth = 0; 3487 while (node != this) { 3488 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3489 return kNodeIsTooComplexForGreedyLoops; 3490 } 3491 int node_length = node->GreedyLoopTextLength(); 3492 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3493 return kNodeIsTooComplexForGreedyLoops; 3494 } 3495 length += node_length; 3496 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3497 node = seq_node->on_success(); 3498 } 3499 return read_backward() ? -length : length; 3500 } 3501 3502 3503 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3504 DCHECK_NULL(loop_node_); 3505 AddAlternative(alt); 3506 loop_node_ = alt.node(); 3507 } 3508 3509 3510 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3511 DCHECK_NULL(continue_node_); 3512 AddAlternative(alt); 3513 continue_node_ = alt.node(); 3514 } 3515 3516 3517 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3518 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3519 if (trace->stop_node() == this) { 3520 // Back edge of greedy optimized loop node graph. 3521 int text_length = 3522 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3523 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops); 3524 // Update the counter-based backtracking info on the stack. This is an 3525 // optimization for greedy loops (see below). 3526 DCHECK(trace->cp_offset() == text_length); 3527 macro_assembler->AdvanceCurrentPosition(text_length); 3528 macro_assembler->GoTo(trace->loop_label()); 3529 return; 3530 } 3531 DCHECK_NULL(trace->stop_node()); 3532 if (!trace->is_trivial()) { 3533 trace->Flush(compiler, this); 3534 return; 3535 } 3536 ChoiceNode::Emit(compiler, trace); 3537 } 3538 3539 3540 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3541 int eats_at_least) { 3542 int preload_characters = Min(4, eats_at_least); 3543 if (compiler->macro_assembler()->CanReadUnaligned()) { 3544 bool one_byte = compiler->one_byte(); 3545 if (one_byte) { 3546 if (preload_characters > 4) preload_characters = 4; 3547 // We can't preload 3 characters because there is no machine instruction 3548 // to do that. We can't just load 4 because we could be reading 3549 // beyond the end of the string, which could cause a memory fault. 3550 if (preload_characters == 3) preload_characters = 2; 3551 } else { 3552 if (preload_characters > 2) preload_characters = 2; 3553 } 3554 } else { 3555 if (preload_characters > 1) preload_characters = 1; 3556 } 3557 return preload_characters; 3558 } 3559 3560 3561 // This class is used when generating the alternatives in a choice node. It 3562 // records the way the alternative is being code generated. 3563 class AlternativeGeneration: public Malloced { 3564 public: 3565 AlternativeGeneration() 3566 : possible_success(), 3567 expects_preload(false), 3568 after(), 3569 quick_check_details() { } 3570 Label possible_success; 3571 bool expects_preload; 3572 Label after; 3573 QuickCheckDetails quick_check_details; 3574 }; 3575 3576 3577 // Creates a list of AlternativeGenerations. If the list has a reasonable 3578 // size then it is on the stack, otherwise the excess is on the heap. 3579 class AlternativeGenerationList { 3580 public: 3581 AlternativeGenerationList(int count, Zone* zone) 3582 : alt_gens_(count, zone) { 3583 for (int i = 0; i < count && i < kAFew; i++) { 3584 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3585 } 3586 for (int i = kAFew; i < count; i++) { 3587 alt_gens_.Add(new AlternativeGeneration(), zone); 3588 } 3589 } 3590 ~AlternativeGenerationList() { 3591 for (int i = kAFew; i < alt_gens_.length(); i++) { 3592 delete alt_gens_[i]; 3593 alt_gens_[i] = NULL; 3594 } 3595 } 3596 3597 AlternativeGeneration* at(int i) { 3598 return alt_gens_[i]; 3599 } 3600 3601 private: 3602 static const int kAFew = 10; 3603 ZoneList<AlternativeGeneration*> alt_gens_; 3604 AlternativeGeneration a_few_alt_gens_[kAFew]; 3605 }; 3606 3607 3608 static const uc32 kRangeEndMarker = 0x110000; 3609 3610 // The '2' variant is has inclusive from and exclusive to. 3611 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3612 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3613 static const int kSpaceRanges[] = { 3614 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 3615 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 3616 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; 3617 static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3618 3619 static const int kWordRanges[] = { 3620 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; 3621 static const int kWordRangeCount = arraysize(kWordRanges); 3622 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; 3623 static const int kDigitRangeCount = arraysize(kDigitRanges); 3624 static const int kSurrogateRanges[] = { 3625 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker}; 3626 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3627 static const int kLineTerminatorRanges[] = { 3628 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; 3629 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3630 3631 void BoyerMoorePositionInfo::Set(int character) { 3632 SetInterval(Interval(character, character)); 3633 } 3634 3635 3636 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3637 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3638 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3639 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3640 surrogate_ = 3641 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3642 if (interval.to() - interval.from() >= kMapSize - 1) { 3643 if (map_count_ != kMapSize) { 3644 map_count_ = kMapSize; 3645 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3646 } 3647 return; 3648 } 3649 for (int i = interval.from(); i <= interval.to(); i++) { 3650 int mod_character = (i & kMask); 3651 if (!map_->at(mod_character)) { 3652 map_count_++; 3653 map_->at(mod_character) = true; 3654 } 3655 if (map_count_ == kMapSize) return; 3656 } 3657 } 3658 3659 3660 void BoyerMoorePositionInfo::SetAll() { 3661 s_ = w_ = d_ = kLatticeUnknown; 3662 if (map_count_ != kMapSize) { 3663 map_count_ = kMapSize; 3664 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3665 } 3666 } 3667 3668 3669 BoyerMooreLookahead::BoyerMooreLookahead( 3670 int length, RegExpCompiler* compiler, Zone* zone) 3671 : length_(length), 3672 compiler_(compiler) { 3673 if (compiler->one_byte()) { 3674 max_char_ = String::kMaxOneByteCharCode; 3675 } else { 3676 max_char_ = String::kMaxUtf16CodeUnit; 3677 } 3678 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3679 for (int i = 0; i < length; i++) { 3680 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3681 } 3682 } 3683 3684 3685 // Find the longest range of lookahead that has the fewest number of different 3686 // characters that can occur at a given position. Since we are optimizing two 3687 // different parameters at once this is a tradeoff. 3688 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3689 int biggest_points = 0; 3690 // If more than 32 characters out of 128 can occur it is unlikely that we can 3691 // be lucky enough to step forwards much of the time. 3692 const int kMaxMax = 32; 3693 for (int max_number_of_chars = 4; 3694 max_number_of_chars < kMaxMax; 3695 max_number_of_chars *= 2) { 3696 biggest_points = 3697 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3698 } 3699 if (biggest_points == 0) return false; 3700 return true; 3701 } 3702 3703 3704 // Find the highest-points range between 0 and length_ where the character 3705 // information is not too vague. 'Too vague' means that there are more than 3706 // max_number_of_chars that can occur at this position. Calculates the number 3707 // of points as the product of width-of-the-range and 3708 // probability-of-finding-one-of-the-characters, where the probability is 3709 // calculated using the frequency distribution of the sample subject string. 3710 int BoyerMooreLookahead::FindBestInterval( 3711 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3712 int biggest_points = old_biggest_points; 3713 static const int kSize = RegExpMacroAssembler::kTableSize; 3714 for (int i = 0; i < length_; ) { 3715 while (i < length_ && Count(i) > max_number_of_chars) i++; 3716 if (i == length_) break; 3717 int remembered_from = i; 3718 bool union_map[kSize]; 3719 for (int j = 0; j < kSize; j++) union_map[j] = false; 3720 while (i < length_ && Count(i) <= max_number_of_chars) { 3721 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3722 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3723 i++; 3724 } 3725 int frequency = 0; 3726 for (int j = 0; j < kSize; j++) { 3727 if (union_map[j]) { 3728 // Add 1 to the frequency to give a small per-character boost for 3729 // the cases where our sampling is not good enough and many 3730 // characters have a frequency of zero. This means the frequency 3731 // can theoretically be up to 2*kSize though we treat it mostly as 3732 // a fraction of kSize. 3733 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3734 } 3735 } 3736 // We use the probability of skipping times the distance we are skipping to 3737 // judge the effectiveness of this. Actually we have a cut-off: By 3738 // dividing by 2 we switch off the skipping if the probability of skipping 3739 // is less than 50%. This is because the multibyte mask-and-compare 3740 // skipping in quickcheck is more likely to do well on this case. 3741 bool in_quickcheck_range = 3742 ((i - remembered_from < 4) || 3743 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 3744 // Called 'probability' but it is only a rough estimate and can actually 3745 // be outside the 0-kSize range. 3746 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3747 int points = (i - remembered_from) * probability; 3748 if (points > biggest_points) { 3749 *from = remembered_from; 3750 *to = i - 1; 3751 biggest_points = points; 3752 } 3753 } 3754 return biggest_points; 3755 } 3756 3757 3758 // Take all the characters that will not prevent a successful match if they 3759 // occur in the subject string in the range between min_lookahead and 3760 // max_lookahead (inclusive) measured from the current position. If the 3761 // character at max_lookahead offset is not one of these characters, then we 3762 // can safely skip forwards by the number of characters in the range. 3763 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3764 int max_lookahead, 3765 Handle<ByteArray> boolean_skip_table) { 3766 const int kSize = RegExpMacroAssembler::kTableSize; 3767 3768 const int kSkipArrayEntry = 0; 3769 const int kDontSkipArrayEntry = 1; 3770 3771 for (int i = 0; i < kSize; i++) { 3772 boolean_skip_table->set(i, kSkipArrayEntry); 3773 } 3774 int skip = max_lookahead + 1 - min_lookahead; 3775 3776 for (int i = max_lookahead; i >= min_lookahead; i--) { 3777 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3778 for (int j = 0; j < kSize; j++) { 3779 if (map->at(j)) { 3780 boolean_skip_table->set(j, kDontSkipArrayEntry); 3781 } 3782 } 3783 } 3784 3785 return skip; 3786 } 3787 3788 3789 // See comment above on the implementation of GetSkipTable. 3790 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3791 const int kSize = RegExpMacroAssembler::kTableSize; 3792 3793 int min_lookahead = 0; 3794 int max_lookahead = 0; 3795 3796 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3797 3798 bool found_single_character = false; 3799 int single_character = 0; 3800 for (int i = max_lookahead; i >= min_lookahead; i--) { 3801 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3802 if (map->map_count() > 1 || 3803 (found_single_character && map->map_count() != 0)) { 3804 found_single_character = false; 3805 break; 3806 } 3807 for (int j = 0; j < kSize; j++) { 3808 if (map->at(j)) { 3809 found_single_character = true; 3810 single_character = j; 3811 break; 3812 } 3813 } 3814 } 3815 3816 int lookahead_width = max_lookahead + 1 - min_lookahead; 3817 3818 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3819 // The mask-compare can probably handle this better. 3820 return; 3821 } 3822 3823 if (found_single_character) { 3824 Label cont, again; 3825 masm->Bind(&again); 3826 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3827 if (max_char_ > kSize) { 3828 masm->CheckCharacterAfterAnd(single_character, 3829 RegExpMacroAssembler::kTableMask, 3830 &cont); 3831 } else { 3832 masm->CheckCharacter(single_character, &cont); 3833 } 3834 masm->AdvanceCurrentPosition(lookahead_width); 3835 masm->GoTo(&again); 3836 masm->Bind(&cont); 3837 return; 3838 } 3839 3840 Factory* factory = masm->isolate()->factory(); 3841 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3842 int skip_distance = GetSkipTable( 3843 min_lookahead, max_lookahead, boolean_skip_table); 3844 DCHECK(skip_distance != 0); 3845 3846 Label cont, again; 3847 masm->Bind(&again); 3848 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3849 masm->CheckBitInTable(boolean_skip_table, &cont); 3850 masm->AdvanceCurrentPosition(skip_distance); 3851 masm->GoTo(&again); 3852 masm->Bind(&cont); 3853 } 3854 3855 3856 /* Code generation for choice nodes. 3857 * 3858 * We generate quick checks that do a mask and compare to eliminate a 3859 * choice. If the quick check succeeds then it jumps to the continuation to 3860 * do slow checks and check subsequent nodes. If it fails (the common case) 3861 * it falls through to the next choice. 3862 * 3863 * Here is the desired flow graph. Nodes directly below each other imply 3864 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3865 * 3 doesn't have a quick check so we have to call the slow check. 3866 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3867 * regexp continuation is generated directly after the Sn node, up to the 3868 * next GoTo if we decide to reuse some already generated code. Some 3869 * nodes expect preload_characters to be preloaded into the current 3870 * character register. R nodes do this preloading. Vertices are marked 3871 * F for failures and S for success (possible success in the case of quick 3872 * nodes). L, V, < and > are used as arrow heads. 3873 * 3874 * ----------> R 3875 * | 3876 * V 3877 * Q1 -----> S1 3878 * | S / 3879 * F| / 3880 * | F/ 3881 * | / 3882 * | R 3883 * | / 3884 * V L 3885 * Q2 -----> S2 3886 * | S / 3887 * F| / 3888 * | F/ 3889 * | / 3890 * | R 3891 * | / 3892 * V L 3893 * S3 3894 * | 3895 * F| 3896 * | 3897 * R 3898 * | 3899 * backtrack V 3900 * <----------Q4 3901 * \ F | 3902 * \ |S 3903 * \ F V 3904 * \-----S4 3905 * 3906 * For greedy loops we push the current position, then generate the code that 3907 * eats the input specially in EmitGreedyLoop. The other choice (the 3908 * continuation) is generated by the normal code in EmitChoices, and steps back 3909 * in the input to the starting position when it fails to match. The loop code 3910 * looks like this (U is the unwind code that steps back in the greedy loop). 3911 * 3912 * _____ 3913 * / \ 3914 * V | 3915 * ----------> S1 | 3916 * /| | 3917 * / |S | 3918 * F/ \_____/ 3919 * / 3920 * |<----- 3921 * | \ 3922 * V |S 3923 * Q2 ---> U----->backtrack 3924 * | F / 3925 * S| / 3926 * V F / 3927 * S2--/ 3928 */ 3929 3930 GreedyLoopState::GreedyLoopState(bool not_at_start) { 3931 counter_backtrack_trace_.set_backtrack(&label_); 3932 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3933 } 3934 3935 3936 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3937 #ifdef DEBUG 3938 int choice_count = alternatives_->length(); 3939 for (int i = 0; i < choice_count - 1; i++) { 3940 GuardedAlternative alternative = alternatives_->at(i); 3941 ZoneList<Guard*>* guards = alternative.guards(); 3942 int guard_count = (guards == NULL) ? 0 : guards->length(); 3943 for (int j = 0; j < guard_count; j++) { 3944 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3945 } 3946 } 3947 #endif 3948 } 3949 3950 3951 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, 3952 Trace* current_trace, 3953 PreloadState* state) { 3954 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3955 // Save some time by looking at most one machine word ahead. 3956 state->eats_at_least_ = 3957 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, 3958 current_trace->at_start() == Trace::FALSE_VALUE); 3959 } 3960 state->preload_characters_ = 3961 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3962 3963 state->preload_is_current_ = 3964 (current_trace->characters_preloaded() == state->preload_characters_); 3965 state->preload_has_checked_bounds_ = state->preload_is_current_; 3966 } 3967 3968 3969 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3970 int choice_count = alternatives_->length(); 3971 3972 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) { 3973 alternatives_->at(0).node()->Emit(compiler, trace); 3974 return; 3975 } 3976 3977 AssertGuardsMentionRegisters(trace); 3978 3979 LimitResult limit_result = LimitVersions(compiler, trace); 3980 if (limit_result == DONE) return; 3981 DCHECK(limit_result == CONTINUE); 3982 3983 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3984 // other choice nodes we only flush if we are out of code size budget. 3985 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3986 trace->Flush(compiler, this); 3987 return; 3988 } 3989 3990 RecursionCheck rc(compiler); 3991 3992 PreloadState preload; 3993 preload.init(); 3994 GreedyLoopState greedy_loop_state(not_at_start()); 3995 3996 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 3997 AlternativeGenerationList alt_gens(choice_count, zone()); 3998 3999 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 4000 trace = EmitGreedyLoop(compiler, 4001 trace, 4002 &alt_gens, 4003 &preload, 4004 &greedy_loop_state, 4005 text_length); 4006 } else { 4007 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 4008 // match the traces produced pre-cleanup. 4009 Label second_choice; 4010 compiler->macro_assembler()->Bind(&second_choice); 4011 4012 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 4013 4014 EmitChoices(compiler, 4015 &alt_gens, 4016 0, 4017 trace, 4018 &preload); 4019 } 4020 4021 // At this point we need to generate slow checks for the alternatives where 4022 // the quick check was inlined. We can recognize these because the associated 4023 // label was bound. 4024 int new_flush_budget = trace->flush_budget() / choice_count; 4025 for (int i = 0; i < choice_count; i++) { 4026 AlternativeGeneration* alt_gen = alt_gens.at(i); 4027 Trace new_trace(*trace); 4028 // If there are actions to be flushed we have to limit how many times 4029 // they are flushed. Take the budget of the parent trace and distribute 4030 // it fairly amongst the children. 4031 if (new_trace.actions() != NULL) { 4032 new_trace.set_flush_budget(new_flush_budget); 4033 } 4034 bool next_expects_preload = 4035 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 4036 EmitOutOfLineContinuation(compiler, 4037 &new_trace, 4038 alternatives_->at(i), 4039 alt_gen, 4040 preload.preload_characters_, 4041 next_expects_preload); 4042 } 4043 } 4044 4045 4046 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, 4047 Trace* trace, 4048 AlternativeGenerationList* alt_gens, 4049 PreloadState* preload, 4050 GreedyLoopState* greedy_loop_state, 4051 int text_length) { 4052 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4053 // Here we have special handling for greedy loops containing only text nodes 4054 // and other simple nodes. These are handled by pushing the current 4055 // position on the stack and then incrementing the current position each 4056 // time around the switch. On backtrack we decrement the current position 4057 // and check it against the pushed value. This avoids pushing backtrack 4058 // information for each iteration of the loop, which could take up a lot of 4059 // space. 4060 DCHECK(trace->stop_node() == NULL); 4061 macro_assembler->PushCurrentPosition(); 4062 Label greedy_match_failed; 4063 Trace greedy_match_trace; 4064 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); 4065 greedy_match_trace.set_backtrack(&greedy_match_failed); 4066 Label loop_label; 4067 macro_assembler->Bind(&loop_label); 4068 greedy_match_trace.set_stop_node(this); 4069 greedy_match_trace.set_loop_label(&loop_label); 4070 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 4071 macro_assembler->Bind(&greedy_match_failed); 4072 4073 Label second_choice; // For use in greedy matches. 4074 macro_assembler->Bind(&second_choice); 4075 4076 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 4077 4078 EmitChoices(compiler, 4079 alt_gens, 4080 1, 4081 new_trace, 4082 preload); 4083 4084 macro_assembler->Bind(greedy_loop_state->label()); 4085 // If we have unwound to the bottom then backtrack. 4086 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4087 // Otherwise try the second priority at an earlier position. 4088 macro_assembler->AdvanceCurrentPosition(-text_length); 4089 macro_assembler->GoTo(&second_choice); 4090 return new_trace; 4091 } 4092 4093 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 4094 Trace* trace) { 4095 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 4096 if (alternatives_->length() != 2) return eats_at_least; 4097 4098 GuardedAlternative alt1 = alternatives_->at(1); 4099 if (alt1.guards() != NULL && alt1.guards()->length() != 0) { 4100 return eats_at_least; 4101 } 4102 RegExpNode* eats_anything_node = alt1.node(); 4103 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 4104 return eats_at_least; 4105 } 4106 4107 // Really we should be creating a new trace when we execute this function, 4108 // but there is no need, because the code it generates cannot backtrack, and 4109 // we always arrive here with a trivial trace (since it's the entry to a 4110 // loop. That also implies that there are no preloaded characters, which is 4111 // good, because it means we won't be violating any assumptions by 4112 // overwriting those characters with new load instructions. 4113 DCHECK(trace->is_trivial()); 4114 4115 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4116 Isolate* isolate = macro_assembler->isolate(); 4117 // At this point we know that we are at a non-greedy loop that will eat 4118 // any character one at a time. Any non-anchored regexp has such a 4119 // loop prepended to it in order to find where it starts. We look for 4120 // a pattern of the form ...abc... where we can look 6 characters ahead 4121 // and step forwards 3 if the character is not one of abc. Abc need 4122 // not be atoms, they can be any reasonably limited character class or 4123 // small alternation. 4124 BoyerMooreLookahead* bm = bm_info(false); 4125 if (bm == NULL) { 4126 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4127 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4128 kRecursionBudget, 4129 false)); 4130 if (eats_at_least >= 1) { 4131 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4132 compiler, 4133 zone()); 4134 GuardedAlternative alt0 = alternatives_->at(0); 4135 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4136 } 4137 } 4138 if (bm != NULL) { 4139 bm->EmitSkipInstructions(macro_assembler); 4140 } 4141 return eats_at_least; 4142 } 4143 4144 4145 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4146 AlternativeGenerationList* alt_gens, 4147 int first_choice, 4148 Trace* trace, 4149 PreloadState* preload) { 4150 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4151 SetUpPreLoad(compiler, trace, preload); 4152 4153 // For now we just call all choices one after the other. The idea ultimately 4154 // is to use the Dispatch table to try only the relevant ones. 4155 int choice_count = alternatives_->length(); 4156 4157 int new_flush_budget = trace->flush_budget() / choice_count; 4158 4159 for (int i = first_choice; i < choice_count; i++) { 4160 bool is_last = i == choice_count - 1; 4161 bool fall_through_on_failure = !is_last; 4162 GuardedAlternative alternative = alternatives_->at(i); 4163 AlternativeGeneration* alt_gen = alt_gens->at(i); 4164 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 4165 ZoneList<Guard*>* guards = alternative.guards(); 4166 int guard_count = (guards == NULL) ? 0 : guards->length(); 4167 Trace new_trace(*trace); 4168 new_trace.set_characters_preloaded(preload->preload_is_current_ ? 4169 preload->preload_characters_ : 4170 0); 4171 if (preload->preload_has_checked_bounds_) { 4172 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4173 } 4174 new_trace.quick_check_performed()->Clear(); 4175 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4176 if (!is_last) { 4177 new_trace.set_backtrack(&alt_gen->after); 4178 } 4179 alt_gen->expects_preload = preload->preload_is_current_; 4180 bool generate_full_check_inline = false; 4181 if (compiler->optimize() && 4182 try_to_emit_quick_check_for_alternative(i == 0) && 4183 alternative.node()->EmitQuickCheck( 4184 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 4185 &alt_gen->possible_success, &alt_gen->quick_check_details, 4186 fall_through_on_failure)) { 4187 // Quick check was generated for this choice. 4188 preload->preload_is_current_ = true; 4189 preload->preload_has_checked_bounds_ = true; 4190 // If we generated the quick check to fall through on possible success, 4191 // we now need to generate the full check inline. 4192 if (!fall_through_on_failure) { 4193 macro_assembler->Bind(&alt_gen->possible_success); 4194 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4195 new_trace.set_characters_preloaded(preload->preload_characters_); 4196 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4197 generate_full_check_inline = true; 4198 } 4199 } else if (alt_gen->quick_check_details.cannot_match()) { 4200 if (!fall_through_on_failure) { 4201 macro_assembler->GoTo(trace->backtrack()); 4202 } 4203 continue; 4204 } else { 4205 // No quick check was generated. Put the full code here. 4206 // If this is not the first choice then there could be slow checks from 4207 // previous cases that go here when they fail. There's no reason to 4208 // insist that they preload characters since the slow check we are about 4209 // to generate probably can't use it. 4210 if (i != first_choice) { 4211 alt_gen->expects_preload = false; 4212 new_trace.InvalidateCurrentCharacter(); 4213 } 4214 generate_full_check_inline = true; 4215 } 4216 if (generate_full_check_inline) { 4217 if (new_trace.actions() != NULL) { 4218 new_trace.set_flush_budget(new_flush_budget); 4219 } 4220 for (int j = 0; j < guard_count; j++) { 4221 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4222 } 4223 alternative.node()->Emit(compiler, &new_trace); 4224 preload->preload_is_current_ = false; 4225 } 4226 macro_assembler->Bind(&alt_gen->after); 4227 } 4228 } 4229 4230 4231 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4232 Trace* trace, 4233 GuardedAlternative alternative, 4234 AlternativeGeneration* alt_gen, 4235 int preload_characters, 4236 bool next_expects_preload) { 4237 if (!alt_gen->possible_success.is_linked()) return; 4238 4239 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4240 macro_assembler->Bind(&alt_gen->possible_success); 4241 Trace out_of_line_trace(*trace); 4242 out_of_line_trace.set_characters_preloaded(preload_characters); 4243 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4244 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4245 ZoneList<Guard*>* guards = alternative.guards(); 4246 int guard_count = (guards == NULL) ? 0 : guards->length(); 4247 if (next_expects_preload) { 4248 Label reload_current_char; 4249 out_of_line_trace.set_backtrack(&reload_current_char); 4250 for (int j = 0; j < guard_count; j++) { 4251 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4252 } 4253 alternative.node()->Emit(compiler, &out_of_line_trace); 4254 macro_assembler->Bind(&reload_current_char); 4255 // Reload the current character, since the next quick check expects that. 4256 // We don't need to check bounds here because we only get into this 4257 // code through a quick check which already did the checked load. 4258 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4259 NULL, 4260 false, 4261 preload_characters); 4262 macro_assembler->GoTo(&(alt_gen->after)); 4263 } else { 4264 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4265 for (int j = 0; j < guard_count; j++) { 4266 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4267 } 4268 alternative.node()->Emit(compiler, &out_of_line_trace); 4269 } 4270 } 4271 4272 4273 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4274 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4275 LimitResult limit_result = LimitVersions(compiler, trace); 4276 if (limit_result == DONE) return; 4277 DCHECK(limit_result == CONTINUE); 4278 4279 RecursionCheck rc(compiler); 4280 4281 switch (action_type_) { 4282 case STORE_POSITION: { 4283 Trace::DeferredCapture 4284 new_capture(data_.u_position_register.reg, 4285 data_.u_position_register.is_capture, 4286 trace); 4287 Trace new_trace = *trace; 4288 new_trace.add_action(&new_capture); 4289 on_success()->Emit(compiler, &new_trace); 4290 break; 4291 } 4292 case INCREMENT_REGISTER: { 4293 Trace::DeferredIncrementRegister 4294 new_increment(data_.u_increment_register.reg); 4295 Trace new_trace = *trace; 4296 new_trace.add_action(&new_increment); 4297 on_success()->Emit(compiler, &new_trace); 4298 break; 4299 } 4300 case SET_REGISTER: { 4301 Trace::DeferredSetRegister 4302 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4303 Trace new_trace = *trace; 4304 new_trace.add_action(&new_set); 4305 on_success()->Emit(compiler, &new_trace); 4306 break; 4307 } 4308 case CLEAR_CAPTURES: { 4309 Trace::DeferredClearCaptures 4310 new_capture(Interval(data_.u_clear_captures.range_from, 4311 data_.u_clear_captures.range_to)); 4312 Trace new_trace = *trace; 4313 new_trace.add_action(&new_capture); 4314 on_success()->Emit(compiler, &new_trace); 4315 break; 4316 } 4317 case BEGIN_SUBMATCH: 4318 if (!trace->is_trivial()) { 4319 trace->Flush(compiler, this); 4320 } else { 4321 assembler->WriteCurrentPositionToRegister( 4322 data_.u_submatch.current_position_register, 0); 4323 assembler->WriteStackPointerToRegister( 4324 data_.u_submatch.stack_pointer_register); 4325 on_success()->Emit(compiler, trace); 4326 } 4327 break; 4328 case EMPTY_MATCH_CHECK: { 4329 int start_pos_reg = data_.u_empty_match_check.start_register; 4330 int stored_pos = 0; 4331 int rep_reg = data_.u_empty_match_check.repetition_register; 4332 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4333 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4334 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4335 // If we know we haven't advanced and there is no minimum we 4336 // can just backtrack immediately. 4337 assembler->GoTo(trace->backtrack()); 4338 } else if (know_dist && stored_pos < trace->cp_offset()) { 4339 // If we know we've advanced we can generate the continuation 4340 // immediately. 4341 on_success()->Emit(compiler, trace); 4342 } else if (!trace->is_trivial()) { 4343 trace->Flush(compiler, this); 4344 } else { 4345 Label skip_empty_check; 4346 // If we have a minimum number of repetitions we check the current 4347 // number first and skip the empty check if it's not enough. 4348 if (has_minimum) { 4349 int limit = data_.u_empty_match_check.repetition_limit; 4350 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4351 } 4352 // If the match is empty we bail out, otherwise we fall through 4353 // to the on-success continuation. 4354 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4355 trace->backtrack()); 4356 assembler->Bind(&skip_empty_check); 4357 on_success()->Emit(compiler, trace); 4358 } 4359 break; 4360 } 4361 case POSITIVE_SUBMATCH_SUCCESS: { 4362 if (!trace->is_trivial()) { 4363 trace->Flush(compiler, this); 4364 return; 4365 } 4366 assembler->ReadCurrentPositionFromRegister( 4367 data_.u_submatch.current_position_register); 4368 assembler->ReadStackPointerFromRegister( 4369 data_.u_submatch.stack_pointer_register); 4370 int clear_register_count = data_.u_submatch.clear_register_count; 4371 if (clear_register_count == 0) { 4372 on_success()->Emit(compiler, trace); 4373 return; 4374 } 4375 int clear_registers_from = data_.u_submatch.clear_register_from; 4376 Label clear_registers_backtrack; 4377 Trace new_trace = *trace; 4378 new_trace.set_backtrack(&clear_registers_backtrack); 4379 on_success()->Emit(compiler, &new_trace); 4380 4381 assembler->Bind(&clear_registers_backtrack); 4382 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4383 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4384 4385 DCHECK(trace->backtrack() == NULL); 4386 assembler->Backtrack(); 4387 return; 4388 } 4389 default: 4390 UNREACHABLE(); 4391 } 4392 } 4393 4394 4395 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4396 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4397 if (!trace->is_trivial()) { 4398 trace->Flush(compiler, this); 4399 return; 4400 } 4401 4402 LimitResult limit_result = LimitVersions(compiler, trace); 4403 if (limit_result == DONE) return; 4404 DCHECK(limit_result == CONTINUE); 4405 4406 RecursionCheck rc(compiler); 4407 4408 DCHECK_EQ(start_reg_ + 1, end_reg_); 4409 if (compiler->ignore_case()) { 4410 assembler->CheckNotBackReferenceIgnoreCase( 4411 start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); 4412 } else { 4413 assembler->CheckNotBackReference(start_reg_, read_backward(), 4414 trace->backtrack()); 4415 } 4416 // We are going to advance backward, so we may end up at the start. 4417 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 4418 4419 // Check that the back reference does not end inside a surrogate pair. 4420 if (compiler->unicode() && !compiler->one_byte()) { 4421 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); 4422 } 4423 on_success()->Emit(compiler, trace); 4424 } 4425 4426 4427 // ------------------------------------------------------------------- 4428 // Dot/dotty output 4429 4430 4431 #ifdef DEBUG 4432 4433 4434 class DotPrinter: public NodeVisitor { 4435 public: 4436 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT 4437 : os_(os), 4438 ignore_case_(ignore_case) {} 4439 void PrintNode(const char* label, RegExpNode* node); 4440 void Visit(RegExpNode* node); 4441 void PrintAttributes(RegExpNode* from); 4442 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4443 #define DECLARE_VISIT(Type) \ 4444 virtual void Visit##Type(Type##Node* that); 4445 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4446 #undef DECLARE_VISIT 4447 private: 4448 std::ostream& os_; 4449 bool ignore_case_; 4450 }; 4451 4452 4453 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4454 os_ << "digraph G {\n graph [label=\""; 4455 for (int i = 0; label[i]; i++) { 4456 switch (label[i]) { 4457 case '\\': 4458 os_ << "\\\\"; 4459 break; 4460 case '"': 4461 os_ << "\""; 4462 break; 4463 default: 4464 os_ << label[i]; 4465 break; 4466 } 4467 } 4468 os_ << "\"];\n"; 4469 Visit(node); 4470 os_ << "}" << std::endl; 4471 } 4472 4473 4474 void DotPrinter::Visit(RegExpNode* node) { 4475 if (node->info()->visited) return; 4476 node->info()->visited = true; 4477 node->Accept(this); 4478 } 4479 4480 4481 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4482 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; 4483 Visit(on_failure); 4484 } 4485 4486 4487 class TableEntryBodyPrinter { 4488 public: 4489 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT 4490 : os_(os), 4491 choice_(choice) {} 4492 void Call(uc16 from, DispatchTable::Entry entry) { 4493 OutSet* out_set = entry.out_set(); 4494 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4495 if (out_set->Get(i)) { 4496 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n" 4497 << choice()->alternatives()->at(i).node() << ";\n"; 4498 } 4499 } 4500 } 4501 private: 4502 ChoiceNode* choice() { return choice_; } 4503 std::ostream& os_; 4504 ChoiceNode* choice_; 4505 }; 4506 4507 4508 class TableEntryHeaderPrinter { 4509 public: 4510 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT 4511 : first_(true), 4512 os_(os) {} 4513 void Call(uc16 from, DispatchTable::Entry entry) { 4514 if (first_) { 4515 first_ = false; 4516 } else { 4517 os_ << "|"; 4518 } 4519 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{"; 4520 OutSet* out_set = entry.out_set(); 4521 int priority = 0; 4522 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4523 if (out_set->Get(i)) { 4524 if (priority > 0) os_ << "|"; 4525 os_ << "<s" << from << "o" << i << "> " << priority; 4526 priority++; 4527 } 4528 } 4529 os_ << "}}"; 4530 } 4531 4532 private: 4533 bool first_; 4534 std::ostream& os_; 4535 }; 4536 4537 4538 class AttributePrinter { 4539 public: 4540 explicit AttributePrinter(std::ostream& os) // NOLINT 4541 : os_(os), 4542 first_(true) {} 4543 void PrintSeparator() { 4544 if (first_) { 4545 first_ = false; 4546 } else { 4547 os_ << "|"; 4548 } 4549 } 4550 void PrintBit(const char* name, bool value) { 4551 if (!value) return; 4552 PrintSeparator(); 4553 os_ << "{" << name << "}"; 4554 } 4555 void PrintPositive(const char* name, int value) { 4556 if (value < 0) return; 4557 PrintSeparator(); 4558 os_ << "{" << name << "|" << value << "}"; 4559 } 4560 4561 private: 4562 std::ostream& os_; 4563 bool first_; 4564 }; 4565 4566 4567 void DotPrinter::PrintAttributes(RegExpNode* that) { 4568 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " 4569 << "margin=0.1, fontsize=10, label=\"{"; 4570 AttributePrinter printer(os_); 4571 NodeInfo* info = that->info(); 4572 printer.PrintBit("NI", info->follows_newline_interest); 4573 printer.PrintBit("WI", info->follows_word_interest); 4574 printer.PrintBit("SI", info->follows_start_interest); 4575 Label* label = that->label(); 4576 if (label->is_bound()) 4577 printer.PrintPositive("@", label->pos()); 4578 os_ << "}\"];\n" 4579 << " a" << that << " -> n" << that 4580 << " [style=dashed, color=grey, arrowhead=none];\n"; 4581 } 4582 4583 4584 static const bool kPrintDispatchTable = false; 4585 void DotPrinter::VisitChoice(ChoiceNode* that) { 4586 if (kPrintDispatchTable) { 4587 os_ << " n" << that << " [shape=Mrecord, label=\""; 4588 TableEntryHeaderPrinter header_printer(os_); 4589 that->GetTable(ignore_case_)->ForEach(&header_printer); 4590 os_ << "\"]\n"; 4591 PrintAttributes(that); 4592 TableEntryBodyPrinter body_printer(os_, that); 4593 that->GetTable(ignore_case_)->ForEach(&body_printer); 4594 } else { 4595 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; 4596 for (int i = 0; i < that->alternatives()->length(); i++) { 4597 GuardedAlternative alt = that->alternatives()->at(i); 4598 os_ << " n" << that << " -> n" << alt.node(); 4599 } 4600 } 4601 for (int i = 0; i < that->alternatives()->length(); i++) { 4602 GuardedAlternative alt = that->alternatives()->at(i); 4603 alt.node()->Accept(this); 4604 } 4605 } 4606 4607 4608 void DotPrinter::VisitText(TextNode* that) { 4609 Zone* zone = that->zone(); 4610 os_ << " n" << that << " [label=\""; 4611 for (int i = 0; i < that->elements()->length(); i++) { 4612 if (i > 0) os_ << " "; 4613 TextElement elm = that->elements()->at(i); 4614 switch (elm.text_type()) { 4615 case TextElement::ATOM: { 4616 Vector<const uc16> data = elm.atom()->data(); 4617 for (int i = 0; i < data.length(); i++) { 4618 os_ << static_cast<char>(data[i]); 4619 } 4620 break; 4621 } 4622 case TextElement::CHAR_CLASS: { 4623 RegExpCharacterClass* node = elm.char_class(); 4624 os_ << "["; 4625 if (node->is_negated()) os_ << "^"; 4626 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4627 CharacterRange range = node->ranges(zone)->at(j); 4628 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); 4629 } 4630 os_ << "]"; 4631 break; 4632 } 4633 default: 4634 UNREACHABLE(); 4635 } 4636 } 4637 os_ << "\", shape=box, peripheries=2];\n"; 4638 PrintAttributes(that); 4639 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4640 Visit(that->on_success()); 4641 } 4642 4643 4644 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4645 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" 4646 << that->end_register() << "\", shape=doubleoctagon];\n"; 4647 PrintAttributes(that); 4648 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4649 Visit(that->on_success()); 4650 } 4651 4652 4653 void DotPrinter::VisitEnd(EndNode* that) { 4654 os_ << " n" << that << " [style=bold, shape=point];\n"; 4655 PrintAttributes(that); 4656 } 4657 4658 4659 void DotPrinter::VisitAssertion(AssertionNode* that) { 4660 os_ << " n" << that << " ["; 4661 switch (that->assertion_type()) { 4662 case AssertionNode::AT_END: 4663 os_ << "label=\"$\", shape=septagon"; 4664 break; 4665 case AssertionNode::AT_START: 4666 os_ << "label=\"^\", shape=septagon"; 4667 break; 4668 case AssertionNode::AT_BOUNDARY: 4669 os_ << "label=\"\\b\", shape=septagon"; 4670 break; 4671 case AssertionNode::AT_NON_BOUNDARY: 4672 os_ << "label=\"\\B\", shape=septagon"; 4673 break; 4674 case AssertionNode::AFTER_NEWLINE: 4675 os_ << "label=\"(?<=\\n)\", shape=septagon"; 4676 break; 4677 } 4678 os_ << "];\n"; 4679 PrintAttributes(that); 4680 RegExpNode* successor = that->on_success(); 4681 os_ << " n" << that << " -> n" << successor << ";\n"; 4682 Visit(successor); 4683 } 4684 4685 4686 void DotPrinter::VisitAction(ActionNode* that) { 4687 os_ << " n" << that << " ["; 4688 switch (that->action_type_) { 4689 case ActionNode::SET_REGISTER: 4690 os_ << "label=\"$" << that->data_.u_store_register.reg 4691 << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; 4692 break; 4693 case ActionNode::INCREMENT_REGISTER: 4694 os_ << "label=\"$" << that->data_.u_increment_register.reg 4695 << "++\", shape=octagon"; 4696 break; 4697 case ActionNode::STORE_POSITION: 4698 os_ << "label=\"$" << that->data_.u_position_register.reg 4699 << ":=$pos\", shape=octagon"; 4700 break; 4701 case ActionNode::BEGIN_SUBMATCH: 4702 os_ << "label=\"$" << that->data_.u_submatch.current_position_register 4703 << ":=$pos,begin\", shape=septagon"; 4704 break; 4705 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4706 os_ << "label=\"escape\", shape=septagon"; 4707 break; 4708 case ActionNode::EMPTY_MATCH_CHECK: 4709 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register 4710 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register 4711 << "<" << that->data_.u_empty_match_check.repetition_limit 4712 << "?\", shape=septagon"; 4713 break; 4714 case ActionNode::CLEAR_CAPTURES: { 4715 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from 4716 << " to $" << that->data_.u_clear_captures.range_to 4717 << "\", shape=septagon"; 4718 break; 4719 } 4720 } 4721 os_ << "];\n"; 4722 PrintAttributes(that); 4723 RegExpNode* successor = that->on_success(); 4724 os_ << " n" << that << " -> n" << successor << ";\n"; 4725 Visit(successor); 4726 } 4727 4728 4729 class DispatchTableDumper { 4730 public: 4731 explicit DispatchTableDumper(std::ostream& os) : os_(os) {} 4732 void Call(uc16 key, DispatchTable::Entry entry); 4733 private: 4734 std::ostream& os_; 4735 }; 4736 4737 4738 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4739 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {"; 4740 OutSet* set = entry.out_set(); 4741 bool first = true; 4742 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4743 if (set->Get(i)) { 4744 if (first) { 4745 first = false; 4746 } else { 4747 os_ << ", "; 4748 } 4749 os_ << i; 4750 } 4751 } 4752 os_ << "}\n"; 4753 } 4754 4755 4756 void DispatchTable::Dump() { 4757 OFStream os(stderr); 4758 DispatchTableDumper dumper(os); 4759 tree()->ForEach(&dumper); 4760 } 4761 4762 4763 void RegExpEngine::DotPrint(const char* label, 4764 RegExpNode* node, 4765 bool ignore_case) { 4766 OFStream os(stdout); 4767 DotPrinter printer(os, ignore_case); 4768 printer.PrintNode(label, node); 4769 } 4770 4771 4772 #endif // DEBUG 4773 4774 4775 // ------------------------------------------------------------------- 4776 // Tree to graph conversion 4777 4778 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4779 RegExpNode* on_success) { 4780 ZoneList<TextElement>* elms = 4781 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4782 elms->Add(TextElement::Atom(this), compiler->zone()); 4783 return new (compiler->zone()) 4784 TextNode(elms, compiler->read_backward(), on_success); 4785 } 4786 4787 4788 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4789 RegExpNode* on_success) { 4790 return new (compiler->zone()) 4791 TextNode(elements(), compiler->read_backward(), on_success); 4792 } 4793 4794 4795 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4796 const int* special_class, 4797 int length) { 4798 length--; // Remove final marker. 4799 DCHECK(special_class[length] == kRangeEndMarker); 4800 DCHECK(ranges->length() != 0); 4801 DCHECK(length != 0); 4802 DCHECK(special_class[0] != 0); 4803 if (ranges->length() != (length >> 1) + 1) { 4804 return false; 4805 } 4806 CharacterRange range = ranges->at(0); 4807 if (range.from() != 0) { 4808 return false; 4809 } 4810 for (int i = 0; i < length; i += 2) { 4811 if (special_class[i] != (range.to() + 1)) { 4812 return false; 4813 } 4814 range = ranges->at((i >> 1) + 1); 4815 if (special_class[i+1] != range.from()) { 4816 return false; 4817 } 4818 } 4819 if (range.to() != String::kMaxCodePoint) { 4820 return false; 4821 } 4822 return true; 4823 } 4824 4825 4826 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4827 const int* special_class, 4828 int length) { 4829 length--; // Remove final marker. 4830 DCHECK(special_class[length] == kRangeEndMarker); 4831 if (ranges->length() * 2 != length) { 4832 return false; 4833 } 4834 for (int i = 0; i < length; i += 2) { 4835 CharacterRange range = ranges->at(i >> 1); 4836 if (range.from() != special_class[i] || 4837 range.to() != special_class[i + 1] - 1) { 4838 return false; 4839 } 4840 } 4841 return true; 4842 } 4843 4844 4845 bool RegExpCharacterClass::is_standard(Zone* zone) { 4846 // TODO(lrn): Remove need for this function, by not throwing away information 4847 // along the way. 4848 if (is_negated_) { 4849 return false; 4850 } 4851 if (set_.is_standard()) { 4852 return true; 4853 } 4854 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4855 set_.set_standard_set_type('s'); 4856 return true; 4857 } 4858 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4859 set_.set_standard_set_type('S'); 4860 return true; 4861 } 4862 if (CompareInverseRanges(set_.ranges(zone), 4863 kLineTerminatorRanges, 4864 kLineTerminatorRangeCount)) { 4865 set_.set_standard_set_type('.'); 4866 return true; 4867 } 4868 if (CompareRanges(set_.ranges(zone), 4869 kLineTerminatorRanges, 4870 kLineTerminatorRangeCount)) { 4871 set_.set_standard_set_type('n'); 4872 return true; 4873 } 4874 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4875 set_.set_standard_set_type('w'); 4876 return true; 4877 } 4878 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4879 set_.set_standard_set_type('W'); 4880 return true; 4881 } 4882 return false; 4883 } 4884 4885 4886 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, 4887 ZoneList<CharacterRange>* base) 4888 : zone_(zone), 4889 table_(zone), 4890 bmp_(nullptr), 4891 lead_surrogates_(nullptr), 4892 trail_surrogates_(nullptr), 4893 non_bmp_(nullptr) { 4894 // The unicode range splitter categorizes given character ranges into: 4895 // - Code points from the BMP representable by one code unit. 4896 // - Code points outside the BMP that need to be split into surrogate pairs. 4897 // - Lone lead surrogates. 4898 // - Lone trail surrogates. 4899 // Lone surrogates are valid code points, even though no actual characters. 4900 // They require special matching to make sure we do not split surrogate pairs. 4901 // We use the dispatch table to accomplish this. The base range is split up 4902 // by the table by the overlay ranges, and the Call callback is used to 4903 // filter and collect ranges for each category. 4904 for (int i = 0; i < base->length(); i++) { 4905 table_.AddRange(base->at(i), kBase, zone_); 4906 } 4907 // Add overlay ranges. 4908 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1), 4909 kBmpCodePoints, zone_); 4910 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), 4911 kLeadSurrogates, zone_); 4912 table_.AddRange( 4913 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 4914 kTrailSurrogates, zone_); 4915 table_.AddRange( 4916 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1), 4917 kBmpCodePoints, zone_); 4918 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd), 4919 kNonBmpCodePoints, zone_); 4920 table_.ForEach(this); 4921 } 4922 4923 4924 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { 4925 OutSet* outset = entry.out_set(); 4926 if (!outset->Get(kBase)) return; 4927 ZoneList<CharacterRange>** target = NULL; 4928 if (outset->Get(kBmpCodePoints)) { 4929 target = &bmp_; 4930 } else if (outset->Get(kLeadSurrogates)) { 4931 target = &lead_surrogates_; 4932 } else if (outset->Get(kTrailSurrogates)) { 4933 target = &trail_surrogates_; 4934 } else { 4935 DCHECK(outset->Get(kNonBmpCodePoints)); 4936 target = &non_bmp_; 4937 } 4938 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_); 4939 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); 4940 } 4941 4942 4943 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 4944 RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 4945 ZoneList<CharacterRange>* bmp = splitter->bmp(); 4946 if (bmp == nullptr) return; 4947 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 4948 compiler->zone(), bmp, compiler->read_backward(), on_success))); 4949 } 4950 4951 4952 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 4953 RegExpNode* on_success, 4954 UnicodeRangeSplitter* splitter) { 4955 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp(); 4956 if (non_bmp == nullptr) return; 4957 DCHECK(compiler->unicode()); 4958 DCHECK(!compiler->one_byte()); 4959 Zone* zone = compiler->zone(); 4960 CharacterRange::Canonicalize(non_bmp); 4961 for (int i = 0; i < non_bmp->length(); i++) { 4962 // Match surrogate pair. 4963 // E.g. [\u10005-\u11005] becomes 4964 // \ud800[\udc05-\udfff]| 4965 // [\ud801-\ud803][\udc00-\udfff]| 4966 // \ud804[\udc00-\udc05] 4967 uc32 from = non_bmp->at(i).from(); 4968 uc32 to = non_bmp->at(i).to(); 4969 uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 4970 uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 4971 uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 4972 uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 4973 if (from_l == to_l) { 4974 // The lead surrogate is the same. 4975 result->AddAlternative( 4976 GuardedAlternative(TextNode::CreateForSurrogatePair( 4977 zone, CharacterRange::Singleton(from_l), 4978 CharacterRange::Range(from_t, to_t), compiler->read_backward(), 4979 on_success))); 4980 } else { 4981 if (from_t != kTrailSurrogateStart) { 4982 // Add [from_l][from_t-\udfff] 4983 result->AddAlternative( 4984 GuardedAlternative(TextNode::CreateForSurrogatePair( 4985 zone, CharacterRange::Singleton(from_l), 4986 CharacterRange::Range(from_t, kTrailSurrogateEnd), 4987 compiler->read_backward(), on_success))); 4988 from_l++; 4989 } 4990 if (to_t != kTrailSurrogateEnd) { 4991 // Add [to_l][\udc00-to_t] 4992 result->AddAlternative( 4993 GuardedAlternative(TextNode::CreateForSurrogatePair( 4994 zone, CharacterRange::Singleton(to_l), 4995 CharacterRange::Range(kTrailSurrogateStart, to_t), 4996 compiler->read_backward(), on_success))); 4997 to_l--; 4998 } 4999 if (from_l <= to_l) { 5000 // Add [from_l-to_l][\udc00-\udfff] 5001 result->AddAlternative( 5002 GuardedAlternative(TextNode::CreateForSurrogatePair( 5003 zone, CharacterRange::Range(from_l, to_l), 5004 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 5005 compiler->read_backward(), on_success))); 5006 } 5007 } 5008 } 5009 } 5010 5011 5012 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 5013 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 5014 ZoneList<CharacterRange>* match, RegExpNode* on_success, 5015 bool read_backward) { 5016 Zone* zone = compiler->zone(); 5017 RegExpNode* match_node = TextNode::CreateForCharacterRanges( 5018 zone, match, read_backward, on_success); 5019 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5020 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5021 RegExpLookaround::Builder lookaround(false, match_node, stack_register, 5022 position_register); 5023 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5024 zone, lookbehind, !read_backward, lookaround.on_match_success()); 5025 return lookaround.ForMatch(negative_match); 5026 } 5027 5028 5029 RegExpNode* MatchAndNegativeLookaroundInReadDirection( 5030 RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 5031 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 5032 bool read_backward) { 5033 Zone* zone = compiler->zone(); 5034 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5035 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5036 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 5037 position_register); 5038 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5039 zone, lookahead, read_backward, lookaround.on_match_success()); 5040 return TextNode::CreateForCharacterRanges( 5041 zone, match, read_backward, lookaround.ForMatch(negative_match)); 5042 } 5043 5044 5045 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5046 RegExpNode* on_success, 5047 UnicodeRangeSplitter* splitter) { 5048 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates(); 5049 if (lead_surrogates == nullptr) return; 5050 Zone* zone = compiler->zone(); 5051 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 5052 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 5053 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 5054 5055 RegExpNode* match; 5056 if (compiler->read_backward()) { 5057 // Reading backward. Assert that reading forward, there is no trail 5058 // surrogate, and then backward match the lead surrogate. 5059 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5060 compiler, trail_surrogates, lead_surrogates, on_success, true); 5061 } else { 5062 // Reading forward. Forward match the lead surrogate and assert that 5063 // no trail surrogate follows. 5064 match = MatchAndNegativeLookaroundInReadDirection( 5065 compiler, lead_surrogates, trail_surrogates, on_success, false); 5066 } 5067 result->AddAlternative(GuardedAlternative(match)); 5068 } 5069 5070 5071 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5072 RegExpNode* on_success, 5073 UnicodeRangeSplitter* splitter) { 5074 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates(); 5075 if (trail_surrogates == nullptr) return; 5076 Zone* zone = compiler->zone(); 5077 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 5078 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 5079 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 5080 5081 RegExpNode* match; 5082 if (compiler->read_backward()) { 5083 // Reading backward. Backward match the trail surrogate and assert that no 5084 // lead surrogate precedes it. 5085 match = MatchAndNegativeLookaroundInReadDirection( 5086 compiler, trail_surrogates, lead_surrogates, on_success, true); 5087 } else { 5088 // Reading forward. Assert that reading backward, there is no lead 5089 // surrogate, and then forward match the trail surrogate. 5090 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5091 compiler, lead_surrogates, trail_surrogates, on_success, false); 5092 } 5093 result->AddAlternative(GuardedAlternative(match)); 5094 } 5095 5096 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 5097 RegExpNode* on_success) { 5098 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 5099 DCHECK(!compiler->read_backward()); 5100 Zone* zone = compiler->zone(); 5101 // Advance any character. If the character happens to be a lead surrogate and 5102 // we advanced into the middle of a surrogate pair, it will work out, as 5103 // nothing will match from there. We will have to advance again, consuming 5104 // the associated trail surrogate. 5105 ZoneList<CharacterRange>* range = CharacterRange::List( 5106 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); 5107 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 5108 } 5109 5110 5111 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, 5112 ZoneList<CharacterRange>* ranges) { 5113 #ifdef V8_I18N_SUPPORT 5114 // Use ICU to compute the case fold closure over the ranges. 5115 DCHECK(compiler->unicode()); 5116 DCHECK(compiler->ignore_case()); 5117 USet* set = uset_openEmpty(); 5118 for (int i = 0; i < ranges->length(); i++) { 5119 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to()); 5120 } 5121 ranges->Clear(); 5122 uset_closeOver(set, USET_CASE_INSENSITIVE); 5123 // Full case mapping map single characters to multiple characters. 5124 // Those are represented as strings in the set. Remove them so that 5125 // we end up with only simple and common case mappings. 5126 uset_removeAllStrings(set); 5127 int item_count = uset_getItemCount(set); 5128 int item_result = 0; 5129 UErrorCode ec = U_ZERO_ERROR; 5130 Zone* zone = compiler->zone(); 5131 for (int i = 0; i < item_count; i++) { 5132 uc32 start = 0; 5133 uc32 end = 0; 5134 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); 5135 ranges->Add(CharacterRange::Range(start, end), zone); 5136 } 5137 // No errors and everything we collected have been ranges. 5138 DCHECK_EQ(U_ZERO_ERROR, ec); 5139 DCHECK_EQ(0, item_result); 5140 uset_close(set); 5141 #else 5142 // Fallback if ICU is not included. 5143 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), 5144 ranges, compiler->one_byte()); 5145 #endif // V8_I18N_SUPPORT 5146 CharacterRange::Canonicalize(ranges); 5147 } 5148 5149 5150 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5151 RegExpNode* on_success) { 5152 set_.Canonicalize(); 5153 Zone* zone = compiler->zone(); 5154 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5155 if (compiler->unicode() && compiler->ignore_case()) { 5156 AddUnicodeCaseEquivalents(compiler, ranges); 5157 } 5158 if (compiler->unicode() && !compiler->one_byte()) { 5159 if (is_negated()) { 5160 ZoneList<CharacterRange>* negated = 5161 new (zone) ZoneList<CharacterRange>(2, zone); 5162 CharacterRange::Negate(ranges, negated, zone); 5163 ranges = negated; 5164 } 5165 if (ranges->length() == 0) { 5166 ranges->Add(CharacterRange::Everything(), zone); 5167 RegExpCharacterClass* fail = 5168 new (zone) RegExpCharacterClass(ranges, true); 5169 return new (zone) TextNode(fail, compiler->read_backward(), on_success); 5170 } 5171 if (standard_type() == '*') { 5172 return UnanchoredAdvance(compiler, on_success); 5173 } else { 5174 ChoiceNode* result = new (zone) ChoiceNode(2, zone); 5175 UnicodeRangeSplitter splitter(zone, ranges); 5176 AddBmpCharacters(compiler, result, on_success, &splitter); 5177 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5178 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5179 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5180 return result; 5181 } 5182 } else { 5183 return new (zone) TextNode(this, compiler->read_backward(), on_success); 5184 } 5185 } 5186 5187 5188 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5189 RegExpAtom* atom1 = (*a)->AsAtom(); 5190 RegExpAtom* atom2 = (*b)->AsAtom(); 5191 uc16 character1 = atom1->data().at(0); 5192 uc16 character2 = atom2->data().at(0); 5193 if (character1 < character2) return -1; 5194 if (character1 > character2) return 1; 5195 return 0; 5196 } 5197 5198 5199 static unibrow::uchar Canonical( 5200 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5201 unibrow::uchar c) { 5202 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 5203 int length = canonicalize->get(c, '\0', chars); 5204 DCHECK_LE(length, 1); 5205 unibrow::uchar canonical = c; 5206 if (length == 1) canonical = chars[0]; 5207 return canonical; 5208 } 5209 5210 5211 int CompareFirstCharCaseIndependent( 5212 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5213 RegExpTree* const* a, RegExpTree* const* b) { 5214 RegExpAtom* atom1 = (*a)->AsAtom(); 5215 RegExpAtom* atom2 = (*b)->AsAtom(); 5216 unibrow::uchar character1 = atom1->data().at(0); 5217 unibrow::uchar character2 = atom2->data().at(0); 5218 if (character1 == character2) return 0; 5219 if (character1 >= 'a' || character2 >= 'a') { 5220 character1 = Canonical(canonicalize, character1); 5221 character2 = Canonical(canonicalize, character2); 5222 } 5223 return static_cast<int>(character1) - static_cast<int>(character2); 5224 } 5225 5226 5227 // We can stable sort runs of atoms, since the order does not matter if they 5228 // start with different characters. 5229 // Returns true if any consecutive atoms were found. 5230 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 5231 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5232 int length = alternatives->length(); 5233 bool found_consecutive_atoms = false; 5234 for (int i = 0; i < length; i++) { 5235 while (i < length) { 5236 RegExpTree* alternative = alternatives->at(i); 5237 if (alternative->IsAtom()) break; 5238 i++; 5239 } 5240 // i is length or it is the index of an atom. 5241 if (i == length) break; 5242 int first_atom = i; 5243 i++; 5244 while (i < length) { 5245 RegExpTree* alternative = alternatives->at(i); 5246 if (!alternative->IsAtom()) break; 5247 i++; 5248 } 5249 // Sort atoms to get ones with common prefixes together. 5250 // This step is more tricky if we are in a case-independent regexp, 5251 // because it would change /is|I/ to /I|is/, and order matters when 5252 // the regexp parts don't match only disjoint starting points. To fix 5253 // this we have a version of CompareFirstChar that uses case- 5254 // independent character classes for comparison. 5255 DCHECK_LT(first_atom, alternatives->length()); 5256 DCHECK_LE(i, alternatives->length()); 5257 DCHECK_LE(first_atom, i); 5258 if (compiler->ignore_case()) { 5259 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5260 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5261 auto compare_closure = 5262 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { 5263 return CompareFirstCharCaseIndependent(canonicalize, a, b); 5264 }; 5265 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 5266 } else { 5267 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 5268 } 5269 if (i - first_atom > 1) found_consecutive_atoms = true; 5270 } 5271 return found_consecutive_atoms; 5272 } 5273 5274 5275 // Optimizes ab|ac|az to a(?:b|c|d). 5276 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 5277 Zone* zone = compiler->zone(); 5278 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5279 int length = alternatives->length(); 5280 5281 int write_posn = 0; 5282 int i = 0; 5283 while (i < length) { 5284 RegExpTree* alternative = alternatives->at(i); 5285 if (!alternative->IsAtom()) { 5286 alternatives->at(write_posn++) = alternatives->at(i); 5287 i++; 5288 continue; 5289 } 5290 RegExpAtom* atom = alternative->AsAtom(); 5291 unibrow::uchar common_prefix = atom->data().at(0); 5292 int first_with_prefix = i; 5293 int prefix_length = atom->length(); 5294 i++; 5295 while (i < length) { 5296 alternative = alternatives->at(i); 5297 if (!alternative->IsAtom()) break; 5298 atom = alternative->AsAtom(); 5299 unibrow::uchar new_prefix = atom->data().at(0); 5300 if (new_prefix != common_prefix) { 5301 if (!compiler->ignore_case()) break; 5302 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5303 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5304 new_prefix = Canonical(canonicalize, new_prefix); 5305 common_prefix = Canonical(canonicalize, common_prefix); 5306 if (new_prefix != common_prefix) break; 5307 } 5308 prefix_length = Min(prefix_length, atom->length()); 5309 i++; 5310 } 5311 if (i > first_with_prefix + 2) { 5312 // Found worthwhile run of alternatives with common prefix of at least one 5313 // character. The sorting function above did not sort on more than one 5314 // character for reasons of correctness, but there may still be a longer 5315 // common prefix if the terms were similar or presorted in the input. 5316 // Find out how long the common prefix is. 5317 int run_length = i - first_with_prefix; 5318 atom = alternatives->at(first_with_prefix)->AsAtom(); 5319 for (int j = 1; j < run_length && prefix_length > 1; j++) { 5320 RegExpAtom* old_atom = 5321 alternatives->at(j + first_with_prefix)->AsAtom(); 5322 for (int k = 1; k < prefix_length; k++) { 5323 if (atom->data().at(k) != old_atom->data().at(k)) { 5324 prefix_length = k; 5325 break; 5326 } 5327 } 5328 } 5329 RegExpAtom* prefix = 5330 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); 5331 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 5332 pair->Add(prefix, zone); 5333 ZoneList<RegExpTree*>* suffixes = 5334 new (zone) ZoneList<RegExpTree*>(run_length, zone); 5335 for (int j = 0; j < run_length; j++) { 5336 RegExpAtom* old_atom = 5337 alternatives->at(j + first_with_prefix)->AsAtom(); 5338 int len = old_atom->length(); 5339 if (len == prefix_length) { 5340 suffixes->Add(new (zone) RegExpEmpty(), zone); 5341 } else { 5342 RegExpTree* suffix = new (zone) RegExpAtom( 5343 old_atom->data().SubVector(prefix_length, old_atom->length())); 5344 suffixes->Add(suffix, zone); 5345 } 5346 } 5347 pair->Add(new (zone) RegExpDisjunction(suffixes), zone); 5348 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); 5349 } else { 5350 // Just copy any non-worthwhile alternatives. 5351 for (int j = first_with_prefix; j < i; j++) { 5352 alternatives->at(write_posn++) = alternatives->at(j); 5353 } 5354 } 5355 } 5356 alternatives->Rewind(write_posn); // Trim end of array. 5357 } 5358 5359 5360 // Optimizes b|c|z to [bcz]. 5361 void RegExpDisjunction::FixSingleCharacterDisjunctions( 5362 RegExpCompiler* compiler) { 5363 Zone* zone = compiler->zone(); 5364 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5365 int length = alternatives->length(); 5366 5367 int write_posn = 0; 5368 int i = 0; 5369 while (i < length) { 5370 RegExpTree* alternative = alternatives->at(i); 5371 if (!alternative->IsAtom()) { 5372 alternatives->at(write_posn++) = alternatives->at(i); 5373 i++; 5374 continue; 5375 } 5376 RegExpAtom* atom = alternative->AsAtom(); 5377 if (atom->length() != 1) { 5378 alternatives->at(write_posn++) = alternatives->at(i); 5379 i++; 5380 continue; 5381 } 5382 int first_in_run = i; 5383 i++; 5384 while (i < length) { 5385 alternative = alternatives->at(i); 5386 if (!alternative->IsAtom()) break; 5387 atom = alternative->AsAtom(); 5388 if (atom->length() != 1) break; 5389 i++; 5390 } 5391 if (i > first_in_run + 1) { 5392 // Found non-trivial run of single-character alternatives. 5393 int run_length = i - first_in_run; 5394 ZoneList<CharacterRange>* ranges = 5395 new (zone) ZoneList<CharacterRange>(2, zone); 5396 for (int j = 0; j < run_length; j++) { 5397 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 5398 DCHECK_EQ(old_atom->length(), 1); 5399 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 5400 } 5401 alternatives->at(write_posn++) = 5402 new (zone) RegExpCharacterClass(ranges, false); 5403 } else { 5404 // Just copy any trivial alternatives. 5405 for (int j = first_in_run; j < i; j++) { 5406 alternatives->at(write_posn++) = alternatives->at(j); 5407 } 5408 } 5409 } 5410 alternatives->Rewind(write_posn); // Trim end of array. 5411 } 5412 5413 5414 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 5415 RegExpNode* on_success) { 5416 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5417 5418 if (alternatives->length() > 2) { 5419 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 5420 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 5421 FixSingleCharacterDisjunctions(compiler); 5422 if (alternatives->length() == 1) { 5423 return alternatives->at(0)->ToNode(compiler, on_success); 5424 } 5425 } 5426 5427 int length = alternatives->length(); 5428 5429 ChoiceNode* result = 5430 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5431 for (int i = 0; i < length; i++) { 5432 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5433 on_success)); 5434 result->AddAlternative(alternative); 5435 } 5436 return result; 5437 } 5438 5439 5440 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 5441 RegExpNode* on_success) { 5442 return ToNode(min(), 5443 max(), 5444 is_greedy(), 5445 body(), 5446 compiler, 5447 on_success); 5448 } 5449 5450 5451 // Scoped object to keep track of how much we unroll quantifier loops in the 5452 // regexp graph generator. 5453 class RegExpExpansionLimiter { 5454 public: 5455 static const int kMaxExpansionFactor = 6; 5456 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 5457 : compiler_(compiler), 5458 saved_expansion_factor_(compiler->current_expansion_factor()), 5459 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 5460 DCHECK(factor > 0); 5461 if (ok_to_expand_) { 5462 if (factor > kMaxExpansionFactor) { 5463 // Avoid integer overflow of the current expansion factor. 5464 ok_to_expand_ = false; 5465 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 5466 } else { 5467 int new_factor = saved_expansion_factor_ * factor; 5468 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 5469 compiler->set_current_expansion_factor(new_factor); 5470 } 5471 } 5472 } 5473 5474 ~RegExpExpansionLimiter() { 5475 compiler_->set_current_expansion_factor(saved_expansion_factor_); 5476 } 5477 5478 bool ok_to_expand() { return ok_to_expand_; } 5479 5480 private: 5481 RegExpCompiler* compiler_; 5482 int saved_expansion_factor_; 5483 bool ok_to_expand_; 5484 5485 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 5486 }; 5487 5488 5489 RegExpNode* RegExpQuantifier::ToNode(int min, 5490 int max, 5491 bool is_greedy, 5492 RegExpTree* body, 5493 RegExpCompiler* compiler, 5494 RegExpNode* on_success, 5495 bool not_at_start) { 5496 // x{f, t} becomes this: 5497 // 5498 // (r++)<-. 5499 // | ` 5500 // | (x) 5501 // v ^ 5502 // (r=0)-->(?)---/ [if r < t] 5503 // | 5504 // [if r >= f] \----> ... 5505 // 5506 5507 // 15.10.2.5 RepeatMatcher algorithm. 5508 // The parser has already eliminated the case where max is 0. In the case 5509 // where max_match is zero the parser has removed the quantifier if min was 5510 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 5511 5512 // If we know that we cannot match zero length then things are a little 5513 // simpler since we don't need to make the special zero length match check 5514 // from step 2.1. If the min and max are small we can unroll a little in 5515 // this case. 5516 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 5517 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 5518 if (max == 0) return on_success; // This can happen due to recursion. 5519 bool body_can_be_empty = (body->min_match() == 0); 5520 int body_start_reg = RegExpCompiler::kNoRegister; 5521 Interval capture_registers = body->CaptureRegisters(); 5522 bool needs_capture_clearing = !capture_registers.is_empty(); 5523 Zone* zone = compiler->zone(); 5524 5525 if (body_can_be_empty) { 5526 body_start_reg = compiler->AllocateRegister(); 5527 } else if (compiler->optimize() && !needs_capture_clearing) { 5528 // Only unroll if there are no captures and the body can't be 5529 // empty. 5530 { 5531 RegExpExpansionLimiter limiter( 5532 compiler, min + ((max != min) ? 1 : 0)); 5533 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 5534 int new_max = (max == kInfinity) ? max : max - min; 5535 // Recurse once to get the loop or optional matches after the fixed 5536 // ones. 5537 RegExpNode* answer = ToNode( 5538 0, new_max, is_greedy, body, compiler, on_success, true); 5539 // Unroll the forced matches from 0 to min. This can cause chains of 5540 // TextNodes (which the parser does not generate). These should be 5541 // combined if it turns out they hinder good code generation. 5542 for (int i = 0; i < min; i++) { 5543 answer = body->ToNode(compiler, answer); 5544 } 5545 return answer; 5546 } 5547 } 5548 if (max <= kMaxUnrolledMaxMatches && min == 0) { 5549 DCHECK(max > 0); // Due to the 'if' above. 5550 RegExpExpansionLimiter limiter(compiler, max); 5551 if (limiter.ok_to_expand()) { 5552 // Unroll the optional matches up to max. 5553 RegExpNode* answer = on_success; 5554 for (int i = 0; i < max; i++) { 5555 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 5556 if (is_greedy) { 5557 alternation->AddAlternative( 5558 GuardedAlternative(body->ToNode(compiler, answer))); 5559 alternation->AddAlternative(GuardedAlternative(on_success)); 5560 } else { 5561 alternation->AddAlternative(GuardedAlternative(on_success)); 5562 alternation->AddAlternative( 5563 GuardedAlternative(body->ToNode(compiler, answer))); 5564 } 5565 answer = alternation; 5566 if (not_at_start && !compiler->read_backward()) { 5567 alternation->set_not_at_start(); 5568 } 5569 } 5570 return answer; 5571 } 5572 } 5573 } 5574 bool has_min = min > 0; 5575 bool has_max = max < RegExpTree::kInfinity; 5576 bool needs_counter = has_min || has_max; 5577 int reg_ctr = needs_counter 5578 ? compiler->AllocateRegister() 5579 : RegExpCompiler::kNoRegister; 5580 LoopChoiceNode* center = new (zone) 5581 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone); 5582 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 5583 RegExpNode* loop_return = needs_counter 5584 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 5585 : static_cast<RegExpNode*>(center); 5586 if (body_can_be_empty) { 5587 // If the body can be empty we need to check if it was and then 5588 // backtrack. 5589 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 5590 reg_ctr, 5591 min, 5592 loop_return); 5593 } 5594 RegExpNode* body_node = body->ToNode(compiler, loop_return); 5595 if (body_can_be_empty) { 5596 // If the body can be empty we need to store the start position 5597 // so we can bail out if it was empty. 5598 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 5599 } 5600 if (needs_capture_clearing) { 5601 // Before entering the body of this loop we need to clear captures. 5602 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 5603 } 5604 GuardedAlternative body_alt(body_node); 5605 if (has_max) { 5606 Guard* body_guard = 5607 new(zone) Guard(reg_ctr, Guard::LT, max); 5608 body_alt.AddGuard(body_guard, zone); 5609 } 5610 GuardedAlternative rest_alt(on_success); 5611 if (has_min) { 5612 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 5613 rest_alt.AddGuard(rest_guard, zone); 5614 } 5615 if (is_greedy) { 5616 center->AddLoopAlternative(body_alt); 5617 center->AddContinueAlternative(rest_alt); 5618 } else { 5619 center->AddContinueAlternative(rest_alt); 5620 center->AddLoopAlternative(body_alt); 5621 } 5622 if (needs_counter) { 5623 return ActionNode::SetRegister(reg_ctr, 0, center); 5624 } else { 5625 return center; 5626 } 5627 } 5628 5629 5630 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5631 RegExpNode* on_success) { 5632 NodeInfo info; 5633 Zone* zone = compiler->zone(); 5634 5635 switch (assertion_type()) { 5636 case START_OF_LINE: 5637 return AssertionNode::AfterNewline(on_success); 5638 case START_OF_INPUT: 5639 return AssertionNode::AtStart(on_success); 5640 case BOUNDARY: 5641 return AssertionNode::AtBoundary(on_success); 5642 case NON_BOUNDARY: 5643 return AssertionNode::AtNonBoundary(on_success); 5644 case END_OF_INPUT: 5645 return AssertionNode::AtEnd(on_success); 5646 case END_OF_LINE: { 5647 // Compile $ in multiline regexps as an alternation with a positive 5648 // lookahead in one side and an end-of-input on the other side. 5649 // We need two registers for the lookahead. 5650 int stack_pointer_register = compiler->AllocateRegister(); 5651 int position_register = compiler->AllocateRegister(); 5652 // The ChoiceNode to distinguish between a newline and end-of-input. 5653 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5654 // Create a newline atom. 5655 ZoneList<CharacterRange>* newline_ranges = 5656 new(zone) ZoneList<CharacterRange>(3, zone); 5657 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5658 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); 5659 TextNode* newline_matcher = new (zone) TextNode( 5660 newline_atom, false, ActionNode::PositiveSubmatchSuccess( 5661 stack_pointer_register, position_register, 5662 0, // No captures inside. 5663 -1, // Ignored if no captures. 5664 on_success)); 5665 // Create an end-of-input matcher. 5666 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5667 stack_pointer_register, 5668 position_register, 5669 newline_matcher); 5670 // Add the two alternatives to the ChoiceNode. 5671 GuardedAlternative eol_alternative(end_of_line); 5672 result->AddAlternative(eol_alternative); 5673 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5674 result->AddAlternative(end_alternative); 5675 return result; 5676 } 5677 default: 5678 UNREACHABLE(); 5679 } 5680 return on_success; 5681 } 5682 5683 5684 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5685 RegExpNode* on_success) { 5686 return new (compiler->zone()) 5687 BackReferenceNode(RegExpCapture::StartRegister(index()), 5688 RegExpCapture::EndRegister(index()), 5689 compiler->read_backward(), on_success); 5690 } 5691 5692 5693 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5694 RegExpNode* on_success) { 5695 return on_success; 5696 } 5697 5698 5699 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 5700 int stack_pointer_register, 5701 int position_register, 5702 int capture_register_count, 5703 int capture_register_start) 5704 : is_positive_(is_positive), 5705 on_success_(on_success), 5706 stack_pointer_register_(stack_pointer_register), 5707 position_register_(position_register) { 5708 if (is_positive_) { 5709 on_match_success_ = ActionNode::PositiveSubmatchSuccess( 5710 stack_pointer_register, position_register, capture_register_count, 5711 capture_register_start, on_success_); 5712 } else { 5713 Zone* zone = on_success_->zone(); 5714 on_match_success_ = new (zone) NegativeSubmatchSuccess( 5715 stack_pointer_register, position_register, capture_register_count, 5716 capture_register_start, zone); 5717 } 5718 } 5719 5720 5721 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 5722 if (is_positive_) { 5723 return ActionNode::BeginSubmatch(stack_pointer_register_, 5724 position_register_, match); 5725 } else { 5726 Zone* zone = on_success_->zone(); 5727 // We use a ChoiceNode to represent the negative lookaround. The first 5728 // alternative is the negative match. On success, the end node backtracks. 5729 // On failure, the second alternative is tried and leads to success. 5730 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the 5731 // first exit when calculating quick checks. 5732 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( 5733 GuardedAlternative(match), GuardedAlternative(on_success_), zone); 5734 return ActionNode::BeginSubmatch(stack_pointer_register_, 5735 position_register_, choice_node); 5736 } 5737 } 5738 5739 5740 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 5741 RegExpNode* on_success) { 5742 int stack_pointer_register = compiler->AllocateRegister(); 5743 int position_register = compiler->AllocateRegister(); 5744 5745 const int registers_per_capture = 2; 5746 const int register_of_first_capture = 2; 5747 int register_count = capture_count_ * registers_per_capture; 5748 int register_start = 5749 register_of_first_capture + capture_from_ * registers_per_capture; 5750 5751 RegExpNode* result; 5752 bool was_reading_backward = compiler->read_backward(); 5753 compiler->set_read_backward(type() == LOOKBEHIND); 5754 Builder builder(is_positive(), on_success, stack_pointer_register, 5755 position_register, register_count, register_start); 5756 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 5757 result = builder.ForMatch(match); 5758 compiler->set_read_backward(was_reading_backward); 5759 return result; 5760 } 5761 5762 5763 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5764 RegExpNode* on_success) { 5765 return ToNode(body(), index(), compiler, on_success); 5766 } 5767 5768 5769 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5770 int index, 5771 RegExpCompiler* compiler, 5772 RegExpNode* on_success) { 5773 DCHECK_NOT_NULL(body); 5774 int start_reg = RegExpCapture::StartRegister(index); 5775 int end_reg = RegExpCapture::EndRegister(index); 5776 if (compiler->read_backward()) std::swap(start_reg, end_reg); 5777 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5778 RegExpNode* body_node = body->ToNode(compiler, store_end); 5779 return ActionNode::StorePosition(start_reg, true, body_node); 5780 } 5781 5782 5783 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5784 RegExpNode* on_success) { 5785 ZoneList<RegExpTree*>* children = nodes(); 5786 RegExpNode* current = on_success; 5787 if (compiler->read_backward()) { 5788 for (int i = 0; i < children->length(); i++) { 5789 current = children->at(i)->ToNode(compiler, current); 5790 } 5791 } else { 5792 for (int i = children->length() - 1; i >= 0; i--) { 5793 current = children->at(i)->ToNode(compiler, current); 5794 } 5795 } 5796 return current; 5797 } 5798 5799 5800 static void AddClass(const int* elmv, 5801 int elmc, 5802 ZoneList<CharacterRange>* ranges, 5803 Zone* zone) { 5804 elmc--; 5805 DCHECK(elmv[elmc] == kRangeEndMarker); 5806 for (int i = 0; i < elmc; i += 2) { 5807 DCHECK(elmv[i] < elmv[i + 1]); 5808 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 5809 } 5810 } 5811 5812 5813 static void AddClassNegated(const int *elmv, 5814 int elmc, 5815 ZoneList<CharacterRange>* ranges, 5816 Zone* zone) { 5817 elmc--; 5818 DCHECK(elmv[elmc] == kRangeEndMarker); 5819 DCHECK(elmv[0] != 0x0000); 5820 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint); 5821 uc16 last = 0x0000; 5822 for (int i = 0; i < elmc; i += 2) { 5823 DCHECK(last <= elmv[i] - 1); 5824 DCHECK(elmv[i] < elmv[i + 1]); 5825 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 5826 last = elmv[i + 1]; 5827 } 5828 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); 5829 } 5830 5831 5832 void CharacterRange::AddClassEscape(uc16 type, 5833 ZoneList<CharacterRange>* ranges, 5834 Zone* zone) { 5835 switch (type) { 5836 case 's': 5837 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5838 break; 5839 case 'S': 5840 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5841 break; 5842 case 'w': 5843 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5844 break; 5845 case 'W': 5846 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5847 break; 5848 case 'd': 5849 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5850 break; 5851 case 'D': 5852 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5853 break; 5854 case '.': 5855 AddClassNegated(kLineTerminatorRanges, 5856 kLineTerminatorRangeCount, 5857 ranges, 5858 zone); 5859 break; 5860 // This is not a character range as defined by the spec but a 5861 // convenient shorthand for a character class that matches any 5862 // character. 5863 case '*': 5864 ranges->Add(CharacterRange::Everything(), zone); 5865 break; 5866 // This is the set of characters matched by the $ and ^ symbols 5867 // in multiline mode. 5868 case 'n': 5869 AddClass(kLineTerminatorRanges, 5870 kLineTerminatorRangeCount, 5871 ranges, 5872 zone); 5873 break; 5874 default: 5875 UNREACHABLE(); 5876 } 5877 } 5878 5879 5880 Vector<const int> CharacterRange::GetWordBounds() { 5881 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5882 } 5883 5884 5885 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 5886 ZoneList<CharacterRange>* ranges, 5887 bool is_one_byte) { 5888 CharacterRange::Canonicalize(ranges); 5889 int range_count = ranges->length(); 5890 for (int i = 0; i < range_count; i++) { 5891 CharacterRange range = ranges->at(i); 5892 uc32 bottom = range.from(); 5893 if (bottom > String::kMaxUtf16CodeUnit) return; 5894 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); 5895 // Nothing to be done for surrogates. 5896 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; 5897 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 5898 if (bottom > String::kMaxOneByteCharCode) return; 5899 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5900 } 5901 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5902 if (top == bottom) { 5903 // If this is a singleton we just expand the one character. 5904 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5905 for (int i = 0; i < length; i++) { 5906 uc32 chr = chars[i]; 5907 if (chr != bottom) { 5908 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5909 } 5910 } 5911 } else { 5912 // If this is a range we expand the characters block by block, expanding 5913 // contiguous subranges (blocks) one at a time. The approach is as 5914 // follows. For a given start character we look up the remainder of the 5915 // block that contains it (represented by the end point), for instance we 5916 // find 'z' if the character is 'c'. A block is characterized by the 5917 // property that all characters uncanonicalize in the same way, except 5918 // that each entry in the result is incremented by the distance from the 5919 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 5920 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 5921 // we've found the end point we look up its uncanonicalization and 5922 // produce a range for each element. For instance for [c-f] we look up 5923 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 5924 // it is not already contained in the input, so [c-f] will be skipped but 5925 // [C-F] will be added. If this range is not completely contained in a 5926 // block we do this for all the blocks covered by the range (handling 5927 // characters that is not in a block as a "singleton block"). 5928 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5929 int pos = bottom; 5930 while (pos <= top) { 5931 int length = 5932 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 5933 uc32 block_end; 5934 if (length == 0) { 5935 block_end = pos; 5936 } else { 5937 DCHECK_EQ(1, length); 5938 block_end = equivalents[0]; 5939 } 5940 int end = (block_end > top) ? top : block_end; 5941 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 5942 equivalents); 5943 for (int i = 0; i < length; i++) { 5944 uc32 c = equivalents[i]; 5945 uc32 range_from = c - (block_end - pos); 5946 uc32 range_to = c - (block_end - end); 5947 if (!(bottom <= range_from && range_to <= top)) { 5948 ranges->Add(CharacterRange::Range(range_from, range_to), zone); 5949 } 5950 } 5951 pos = end + 1; 5952 } 5953 } 5954 } 5955 } 5956 5957 5958 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5959 DCHECK_NOT_NULL(ranges); 5960 int n = ranges->length(); 5961 if (n <= 1) return true; 5962 int max = ranges->at(0).to(); 5963 for (int i = 1; i < n; i++) { 5964 CharacterRange next_range = ranges->at(i); 5965 if (next_range.from() <= max + 1) return false; 5966 max = next_range.to(); 5967 } 5968 return true; 5969 } 5970 5971 5972 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5973 if (ranges_ == NULL) { 5974 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5975 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5976 } 5977 return ranges_; 5978 } 5979 5980 5981 // Move a number of elements in a zonelist to another position 5982 // in the same list. Handles overlapping source and target areas. 5983 static void MoveRanges(ZoneList<CharacterRange>* list, 5984 int from, 5985 int to, 5986 int count) { 5987 // Ranges are potentially overlapping. 5988 if (from < to) { 5989 for (int i = count - 1; i >= 0; i--) { 5990 list->at(to + i) = list->at(from + i); 5991 } 5992 } else { 5993 for (int i = 0; i < count; i++) { 5994 list->at(to + i) = list->at(from + i); 5995 } 5996 } 5997 } 5998 5999 6000 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 6001 int count, 6002 CharacterRange insert) { 6003 // Inserts a range into list[0..count[, which must be sorted 6004 // by from value and non-overlapping and non-adjacent, using at most 6005 // list[0..count] for the result. Returns the number of resulting 6006 // canonicalized ranges. Inserting a range may collapse existing ranges into 6007 // fewer ranges, so the return value can be anything in the range 1..count+1. 6008 uc32 from = insert.from(); 6009 uc32 to = insert.to(); 6010 int start_pos = 0; 6011 int end_pos = count; 6012 for (int i = count - 1; i >= 0; i--) { 6013 CharacterRange current = list->at(i); 6014 if (current.from() > to + 1) { 6015 end_pos = i; 6016 } else if (current.to() + 1 < from) { 6017 start_pos = i + 1; 6018 break; 6019 } 6020 } 6021 6022 // Inserted range overlaps, or is adjacent to, ranges at positions 6023 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 6024 // not affected by the insertion. 6025 // If start_pos == end_pos, the range must be inserted before start_pos. 6026 // if start_pos < end_pos, the entire range from start_pos to end_pos 6027 // must be merged with the insert range. 6028 6029 if (start_pos == end_pos) { 6030 // Insert between existing ranges at position start_pos. 6031 if (start_pos < count) { 6032 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 6033 } 6034 list->at(start_pos) = insert; 6035 return count + 1; 6036 } 6037 if (start_pos + 1 == end_pos) { 6038 // Replace single existing range at position start_pos. 6039 CharacterRange to_replace = list->at(start_pos); 6040 int new_from = Min(to_replace.from(), from); 6041 int new_to = Max(to_replace.to(), to); 6042 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6043 return count; 6044 } 6045 // Replace a number of existing ranges from start_pos to end_pos - 1. 6046 // Move the remaining ranges down. 6047 6048 int new_from = Min(list->at(start_pos).from(), from); 6049 int new_to = Max(list->at(end_pos - 1).to(), to); 6050 if (end_pos < count) { 6051 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 6052 } 6053 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6054 return count - (end_pos - start_pos) + 1; 6055 } 6056 6057 6058 void CharacterSet::Canonicalize() { 6059 // Special/default classes are always considered canonical. The result 6060 // of calling ranges() will be sorted. 6061 if (ranges_ == NULL) return; 6062 CharacterRange::Canonicalize(ranges_); 6063 } 6064 6065 6066 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 6067 if (character_ranges->length() <= 1) return; 6068 // Check whether ranges are already canonical (increasing, non-overlapping, 6069 // non-adjacent). 6070 int n = character_ranges->length(); 6071 int max = character_ranges->at(0).to(); 6072 int i = 1; 6073 while (i < n) { 6074 CharacterRange current = character_ranges->at(i); 6075 if (current.from() <= max + 1) { 6076 break; 6077 } 6078 max = current.to(); 6079 i++; 6080 } 6081 // Canonical until the i'th range. If that's all of them, we are done. 6082 if (i == n) return; 6083 6084 // The ranges at index i and forward are not canonicalized. Make them so by 6085 // doing the equivalent of insertion sort (inserting each into the previous 6086 // list, in order). 6087 // Notice that inserting a range can reduce the number of ranges in the 6088 // result due to combining of adjacent and overlapping ranges. 6089 int read = i; // Range to insert. 6090 int num_canonical = i; // Length of canonicalized part of list. 6091 do { 6092 num_canonical = InsertRangeInCanonicalList(character_ranges, 6093 num_canonical, 6094 character_ranges->at(read)); 6095 read++; 6096 } while (read < n); 6097 character_ranges->Rewind(num_canonical); 6098 6099 DCHECK(CharacterRange::IsCanonical(character_ranges)); 6100 } 6101 6102 6103 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 6104 ZoneList<CharacterRange>* negated_ranges, 6105 Zone* zone) { 6106 DCHECK(CharacterRange::IsCanonical(ranges)); 6107 DCHECK_EQ(0, negated_ranges->length()); 6108 int range_count = ranges->length(); 6109 uc32 from = 0; 6110 int i = 0; 6111 if (range_count > 0 && ranges->at(0).from() == 0) { 6112 from = ranges->at(0).to() + 1; 6113 i = 1; 6114 } 6115 while (i < range_count) { 6116 CharacterRange range = ranges->at(i); 6117 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 6118 from = range.to() + 1; 6119 i++; 6120 } 6121 if (from < String::kMaxCodePoint) { 6122 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), 6123 zone); 6124 } 6125 } 6126 6127 6128 // ------------------------------------------------------------------- 6129 // Splay tree 6130 6131 6132 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 6133 if (Get(value)) 6134 return this; 6135 if (successors(zone) != NULL) { 6136 for (int i = 0; i < successors(zone)->length(); i++) { 6137 OutSet* successor = successors(zone)->at(i); 6138 if (successor->Get(value)) 6139 return successor; 6140 } 6141 } else { 6142 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 6143 } 6144 OutSet* result = new(zone) OutSet(first_, remaining_); 6145 result->Set(value, zone); 6146 successors(zone)->Add(result, zone); 6147 return result; 6148 } 6149 6150 6151 void OutSet::Set(unsigned value, Zone *zone) { 6152 if (value < kFirstLimit) { 6153 first_ |= (1 << value); 6154 } else { 6155 if (remaining_ == NULL) 6156 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 6157 if (remaining_->is_empty() || !remaining_->Contains(value)) 6158 remaining_->Add(value, zone); 6159 } 6160 } 6161 6162 6163 bool OutSet::Get(unsigned value) const { 6164 if (value < kFirstLimit) { 6165 return (first_ & (1 << value)) != 0; 6166 } else if (remaining_ == NULL) { 6167 return false; 6168 } else { 6169 return remaining_->Contains(value); 6170 } 6171 } 6172 6173 6174 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 6175 6176 6177 void DispatchTable::AddRange(CharacterRange full_range, int value, 6178 Zone* zone) { 6179 CharacterRange current = full_range; 6180 if (tree()->is_empty()) { 6181 // If this is the first range we just insert into the table. 6182 ZoneSplayTree<Config>::Locator loc; 6183 bool inserted = tree()->Insert(current.from(), &loc); 6184 DCHECK(inserted); 6185 USE(inserted); 6186 loc.set_value(Entry(current.from(), current.to(), 6187 empty()->Extend(value, zone))); 6188 return; 6189 } 6190 // First see if there is a range to the left of this one that 6191 // overlaps. 6192 ZoneSplayTree<Config>::Locator loc; 6193 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 6194 Entry* entry = &loc.value(); 6195 // If we've found a range that overlaps with this one, and it 6196 // starts strictly to the left of this one, we have to fix it 6197 // because the following code only handles ranges that start on 6198 // or after the start point of the range we're adding. 6199 if (entry->from() < current.from() && entry->to() >= current.from()) { 6200 // Snap the overlapping range in half around the start point of 6201 // the range we're adding. 6202 CharacterRange left = 6203 CharacterRange::Range(entry->from(), current.from() - 1); 6204 CharacterRange right = CharacterRange::Range(current.from(), entry->to()); 6205 // The left part of the overlapping range doesn't overlap. 6206 // Truncate the whole entry to be just the left part. 6207 entry->set_to(left.to()); 6208 // The right part is the one that overlaps. We add this part 6209 // to the map and let the next step deal with merging it with 6210 // the range we're adding. 6211 ZoneSplayTree<Config>::Locator loc; 6212 bool inserted = tree()->Insert(right.from(), &loc); 6213 DCHECK(inserted); 6214 USE(inserted); 6215 loc.set_value(Entry(right.from(), 6216 right.to(), 6217 entry->out_set())); 6218 } 6219 } 6220 while (current.is_valid()) { 6221 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 6222 (loc.value().from() <= current.to()) && 6223 (loc.value().to() >= current.from())) { 6224 Entry* entry = &loc.value(); 6225 // We have overlap. If there is space between the start point of 6226 // the range we're adding and where the overlapping range starts 6227 // then we have to add a range covering just that space. 6228 if (current.from() < entry->from()) { 6229 ZoneSplayTree<Config>::Locator ins; 6230 bool inserted = tree()->Insert(current.from(), &ins); 6231 DCHECK(inserted); 6232 USE(inserted); 6233 ins.set_value(Entry(current.from(), 6234 entry->from() - 1, 6235 empty()->Extend(value, zone))); 6236 current.set_from(entry->from()); 6237 } 6238 DCHECK_EQ(current.from(), entry->from()); 6239 // If the overlapping range extends beyond the one we want to add 6240 // we have to snap the right part off and add it separately. 6241 if (entry->to() > current.to()) { 6242 ZoneSplayTree<Config>::Locator ins; 6243 bool inserted = tree()->Insert(current.to() + 1, &ins); 6244 DCHECK(inserted); 6245 USE(inserted); 6246 ins.set_value(Entry(current.to() + 1, 6247 entry->to(), 6248 entry->out_set())); 6249 entry->set_to(current.to()); 6250 } 6251 DCHECK(entry->to() <= current.to()); 6252 // The overlapping range is now completely contained by the range 6253 // we're adding so we can just update it and move the start point 6254 // of the range we're adding just past it. 6255 entry->AddValue(value, zone); 6256 DCHECK(entry->to() + 1 > current.from()); 6257 current.set_from(entry->to() + 1); 6258 } else { 6259 // There is no overlap so we can just add the range 6260 ZoneSplayTree<Config>::Locator ins; 6261 bool inserted = tree()->Insert(current.from(), &ins); 6262 DCHECK(inserted); 6263 USE(inserted); 6264 ins.set_value(Entry(current.from(), 6265 current.to(), 6266 empty()->Extend(value, zone))); 6267 break; 6268 } 6269 } 6270 } 6271 6272 6273 OutSet* DispatchTable::Get(uc32 value) { 6274 ZoneSplayTree<Config>::Locator loc; 6275 if (!tree()->FindGreatestLessThan(value, &loc)) 6276 return empty(); 6277 Entry* entry = &loc.value(); 6278 if (value <= entry->to()) 6279 return entry->out_set(); 6280 else 6281 return empty(); 6282 } 6283 6284 6285 // ------------------------------------------------------------------- 6286 // Analysis 6287 6288 6289 void Analysis::EnsureAnalyzed(RegExpNode* that) { 6290 StackLimitCheck check(isolate()); 6291 if (check.HasOverflowed()) { 6292 fail("Stack overflow"); 6293 return; 6294 } 6295 if (that->info()->been_analyzed || that->info()->being_analyzed) 6296 return; 6297 that->info()->being_analyzed = true; 6298 that->Accept(this); 6299 that->info()->being_analyzed = false; 6300 that->info()->been_analyzed = true; 6301 } 6302 6303 6304 void Analysis::VisitEnd(EndNode* that) { 6305 // nothing to do 6306 } 6307 6308 6309 void TextNode::CalculateOffsets() { 6310 int element_count = elements()->length(); 6311 // Set up the offsets of the elements relative to the start. This is a fixed 6312 // quantity since a TextNode can only contain fixed-width things. 6313 int cp_offset = 0; 6314 for (int i = 0; i < element_count; i++) { 6315 TextElement& elm = elements()->at(i); 6316 elm.set_cp_offset(cp_offset); 6317 cp_offset += elm.length(); 6318 } 6319 } 6320 6321 6322 void Analysis::VisitText(TextNode* that) { 6323 if (ignore_case()) { 6324 that->MakeCaseIndependent(isolate(), is_one_byte_); 6325 } 6326 EnsureAnalyzed(that->on_success()); 6327 if (!has_failed()) { 6328 that->CalculateOffsets(); 6329 } 6330 } 6331 6332 6333 void Analysis::VisitAction(ActionNode* that) { 6334 RegExpNode* target = that->on_success(); 6335 EnsureAnalyzed(target); 6336 if (!has_failed()) { 6337 // If the next node is interested in what it follows then this node 6338 // has to be interested too so it can pass the information on. 6339 that->info()->AddFromFollowing(target->info()); 6340 } 6341 } 6342 6343 6344 void Analysis::VisitChoice(ChoiceNode* that) { 6345 NodeInfo* info = that->info(); 6346 for (int i = 0; i < that->alternatives()->length(); i++) { 6347 RegExpNode* node = that->alternatives()->at(i).node(); 6348 EnsureAnalyzed(node); 6349 if (has_failed()) return; 6350 // Anything the following nodes need to know has to be known by 6351 // this node also, so it can pass it on. 6352 info->AddFromFollowing(node->info()); 6353 } 6354 } 6355 6356 6357 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 6358 NodeInfo* info = that->info(); 6359 for (int i = 0; i < that->alternatives()->length(); i++) { 6360 RegExpNode* node = that->alternatives()->at(i).node(); 6361 if (node != that->loop_node()) { 6362 EnsureAnalyzed(node); 6363 if (has_failed()) return; 6364 info->AddFromFollowing(node->info()); 6365 } 6366 } 6367 // Check the loop last since it may need the value of this node 6368 // to get a correct result. 6369 EnsureAnalyzed(that->loop_node()); 6370 if (!has_failed()) { 6371 info->AddFromFollowing(that->loop_node()->info()); 6372 } 6373 } 6374 6375 6376 void Analysis::VisitBackReference(BackReferenceNode* that) { 6377 EnsureAnalyzed(that->on_success()); 6378 } 6379 6380 6381 void Analysis::VisitAssertion(AssertionNode* that) { 6382 EnsureAnalyzed(that->on_success()); 6383 } 6384 6385 6386 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6387 BoyerMooreLookahead* bm, 6388 bool not_at_start) { 6389 // Working out the set of characters that a backreference can match is too 6390 // hard, so we just say that any character can match. 6391 bm->SetRest(offset); 6392 SaveBMInfo(bm, not_at_start, offset); 6393 } 6394 6395 6396 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6397 RegExpMacroAssembler::kTableSize); 6398 6399 6400 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6401 BoyerMooreLookahead* bm, bool not_at_start) { 6402 ZoneList<GuardedAlternative>* alts = alternatives(); 6403 budget = (budget - 1) / alts->length(); 6404 for (int i = 0; i < alts->length(); i++) { 6405 GuardedAlternative& alt = alts->at(i); 6406 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6407 bm->SetRest(offset); // Give up trying to fill in info. 6408 SaveBMInfo(bm, not_at_start, offset); 6409 return; 6410 } 6411 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6412 } 6413 SaveBMInfo(bm, not_at_start, offset); 6414 } 6415 6416 6417 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6418 BoyerMooreLookahead* bm, bool not_at_start) { 6419 if (initial_offset >= bm->length()) return; 6420 int offset = initial_offset; 6421 int max_char = bm->max_char(); 6422 for (int i = 0; i < elements()->length(); i++) { 6423 if (offset >= bm->length()) { 6424 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6425 return; 6426 } 6427 TextElement text = elements()->at(i); 6428 if (text.text_type() == TextElement::ATOM) { 6429 RegExpAtom* atom = text.atom(); 6430 for (int j = 0; j < atom->length(); j++, offset++) { 6431 if (offset >= bm->length()) { 6432 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6433 return; 6434 } 6435 uc16 character = atom->data()[j]; 6436 if (bm->compiler()->ignore_case()) { 6437 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6438 int length = GetCaseIndependentLetters( 6439 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 6440 chars); 6441 for (int j = 0; j < length; j++) { 6442 bm->Set(offset, chars[j]); 6443 } 6444 } else { 6445 if (character <= max_char) bm->Set(offset, character); 6446 } 6447 } 6448 } else { 6449 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6450 RegExpCharacterClass* char_class = text.char_class(); 6451 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6452 if (char_class->is_negated()) { 6453 bm->SetAll(offset); 6454 } else { 6455 for (int k = 0; k < ranges->length(); k++) { 6456 CharacterRange& range = ranges->at(k); 6457 if (range.from() > max_char) continue; 6458 int to = Min(max_char, static_cast<int>(range.to())); 6459 bm->SetInterval(offset, Interval(range.from(), to)); 6460 } 6461 } 6462 offset++; 6463 } 6464 } 6465 if (offset >= bm->length()) { 6466 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6467 return; 6468 } 6469 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6470 true); // Not at start after a text node. 6471 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6472 } 6473 6474 6475 // ------------------------------------------------------------------- 6476 // Dispatch table construction 6477 6478 6479 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6480 AddRange(CharacterRange::Everything()); 6481 } 6482 6483 6484 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 6485 node->set_being_calculated(true); 6486 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 6487 for (int i = 0; i < alternatives->length(); i++) { 6488 set_choice_index(i); 6489 alternatives->at(i).node()->Accept(this); 6490 } 6491 node->set_being_calculated(false); 6492 } 6493 6494 6495 class AddDispatchRange { 6496 public: 6497 explicit AddDispatchRange(DispatchTableConstructor* constructor) 6498 : constructor_(constructor) { } 6499 void Call(uc32 from, DispatchTable::Entry entry); 6500 private: 6501 DispatchTableConstructor* constructor_; 6502 }; 6503 6504 6505 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 6506 constructor_->AddRange(CharacterRange::Range(from, entry.to())); 6507 } 6508 6509 6510 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 6511 if (node->being_calculated()) 6512 return; 6513 DispatchTable* table = node->GetTable(ignore_case_); 6514 AddDispatchRange adder(this); 6515 table->ForEach(&adder); 6516 } 6517 6518 6519 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 6520 // TODO(160): Find the node that we refer back to and propagate its start 6521 // set back to here. For now we just accept anything. 6522 AddRange(CharacterRange::Everything()); 6523 } 6524 6525 6526 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 6527 RegExpNode* target = that->on_success(); 6528 target->Accept(this); 6529 } 6530 6531 6532 static int CompareRangeByFrom(const CharacterRange* a, 6533 const CharacterRange* b) { 6534 return Compare<uc16>(a->from(), b->from()); 6535 } 6536 6537 6538 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 6539 ranges->Sort(CompareRangeByFrom); 6540 uc16 last = 0; 6541 for (int i = 0; i < ranges->length(); i++) { 6542 CharacterRange range = ranges->at(i); 6543 if (last < range.from()) 6544 AddRange(CharacterRange::Range(last, range.from() - 1)); 6545 if (range.to() >= last) { 6546 if (range.to() == String::kMaxCodePoint) { 6547 return; 6548 } else { 6549 last = range.to() + 1; 6550 } 6551 } 6552 } 6553 AddRange(CharacterRange::Range(last, String::kMaxCodePoint)); 6554 } 6555 6556 6557 void DispatchTableConstructor::VisitText(TextNode* that) { 6558 TextElement elm = that->elements()->at(0); 6559 switch (elm.text_type()) { 6560 case TextElement::ATOM: { 6561 uc16 c = elm.atom()->data()[0]; 6562 AddRange(CharacterRange::Range(c, c)); 6563 break; 6564 } 6565 case TextElement::CHAR_CLASS: { 6566 RegExpCharacterClass* tree = elm.char_class(); 6567 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 6568 if (tree->is_negated()) { 6569 AddInverse(ranges); 6570 } else { 6571 for (int i = 0; i < ranges->length(); i++) 6572 AddRange(ranges->at(i)); 6573 } 6574 break; 6575 } 6576 default: { 6577 UNIMPLEMENTED(); 6578 } 6579 } 6580 } 6581 6582 6583 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6584 RegExpNode* target = that->on_success(); 6585 target->Accept(this); 6586 } 6587 6588 6589 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, 6590 RegExpNode* on_success) { 6591 // If the regexp matching starts within a surrogate pair, step back 6592 // to the lead surrogate and start matching from there. 6593 DCHECK(!compiler->read_backward()); 6594 Zone* zone = compiler->zone(); 6595 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 6596 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 6597 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 6598 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 6599 6600 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); 6601 6602 int stack_register = compiler->UnicodeLookaroundStackRegister(); 6603 int position_register = compiler->UnicodeLookaroundPositionRegister(); 6604 RegExpNode* step_back = TextNode::CreateForCharacterRanges( 6605 zone, lead_surrogates, true, on_success); 6606 RegExpLookaround::Builder builder(true, step_back, stack_register, 6607 position_register); 6608 RegExpNode* match_trail = TextNode::CreateForCharacterRanges( 6609 zone, trail_surrogates, false, builder.on_match_success()); 6610 6611 optional_step_back->AddAlternative( 6612 GuardedAlternative(builder.ForMatch(match_trail))); 6613 optional_step_back->AddAlternative(GuardedAlternative(on_success)); 6614 6615 return optional_step_back; 6616 } 6617 6618 6619 RegExpEngine::CompilationResult RegExpEngine::Compile( 6620 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6621 JSRegExp::Flags flags, Handle<String> pattern, 6622 Handle<String> sample_subject, bool is_one_byte) { 6623 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6624 return IrregexpRegExpTooBig(isolate); 6625 } 6626 bool ignore_case = flags & JSRegExp::kIgnoreCase; 6627 bool is_sticky = flags & JSRegExp::kSticky; 6628 bool is_global = flags & JSRegExp::kGlobal; 6629 bool is_unicode = flags & JSRegExp::kUnicode; 6630 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 6631 is_one_byte); 6632 6633 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6634 6635 // Sample some characters from the middle of the string. 6636 static const int kSampleSize = 128; 6637 6638 sample_subject = String::Flatten(sample_subject); 6639 int chars_sampled = 0; 6640 int half_way = (sample_subject->length() - kSampleSize) / 2; 6641 for (int i = Max(0, half_way); 6642 i < sample_subject->length() && chars_sampled < kSampleSize; 6643 i++, chars_sampled++) { 6644 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6645 } 6646 6647 // Wrap the body of the regexp in capture #0. 6648 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6649 0, 6650 &compiler, 6651 compiler.accept()); 6652 RegExpNode* node = captured_body; 6653 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6654 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6655 int max_length = data->tree->max_match(); 6656 if (!is_start_anchored && !is_sticky) { 6657 // Add a .*? at the beginning, outside the body capture, unless 6658 // this expression is anchored at the beginning or sticky. 6659 RegExpNode* loop_node = RegExpQuantifier::ToNode( 6660 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), 6661 &compiler, captured_body, data->contains_anchor); 6662 6663 if (data->contains_anchor) { 6664 // Unroll loop once, to take care of the case that might start 6665 // at the start of input. 6666 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6667 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6668 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6669 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6670 node = first_step_node; 6671 } else { 6672 node = loop_node; 6673 } 6674 } 6675 if (is_one_byte) { 6676 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6677 // Do it again to propagate the new nodes to places where they were not 6678 // put because they had not been calculated yet. 6679 if (node != NULL) { 6680 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6681 } 6682 } else if (compiler.unicode() && (is_global || is_sticky)) { 6683 node = OptionallyStepBackToLeadSurrogate(&compiler, node); 6684 } 6685 6686 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6687 data->node = node; 6688 Analysis analysis(isolate, flags, is_one_byte); 6689 analysis.EnsureAnalyzed(node); 6690 if (analysis.has_failed()) { 6691 const char* error_message = analysis.error_message(); 6692 return CompilationResult(isolate, error_message); 6693 } 6694 6695 // Create the correct assembler for the architecture. 6696 #ifndef V8_INTERPRETED_REGEXP 6697 // Native regexp implementation. 6698 6699 NativeRegExpMacroAssembler::Mode mode = 6700 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 6701 : NativeRegExpMacroAssembler::UC16; 6702 6703 #if V8_TARGET_ARCH_IA32 6704 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode, 6705 (data->capture_count + 1) * 2); 6706 #elif V8_TARGET_ARCH_X64 6707 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode, 6708 (data->capture_count + 1) * 2); 6709 #elif V8_TARGET_ARCH_ARM 6710 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode, 6711 (data->capture_count + 1) * 2); 6712 #elif V8_TARGET_ARCH_ARM64 6713 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode, 6714 (data->capture_count + 1) * 2); 6715 #elif V8_TARGET_ARCH_S390 6716 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode, 6717 (data->capture_count + 1) * 2); 6718 #elif V8_TARGET_ARCH_PPC 6719 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode, 6720 (data->capture_count + 1) * 2); 6721 #elif V8_TARGET_ARCH_MIPS 6722 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6723 (data->capture_count + 1) * 2); 6724 #elif V8_TARGET_ARCH_MIPS64 6725 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6726 (data->capture_count + 1) * 2); 6727 #elif V8_TARGET_ARCH_X87 6728 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode, 6729 (data->capture_count + 1) * 2); 6730 #else 6731 #error "Unsupported architecture" 6732 #endif 6733 6734 #else // V8_INTERPRETED_REGEXP 6735 // Interpreted regexp implementation. 6736 EmbeddedVector<byte, 1024> codes; 6737 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone); 6738 #endif // V8_INTERPRETED_REGEXP 6739 6740 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern)); 6741 6742 // Inserted here, instead of in Assembler, because it depends on information 6743 // in the AST that isn't replicated in the Node structure. 6744 static const int kMaxBacksearchLimit = 1024; 6745 if (is_end_anchored && 6746 !is_start_anchored && 6747 max_length < kMaxBacksearchLimit) { 6748 macro_assembler.SetCurrentPositionFromEnd(max_length); 6749 } 6750 6751 if (is_global) { 6752 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; 6753 if (data->tree->min_match() > 0) { 6754 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; 6755 } else if (is_unicode) { 6756 mode = RegExpMacroAssembler::GLOBAL_UNICODE; 6757 } 6758 macro_assembler.set_global_mode(mode); 6759 } 6760 6761 return compiler.Assemble(¯o_assembler, 6762 node, 6763 data->capture_count, 6764 pattern); 6765 } 6766 6767 6768 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) { 6769 Heap* heap = pattern->GetHeap(); 6770 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6771 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6772 heap->memory_allocator()->SizeExecutable() > 6773 RegExpImpl::kRegExpExecutableMemoryLimit) { 6774 too_much = true; 6775 } 6776 return too_much; 6777 } 6778 6779 6780 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string, 6781 Object* key_pattern, 6782 FixedArray** last_match_cache, 6783 ResultsCacheType type) { 6784 FixedArray* cache; 6785 if (!key_string->IsInternalizedString()) return Smi::kZero; 6786 if (type == STRING_SPLIT_SUBSTRINGS) { 6787 DCHECK(key_pattern->IsString()); 6788 if (!key_pattern->IsInternalizedString()) return Smi::kZero; 6789 cache = heap->string_split_cache(); 6790 } else { 6791 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6792 DCHECK(key_pattern->IsFixedArray()); 6793 cache = heap->regexp_multiple_cache(); 6794 } 6795 6796 uint32_t hash = key_string->Hash(); 6797 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6798 ~(kArrayEntriesPerCacheEntry - 1)); 6799 if (cache->get(index + kStringOffset) != key_string || 6800 cache->get(index + kPatternOffset) != key_pattern) { 6801 index = 6802 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6803 if (cache->get(index + kStringOffset) != key_string || 6804 cache->get(index + kPatternOffset) != key_pattern) { 6805 return Smi::kZero; 6806 } 6807 } 6808 6809 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset)); 6810 return cache->get(index + kArrayOffset); 6811 } 6812 6813 6814 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, 6815 Handle<Object> key_pattern, 6816 Handle<FixedArray> value_array, 6817 Handle<FixedArray> last_match_cache, 6818 ResultsCacheType type) { 6819 Factory* factory = isolate->factory(); 6820 Handle<FixedArray> cache; 6821 if (!key_string->IsInternalizedString()) return; 6822 if (type == STRING_SPLIT_SUBSTRINGS) { 6823 DCHECK(key_pattern->IsString()); 6824 if (!key_pattern->IsInternalizedString()) return; 6825 cache = factory->string_split_cache(); 6826 } else { 6827 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6828 DCHECK(key_pattern->IsFixedArray()); 6829 cache = factory->regexp_multiple_cache(); 6830 } 6831 6832 uint32_t hash = key_string->Hash(); 6833 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6834 ~(kArrayEntriesPerCacheEntry - 1)); 6835 if (cache->get(index + kStringOffset) == Smi::kZero) { 6836 cache->set(index + kStringOffset, *key_string); 6837 cache->set(index + kPatternOffset, *key_pattern); 6838 cache->set(index + kArrayOffset, *value_array); 6839 cache->set(index + kLastMatchOffset, *last_match_cache); 6840 } else { 6841 uint32_t index2 = 6842 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6843 if (cache->get(index2 + kStringOffset) == Smi::kZero) { 6844 cache->set(index2 + kStringOffset, *key_string); 6845 cache->set(index2 + kPatternOffset, *key_pattern); 6846 cache->set(index2 + kArrayOffset, *value_array); 6847 cache->set(index2 + kLastMatchOffset, *last_match_cache); 6848 } else { 6849 cache->set(index2 + kStringOffset, Smi::kZero); 6850 cache->set(index2 + kPatternOffset, Smi::kZero); 6851 cache->set(index2 + kArrayOffset, Smi::kZero); 6852 cache->set(index2 + kLastMatchOffset, Smi::kZero); 6853 cache->set(index + kStringOffset, *key_string); 6854 cache->set(index + kPatternOffset, *key_pattern); 6855 cache->set(index + kArrayOffset, *value_array); 6856 cache->set(index + kLastMatchOffset, *last_match_cache); 6857 } 6858 } 6859 // If the array is a reasonably short list of substrings, convert it into a 6860 // list of internalized strings. 6861 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { 6862 for (int i = 0; i < value_array->length(); i++) { 6863 Handle<String> str(String::cast(value_array->get(i)), isolate); 6864 Handle<String> internalized_str = factory->InternalizeString(str); 6865 value_array->set(i, *internalized_str); 6866 } 6867 } 6868 // Convert backing store to a copy-on-write array. 6869 value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map()); 6870 } 6871 6872 6873 void RegExpResultsCache::Clear(FixedArray* cache) { 6874 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6875 cache->set(i, Smi::kZero); 6876 } 6877 } 6878 6879 } // namespace internal 6880 } // namespace v8 6881