1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/jsregexp.h" 6 7 #include <memory> 8 #include <vector> 9 10 #include "src/base/platform/platform.h" 11 #include "src/code-tracer.h" 12 #include "src/compilation-cache.h" 13 #include "src/elements.h" 14 #include "src/execution.h" 15 #include "src/heap/factory.h" 16 #include "src/isolate-inl.h" 17 #include "src/messages.h" 18 #include "src/ostreams.h" 19 #include "src/regexp/interpreter-irregexp.h" 20 #include "src/regexp/jsregexp-inl.h" 21 #include "src/regexp/regexp-macro-assembler-irregexp.h" 22 #include "src/regexp/regexp-macro-assembler-tracer.h" 23 #include "src/regexp/regexp-macro-assembler.h" 24 #include "src/regexp/regexp-parser.h" 25 #include "src/regexp/regexp-stack.h" 26 #include "src/runtime/runtime.h" 27 #include "src/splay-tree-inl.h" 28 #include "src/string-search.h" 29 #include "src/unicode-decoder.h" 30 #include "src/unicode-inl.h" 31 32 #ifdef V8_INTL_SUPPORT 33 #include "unicode/uniset.h" 34 #include "unicode/utypes.h" 35 #endif // V8_INTL_SUPPORT 36 37 #ifndef V8_INTERPRETED_REGEXP 38 #if V8_TARGET_ARCH_IA32 39 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" 40 #elif V8_TARGET_ARCH_X64 41 #include "src/regexp/x64/regexp-macro-assembler-x64.h" 42 #elif V8_TARGET_ARCH_ARM64 43 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" 44 #elif V8_TARGET_ARCH_ARM 45 #include "src/regexp/arm/regexp-macro-assembler-arm.h" 46 #elif V8_TARGET_ARCH_PPC 47 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h" 48 #elif V8_TARGET_ARCH_S390 49 #include "src/regexp/s390/regexp-macro-assembler-s390.h" 50 #elif V8_TARGET_ARCH_MIPS 51 #include "src/regexp/mips/regexp-macro-assembler-mips.h" 52 #elif V8_TARGET_ARCH_MIPS64 53 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h" 54 #else 55 #error Unsupported target architecture. 56 #endif 57 #endif 58 59 60 namespace v8 { 61 namespace internal { 62 63 V8_WARN_UNUSED_RESULT 64 static inline MaybeHandle<Object> ThrowRegExpException( 65 Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, 66 Handle<String> error_text) { 67 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, 68 pattern, error_text), 69 Object); 70 } 71 72 inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, 73 Handle<String> error_text) { 74 USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate), 75 error_text)); 76 } 77 78 79 ContainedInLattice AddRange(ContainedInLattice containment, 80 const int* ranges, 81 int ranges_length, 82 Interval new_range) { 83 DCHECK_EQ(1, ranges_length & 1); 84 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]); 85 if (containment == kLatticeUnknown) return containment; 86 bool inside = false; 87 int last = 0; 88 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 89 // Consider the range from last to ranges[i]. 90 // We haven't got to the new range yet. 91 if (ranges[i] <= new_range.from()) continue; 92 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 93 // inclusive, but the values in ranges are not. 94 if (last <= new_range.from() && new_range.to() < ranges[i]) { 95 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 96 } 97 return kLatticeUnknown; 98 } 99 return containment; 100 } 101 102 // More makes code generation slower, less makes V8 benchmark score lower. 103 const int kMaxLookaheadForBoyerMoore = 8; 104 // In a 3-character pattern you can maximally step forwards 3 characters 105 // at a time, which is not always enough to pay for the extra logic. 106 const int kPatternTooShortForBoyerMoore = 2; 107 108 // Identifies the sort of regexps where the regexp engine is faster 109 // than the code used for atom matches. 110 static bool HasFewDifferentCharacters(Handle<String> pattern) { 111 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 112 if (length <= kPatternTooShortForBoyerMoore) return false; 113 const int kMod = 128; 114 bool character_found[kMod]; 115 int different = 0; 116 memset(&character_found[0], 0, sizeof(character_found)); 117 for (int i = 0; i < length; i++) { 118 int ch = (pattern->Get(i) & (kMod - 1)); 119 if (!character_found[ch]) { 120 character_found[ch] = true; 121 different++; 122 // We declare a regexp low-alphabet if it has at least 3 times as many 123 // characters as it has different characters. 124 if (different * 3 > length) return false; 125 } 126 } 127 return true; 128 } 129 130 // Generic RegExp methods. Dispatches to implementation specific methods. 131 132 MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re, 133 Handle<String> pattern, 134 JSRegExp::Flags flags) { 135 DCHECK(pattern->IsFlat()); 136 137 Zone zone(isolate->allocator(), ZONE_NAME); 138 CompilationCache* compilation_cache = isolate->compilation_cache(); 139 MaybeHandle<FixedArray> maybe_cached = 140 compilation_cache->LookupRegExp(pattern, flags); 141 Handle<FixedArray> cached; 142 if (maybe_cached.ToHandle(&cached)) { 143 re->set_data(*cached); 144 return re; 145 } 146 147 PostponeInterruptsScope postpone(isolate); 148 RegExpCompileData parse_result; 149 FlatStringReader reader(isolate, pattern); 150 DCHECK(!isolate->has_pending_exception()); 151 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, 152 &parse_result)) { 153 // Throw an exception if we fail to parse the pattern. 154 return ThrowRegExpException(isolate, re, pattern, parse_result.error); 155 } 156 157 bool has_been_compiled = false; 158 159 if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) && 160 !HasFewDifferentCharacters(pattern)) { 161 // Parse-tree is a single atom that is equal to the pattern. 162 AtomCompile(isolate, re, pattern, flags, pattern); 163 has_been_compiled = true; 164 } else if (parse_result.tree->IsAtom() && !IsSticky(flags) && 165 parse_result.capture_count == 0) { 166 RegExpAtom* atom = parse_result.tree->AsAtom(); 167 Vector<const uc16> atom_pattern = atom->data(); 168 Handle<String> atom_string; 169 ASSIGN_RETURN_ON_EXCEPTION( 170 isolate, atom_string, 171 isolate->factory()->NewStringFromTwoByte(atom_pattern), Object); 172 if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) { 173 AtomCompile(isolate, re, pattern, flags, atom_string); 174 has_been_compiled = true; 175 } 176 } 177 if (!has_been_compiled) { 178 IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count); 179 } 180 DCHECK(re->data()->IsFixedArray()); 181 // Compilation succeeded so the data is set on the regexp 182 // and we can store it in the cache. 183 Handle<FixedArray> data(FixedArray::cast(re->data()), isolate); 184 compilation_cache->PutRegExp(pattern, flags, data); 185 186 return re; 187 } 188 189 MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp, 190 Handle<String> subject, int index, 191 Handle<RegExpMatchInfo> last_match_info) { 192 switch (regexp->TypeTag()) { 193 case JSRegExp::ATOM: 194 return AtomExec(isolate, regexp, subject, index, last_match_info); 195 case JSRegExp::IRREGEXP: { 196 return IrregexpExec(isolate, regexp, subject, index, last_match_info); 197 } 198 default: 199 UNREACHABLE(); 200 } 201 } 202 203 204 // RegExp Atom implementation: Simple string search using indexOf. 205 206 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re, 207 Handle<String> pattern, JSRegExp::Flags flags, 208 Handle<String> match_pattern) { 209 isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags, 210 match_pattern); 211 } 212 213 static void SetAtomLastCapture(Isolate* isolate, 214 Handle<RegExpMatchInfo> last_match_info, 215 String* subject, int from, int to) { 216 SealHandleScope shs(isolate); 217 last_match_info->SetNumberOfCaptureRegisters(2); 218 last_match_info->SetLastSubject(subject); 219 last_match_info->SetLastInput(subject); 220 last_match_info->SetCapture(0, from); 221 last_match_info->SetCapture(1, to); 222 } 223 224 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 225 Handle<String> subject, int index, int32_t* output, 226 int output_size) { 227 DCHECK_LE(0, index); 228 DCHECK_LE(index, subject->length()); 229 230 subject = String::Flatten(isolate, subject); 231 DisallowHeapAllocation no_gc; // ensure vectors stay valid 232 233 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 234 int needle_len = needle->length(); 235 DCHECK(needle->IsFlat()); 236 DCHECK_LT(0, needle_len); 237 238 if (index + needle_len > subject->length()) { 239 return RegExpImpl::RE_FAILURE; 240 } 241 242 for (int i = 0; i < output_size; i += 2) { 243 String::FlatContent needle_content = needle->GetFlatContent(); 244 String::FlatContent subject_content = subject->GetFlatContent(); 245 DCHECK(needle_content.IsFlat()); 246 DCHECK(subject_content.IsFlat()); 247 // dispatch on type of strings 248 index = 249 (needle_content.IsOneByte() 250 ? (subject_content.IsOneByte() 251 ? SearchString(isolate, subject_content.ToOneByteVector(), 252 needle_content.ToOneByteVector(), index) 253 : SearchString(isolate, subject_content.ToUC16Vector(), 254 needle_content.ToOneByteVector(), index)) 255 : (subject_content.IsOneByte() 256 ? SearchString(isolate, subject_content.ToOneByteVector(), 257 needle_content.ToUC16Vector(), index) 258 : SearchString(isolate, subject_content.ToUC16Vector(), 259 needle_content.ToUC16Vector(), index))); 260 if (index == -1) { 261 return i / 2; // Return number of matches. 262 } else { 263 output[i] = index; 264 output[i+1] = index + needle_len; 265 index += needle_len; 266 } 267 } 268 return output_size / 2; 269 } 270 271 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re, 272 Handle<String> subject, int index, 273 Handle<RegExpMatchInfo> last_match_info) { 274 static const int kNumRegisters = 2; 275 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 276 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 277 278 int res = 279 AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters); 280 281 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 282 283 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS); 284 SealHandleScope shs(isolate); 285 SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0], 286 output_registers[1]); 287 return last_match_info; 288 } 289 290 291 // Irregexp implementation. 292 293 // Ensures that the regexp object contains a compiled version of the 294 // source for either one-byte or two-byte subject strings. 295 // If the compiled version doesn't already exist, it is compiled 296 // from the source pattern. 297 // If compilation fails, an exception is thrown and this function 298 // returns false. 299 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re, 300 Handle<String> sample_subject, 301 bool is_one_byte) { 302 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); 303 #ifdef V8_INTERPRETED_REGEXP 304 if (compiled_code->IsByteArray()) return true; 305 #else // V8_INTERPRETED_REGEXP (RegExp native code) 306 if (compiled_code->IsCode()) return true; 307 #endif 308 return CompileIrregexp(isolate, re, sample_subject, is_one_byte); 309 } 310 311 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, 312 Handle<String> sample_subject, 313 bool is_one_byte) { 314 // Compile the RegExp. 315 Zone zone(isolate->allocator(), ZONE_NAME); 316 PostponeInterruptsScope postpone(isolate); 317 #ifdef DEBUG 318 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); 319 // When arriving here entry can only be a smi representing an uncompiled 320 // regexp. 321 DCHECK(entry->IsSmi()); 322 int entry_value = Smi::ToInt(entry); 323 DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value); 324 #endif 325 326 JSRegExp::Flags flags = re->GetFlags(); 327 328 Handle<String> pattern(re->Pattern(), isolate); 329 pattern = String::Flatten(isolate, pattern); 330 RegExpCompileData compile_data; 331 FlatStringReader reader(isolate, pattern); 332 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, 333 &compile_data)) { 334 // Throw an exception if we fail to parse the pattern. 335 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 336 USE(ThrowRegExpException(isolate, re, pattern, compile_data.error)); 337 return false; 338 } 339 RegExpEngine::CompilationResult result = 340 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern, 341 sample_subject, is_one_byte); 342 if (result.error_message != nullptr) { 343 // Unable to compile regexp. 344 if (FLAG_abort_on_stack_or_string_length_overflow && 345 strncmp(result.error_message, "Stack overflow", 15) == 0) { 346 FATAL("Aborting on stack overflow"); 347 } 348 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 349 CStrVector(result.error_message)).ToHandleChecked(); 350 ThrowRegExpException(isolate, re, error_message); 351 return false; 352 } 353 354 Handle<FixedArray> data = 355 Handle<FixedArray>(FixedArray::cast(re->data()), isolate); 356 data->set(JSRegExp::code_index(is_one_byte), result.code); 357 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map); 358 int register_max = IrregexpMaxRegisterCount(*data); 359 if (result.num_registers > register_max) { 360 SetIrregexpMaxRegisterCount(*data, result.num_registers); 361 } 362 363 return true; 364 } 365 366 367 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 368 return Smi::cast( 369 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 370 } 371 372 373 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 374 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 375 } 376 377 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re, 378 Handle<FixedArray> value) { 379 if (value.is_null()) { 380 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero); 381 } else { 382 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value); 383 } 384 } 385 386 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 387 return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex)); 388 } 389 390 391 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 392 return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex)); 393 } 394 395 396 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { 397 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); 398 } 399 400 401 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { 402 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); 403 } 404 405 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re, 406 Handle<String> pattern, 407 JSRegExp::Flags flags, int capture_count) { 408 // Initialize compiled code entries to null. 409 isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern, 410 flags, capture_count); 411 } 412 413 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp, 414 Handle<String> subject) { 415 DCHECK(subject->IsFlat()); 416 417 // Check representation of the underlying storage. 418 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 419 if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1; 420 421 #ifdef V8_INTERPRETED_REGEXP 422 // Byte-code regexp needs space allocated for all its registers. 423 // The result captures are copied to the start of the registers array 424 // if the match succeeds. This way those registers are not clobbered 425 // when we set the last match info from last successful match. 426 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 427 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 428 #else // V8_INTERPRETED_REGEXP 429 // Native regexp only needs room to output captures. Registers are handled 430 // internally. 431 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 432 #endif // V8_INTERPRETED_REGEXP 433 } 434 435 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp, 436 Handle<String> subject, int index, 437 int32_t* output, int output_size) { 438 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 439 440 DCHECK_LE(0, index); 441 DCHECK_LE(index, subject->length()); 442 DCHECK(subject->IsFlat()); 443 444 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 445 446 #ifndef V8_INTERPRETED_REGEXP 447 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 448 do { 449 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte); 450 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); 451 // The stack is used to allocate registers for the compiled regexp code. 452 // This means that in case of failure, the output registers array is left 453 // untouched and contains the capture results from the previous successful 454 // match. We can use that to set the last match info lazily. 455 NativeRegExpMacroAssembler::Result res = 456 NativeRegExpMacroAssembler::Match(code, 457 subject, 458 output, 459 output_size, 460 index, 461 isolate); 462 if (res != NativeRegExpMacroAssembler::RETRY) { 463 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 464 isolate->has_pending_exception()); 465 STATIC_ASSERT( 466 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 467 STATIC_ASSERT( 468 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 469 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 470 == RE_EXCEPTION); 471 return static_cast<IrregexpResult>(res); 472 } 473 // If result is RETRY, the string has changed representation, and we 474 // must restart from scratch. 475 // In this case, it means we must make sure we are prepared to handle 476 // the, potentially, different subject (the string can switch between 477 // being internal and external, and even between being Latin1 and UC16, 478 // but the characters are always the same). 479 IrregexpPrepare(isolate, regexp, subject); 480 is_one_byte = subject->IsOneByteRepresentationUnderneath(); 481 } while (true); 482 UNREACHABLE(); 483 #else // V8_INTERPRETED_REGEXP 484 485 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); 486 // We must have done EnsureCompiledIrregexp, so we can get the number of 487 // registers. 488 int number_of_capture_registers = 489 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 490 int32_t* raw_output = &output[number_of_capture_registers]; 491 // We do not touch the actual capture result registers until we know there 492 // has been a match so that we can use those capture results to set the 493 // last match info. 494 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 495 raw_output[i] = -1; 496 } 497 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), 498 isolate); 499 500 IrregexpResult result = IrregexpInterpreter::Match(isolate, 501 byte_codes, 502 subject, 503 raw_output, 504 index); 505 if (result == RE_SUCCESS) { 506 // Copy capture results to the start of the registers array. 507 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 508 } 509 if (result == RE_EXCEPTION) { 510 DCHECK(!isolate->has_pending_exception()); 511 isolate->StackOverflow(); 512 } 513 return result; 514 #endif // V8_INTERPRETED_REGEXP 515 } 516 517 MaybeHandle<Object> RegExpImpl::IrregexpExec( 518 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject, 519 int previous_index, Handle<RegExpMatchInfo> last_match_info) { 520 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 521 522 subject = String::Flatten(isolate, subject); 523 524 // Prepare space for the return values. 525 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 526 if (FLAG_trace_regexp_bytecodes) { 527 String* pattern = regexp->Pattern(); 528 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 529 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 530 } 531 #endif 532 int required_registers = 533 RegExpImpl::IrregexpPrepare(isolate, regexp, subject); 534 if (required_registers < 0) { 535 // Compiling failed with an exception. 536 DCHECK(isolate->has_pending_exception()); 537 return MaybeHandle<Object>(); 538 } 539 540 int32_t* output_registers = nullptr; 541 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 542 output_registers = NewArray<int32_t>(required_registers); 543 } 544 std::unique_ptr<int32_t[]> auto_release(output_registers); 545 if (output_registers == nullptr) { 546 output_registers = isolate->jsregexp_static_offsets_vector(); 547 } 548 549 int res = 550 RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index, 551 output_registers, required_registers); 552 if (res == RE_SUCCESS) { 553 int capture_count = 554 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 555 return SetLastMatchInfo(isolate, last_match_info, subject, capture_count, 556 output_registers); 557 } 558 if (res == RE_EXCEPTION) { 559 DCHECK(isolate->has_pending_exception()); 560 return MaybeHandle<Object>(); 561 } 562 DCHECK(res == RE_FAILURE); 563 return isolate->factory()->null_value(); 564 } 565 566 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo( 567 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info, 568 Handle<String> subject, int capture_count, int32_t* match) { 569 // This is the only place where match infos can grow. If, after executing the 570 // regexp, RegExpExecStub finds that the match info is too small, it restarts 571 // execution in RegExpImpl::Exec, which finally grows the match info right 572 // here. 573 574 int capture_register_count = (capture_count + 1) * 2; 575 Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures( 576 isolate, last_match_info, capture_register_count); 577 result->SetNumberOfCaptureRegisters(capture_register_count); 578 579 if (*result != *last_match_info) { 580 // The match info has been reallocated, update the corresponding reference 581 // on the native context. 582 if (*last_match_info == *isolate->regexp_last_match_info()) { 583 isolate->native_context()->set_regexp_last_match_info(*result); 584 } else if (*last_match_info == *isolate->regexp_internal_match_info()) { 585 isolate->native_context()->set_regexp_internal_match_info(*result); 586 } 587 } 588 589 DisallowHeapAllocation no_allocation; 590 if (match != nullptr) { 591 for (int i = 0; i < capture_register_count; i += 2) { 592 result->SetCapture(i, match[i]); 593 result->SetCapture(i + 1, match[i + 1]); 594 } 595 } 596 result->SetLastSubject(*subject); 597 result->SetLastInput(*subject); 598 return result; 599 } 600 601 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 602 Handle<String> subject, Isolate* isolate) 603 : register_array_(nullptr), 604 register_array_size_(0), 605 regexp_(regexp), 606 subject_(subject), 607 isolate_(isolate) { 608 #ifdef V8_INTERPRETED_REGEXP 609 bool interpreted = true; 610 #else 611 bool interpreted = false; 612 #endif // V8_INTERPRETED_REGEXP 613 614 if (regexp_->TypeTag() == JSRegExp::ATOM) { 615 static const int kAtomRegistersPerMatch = 2; 616 registers_per_match_ = kAtomRegistersPerMatch; 617 // There is no distinction between interpreted and native for atom regexps. 618 interpreted = false; 619 } else { 620 registers_per_match_ = 621 RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_); 622 if (registers_per_match_ < 0) { 623 num_matches_ = -1; // Signal exception. 624 return; 625 } 626 } 627 628 DCHECK(IsGlobal(regexp->GetFlags())); 629 if (!interpreted) { 630 register_array_size_ = 631 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 632 max_matches_ = register_array_size_ / registers_per_match_; 633 } else { 634 // Global loop in interpreted regexp is not implemented. We choose 635 // the size of the offsets vector so that it can only store one match. 636 register_array_size_ = registers_per_match_; 637 max_matches_ = 1; 638 } 639 640 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 641 register_array_ = NewArray<int32_t>(register_array_size_); 642 } else { 643 register_array_ = isolate->jsregexp_static_offsets_vector(); 644 } 645 646 // Set state so that fetching the results the first time triggers a call 647 // to the compiled regexp. 648 current_match_index_ = max_matches_ - 1; 649 num_matches_ = max_matches_; 650 DCHECK_LE(2, registers_per_match_); // Each match has at least one capture. 651 DCHECK_GE(register_array_size_, registers_per_match_); 652 int32_t* last_match = 653 ®ister_array_[current_match_index_ * registers_per_match_]; 654 last_match[0] = -1; 655 last_match[1] = 0; 656 } 657 658 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { 659 if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() && 660 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && 661 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { 662 // Advance over the surrogate pair. 663 return last_index + 2; 664 } 665 return last_index + 1; 666 } 667 668 // ------------------------------------------------------------------- 669 // Implementation of the Irregexp regular expression engine. 670 // 671 // The Irregexp regular expression engine is intended to be a complete 672 // implementation of ECMAScript regular expressions. It generates either 673 // bytecodes or native code. 674 675 // The Irregexp regexp engine is structured in three steps. 676 // 1) The parser generates an abstract syntax tree. See ast.cc. 677 // 2) From the AST a node network is created. The nodes are all 678 // subclasses of RegExpNode. The nodes represent states when 679 // executing a regular expression. Several optimizations are 680 // performed on the node network. 681 // 3) From the nodes we generate either byte codes or native code 682 // that can actually execute the regular expression (perform 683 // the search). The code generation step is described in more 684 // detail below. 685 686 // Code generation. 687 // 688 // The nodes are divided into four main categories. 689 // * Choice nodes 690 // These represent places where the regular expression can 691 // match in more than one way. For example on entry to an 692 // alternation (foo|bar) or a repetition (*, +, ? or {}). 693 // * Action nodes 694 // These represent places where some action should be 695 // performed. Examples include recording the current position 696 // in the input string to a register (in order to implement 697 // captures) or other actions on register for example in order 698 // to implement the counters needed for {} repetitions. 699 // * Matching nodes 700 // These attempt to match some element part of the input string. 701 // Examples of elements include character classes, plain strings 702 // or back references. 703 // * End nodes 704 // These are used to implement the actions required on finding 705 // a successful match or failing to find a match. 706 // 707 // The code generated (whether as byte codes or native code) maintains 708 // some state as it runs. This consists of the following elements: 709 // 710 // * The capture registers. Used for string captures. 711 // * Other registers. Used for counters etc. 712 // * The current position. 713 // * The stack of backtracking information. Used when a matching node 714 // fails to find a match and needs to try an alternative. 715 // 716 // Conceptual regular expression execution model: 717 // 718 // There is a simple conceptual model of regular expression execution 719 // which will be presented first. The actual code generated is a more 720 // efficient simulation of the simple conceptual model: 721 // 722 // * Choice nodes are implemented as follows: 723 // For each choice except the last { 724 // push current position 725 // push backtrack code location 726 // <generate code to test for choice> 727 // backtrack code location: 728 // pop current position 729 // } 730 // <generate code to test for last choice> 731 // 732 // * Actions nodes are generated as follows 733 // <push affected registers on backtrack stack> 734 // <generate code to perform action> 735 // push backtrack code location 736 // <generate code to test for following nodes> 737 // backtrack code location: 738 // <pop affected registers to restore their state> 739 // <pop backtrack location from stack and go to it> 740 // 741 // * Matching nodes are generated as follows: 742 // if input string matches at current position 743 // update current position 744 // <generate code to test for following nodes> 745 // else 746 // <pop backtrack location from stack and go to it> 747 // 748 // Thus it can be seen that the current position is saved and restored 749 // by the choice nodes, whereas the registers are saved and restored by 750 // by the action nodes that manipulate them. 751 // 752 // The other interesting aspect of this model is that nodes are generated 753 // at the point where they are needed by a recursive call to Emit(). If 754 // the node has already been code generated then the Emit() call will 755 // generate a jump to the previously generated code instead. In order to 756 // limit recursion it is possible for the Emit() function to put the node 757 // on a work list for later generation and instead generate a jump. The 758 // destination of the jump is resolved later when the code is generated. 759 // 760 // Actual regular expression code generation. 761 // 762 // Code generation is actually more complicated than the above. In order 763 // to improve the efficiency of the generated code some optimizations are 764 // performed 765 // 766 // * Choice nodes have 1-character lookahead. 767 // A choice node looks at the following character and eliminates some of 768 // the choices immediately based on that character. This is not yet 769 // implemented. 770 // * Simple greedy loops store reduced backtracking information. 771 // A quantifier like /.*foo/m will greedily match the whole input. It will 772 // then need to backtrack to a point where it can match "foo". The naive 773 // implementation of this would push each character position onto the 774 // backtracking stack, then pop them off one by one. This would use space 775 // proportional to the length of the input string. However since the "." 776 // can only match in one way and always has a constant length (in this case 777 // of 1) it suffices to store the current position on the top of the stack 778 // once. Matching now becomes merely incrementing the current position and 779 // backtracking becomes decrementing the current position and checking the 780 // result against the stored current position. This is faster and saves 781 // space. 782 // * The current state is virtualized. 783 // This is used to defer expensive operations until it is clear that they 784 // are needed and to generate code for a node more than once, allowing 785 // specialized an efficient versions of the code to be created. This is 786 // explained in the section below. 787 // 788 // Execution state virtualization. 789 // 790 // Instead of emitting code, nodes that manipulate the state can record their 791 // manipulation in an object called the Trace. The Trace object can record a 792 // current position offset, an optional backtrack code location on the top of 793 // the virtualized backtrack stack and some register changes. When a node is 794 // to be emitted it can flush the Trace or update it. Flushing the Trace 795 // will emit code to bring the actual state into line with the virtual state. 796 // Avoiding flushing the state can postpone some work (e.g. updates of capture 797 // registers). Postponing work can save time when executing the regular 798 // expression since it may be found that the work never has to be done as a 799 // failure to match can occur. In addition it is much faster to jump to a 800 // known backtrack code location than it is to pop an unknown backtrack 801 // location from the stack and jump there. 802 // 803 // The virtual state found in the Trace affects code generation. For example 804 // the virtual state contains the difference between the actual current 805 // position and the virtual current position, and matching code needs to use 806 // this offset to attempt a match in the correct location of the input 807 // string. Therefore code generated for a non-trivial trace is specialized 808 // to that trace. The code generator therefore has the ability to generate 809 // code for each node several times. In order to limit the size of the 810 // generated code there is an arbitrary limit on how many specialized sets of 811 // code may be generated for a given node. If the limit is reached, the 812 // trace is flushed and a generic version of the code for a node is emitted. 813 // This is subsequently used for that node. The code emitted for non-generic 814 // trace is not recorded in the node and so it cannot currently be reused in 815 // the event that code generation is requested for an identical trace. 816 817 818 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 819 UNREACHABLE(); 820 } 821 822 823 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 824 text->AddElement(TextElement::Atom(this), zone); 825 } 826 827 828 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 829 text->AddElement(TextElement::CharClass(this), zone); 830 } 831 832 833 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 834 for (int i = 0; i < elements()->length(); i++) 835 text->AddElement(elements()->at(i), zone); 836 } 837 838 839 TextElement TextElement::Atom(RegExpAtom* atom) { 840 return TextElement(ATOM, atom); 841 } 842 843 844 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 845 return TextElement(CHAR_CLASS, char_class); 846 } 847 848 849 int TextElement::length() const { 850 switch (text_type()) { 851 case ATOM: 852 return atom()->length(); 853 854 case CHAR_CLASS: 855 return 1; 856 } 857 UNREACHABLE(); 858 } 859 860 861 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 862 if (table_ == nullptr) { 863 table_ = new(zone()) DispatchTable(zone()); 864 DispatchTableConstructor cons(table_, ignore_case, zone()); 865 cons.BuildTable(this); 866 } 867 return table_; 868 } 869 870 871 class FrequencyCollator { 872 public: 873 FrequencyCollator() : total_samples_(0) { 874 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 875 frequencies_[i] = CharacterFrequency(i); 876 } 877 } 878 879 void CountCharacter(int character) { 880 int index = (character & RegExpMacroAssembler::kTableMask); 881 frequencies_[index].Increment(); 882 total_samples_++; 883 } 884 885 // Does not measure in percent, but rather per-128 (the table size from the 886 // regexp macro assembler). 887 int Frequency(int in_character) { 888 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 889 if (total_samples_ < 1) return 1; // Division by zero. 890 int freq_in_per128 = 891 (frequencies_[in_character].counter() * 128) / total_samples_; 892 return freq_in_per128; 893 } 894 895 private: 896 class CharacterFrequency { 897 public: 898 CharacterFrequency() : counter_(0), character_(-1) { } 899 explicit CharacterFrequency(int character) 900 : counter_(0), character_(character) { } 901 902 void Increment() { counter_++; } 903 int counter() { return counter_; } 904 int character() { return character_; } 905 906 private: 907 int counter_; 908 int character_; 909 }; 910 911 912 private: 913 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 914 int total_samples_; 915 }; 916 917 918 class RegExpCompiler { 919 public: 920 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 921 bool is_one_byte); 922 923 int AllocateRegister() { 924 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 925 reg_exp_too_big_ = true; 926 return next_register_; 927 } 928 return next_register_++; 929 } 930 931 // Lookarounds to match lone surrogates for unicode character class matches 932 // are never nested. We can therefore reuse registers. 933 int UnicodeLookaroundStackRegister() { 934 if (unicode_lookaround_stack_register_ == kNoRegister) { 935 unicode_lookaround_stack_register_ = AllocateRegister(); 936 } 937 return unicode_lookaround_stack_register_; 938 } 939 940 int UnicodeLookaroundPositionRegister() { 941 if (unicode_lookaround_position_register_ == kNoRegister) { 942 unicode_lookaround_position_register_ = AllocateRegister(); 943 } 944 return unicode_lookaround_position_register_; 945 } 946 947 RegExpEngine::CompilationResult Assemble(Isolate* isolate, 948 RegExpMacroAssembler* assembler, 949 RegExpNode* start, int capture_count, 950 Handle<String> pattern); 951 952 inline void AddWork(RegExpNode* node) { 953 if (!node->on_work_list() && !node->label()->is_bound()) { 954 node->set_on_work_list(true); 955 work_list_->push_back(node); 956 } 957 } 958 959 static const int kImplementationOffset = 0; 960 static const int kNumberOfRegistersOffset = 0; 961 static const int kCodeOffset = 1; 962 963 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 964 EndNode* accept() { return accept_; } 965 966 static const int kMaxRecursion = 100; 967 inline int recursion_depth() { return recursion_depth_; } 968 inline void IncrementRecursionDepth() { recursion_depth_++; } 969 inline void DecrementRecursionDepth() { recursion_depth_--; } 970 971 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 972 973 inline bool one_byte() { return one_byte_; } 974 inline bool optimize() { return optimize_; } 975 inline void set_optimize(bool value) { optimize_ = value; } 976 inline bool limiting_recursion() { return limiting_recursion_; } 977 inline void set_limiting_recursion(bool value) { 978 limiting_recursion_ = value; 979 } 980 bool read_backward() { return read_backward_; } 981 void set_read_backward(bool value) { read_backward_ = value; } 982 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 983 984 int current_expansion_factor() { return current_expansion_factor_; } 985 void set_current_expansion_factor(int value) { 986 current_expansion_factor_ = value; 987 } 988 989 Isolate* isolate() const { return isolate_; } 990 Zone* zone() const { return zone_; } 991 992 static const int kNoRegister = -1; 993 994 private: 995 EndNode* accept_; 996 int next_register_; 997 int unicode_lookaround_stack_register_; 998 int unicode_lookaround_position_register_; 999 std::vector<RegExpNode*>* work_list_; 1000 int recursion_depth_; 1001 RegExpMacroAssembler* macro_assembler_; 1002 bool one_byte_; 1003 bool reg_exp_too_big_; 1004 bool limiting_recursion_; 1005 bool optimize_; 1006 bool read_backward_; 1007 int current_expansion_factor_; 1008 FrequencyCollator frequency_collator_; 1009 Isolate* isolate_; 1010 Zone* zone_; 1011 }; 1012 1013 1014 class RecursionCheck { 1015 public: 1016 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1017 compiler->IncrementRecursionDepth(); 1018 } 1019 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1020 private: 1021 RegExpCompiler* compiler_; 1022 }; 1023 1024 1025 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1026 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1027 } 1028 1029 1030 // Attempts to compile the regexp using an Irregexp code generator. Returns 1031 // a fixed array or a null handle depending on whether it succeeded. 1032 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 1033 bool one_byte) 1034 : next_register_(2 * (capture_count + 1)), 1035 unicode_lookaround_stack_register_(kNoRegister), 1036 unicode_lookaround_position_register_(kNoRegister), 1037 work_list_(nullptr), 1038 recursion_depth_(0), 1039 one_byte_(one_byte), 1040 reg_exp_too_big_(false), 1041 limiting_recursion_(false), 1042 optimize_(FLAG_regexp_optimization), 1043 read_backward_(false), 1044 current_expansion_factor_(1), 1045 frequency_collator_(), 1046 isolate_(isolate), 1047 zone_(zone) { 1048 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1049 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1); 1050 } 1051 1052 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1053 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, 1054 int capture_count, Handle<String> pattern) { 1055 #ifdef DEBUG 1056 if (FLAG_trace_regexp_assembler) 1057 macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler); 1058 else 1059 #endif 1060 macro_assembler_ = macro_assembler; 1061 1062 std::vector<RegExpNode*> work_list; 1063 work_list_ = &work_list; 1064 Label fail; 1065 macro_assembler_->PushBacktrack(&fail); 1066 Trace new_trace; 1067 start->Emit(this, &new_trace); 1068 macro_assembler_->Bind(&fail); 1069 macro_assembler_->Fail(); 1070 while (!work_list.empty()) { 1071 RegExpNode* node = work_list.back(); 1072 work_list.pop_back(); 1073 node->set_on_work_list(false); 1074 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 1075 } 1076 if (reg_exp_too_big_) { 1077 macro_assembler_->AbortedCodeGeneration(); 1078 return IrregexpRegExpTooBig(isolate_); 1079 } 1080 1081 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1082 isolate->IncreaseTotalRegexpCodeGenerated(code->Size()); 1083 work_list_ = nullptr; 1084 #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP) 1085 if (FLAG_print_code) { 1086 CodeTracer::Scope trace_scope(isolate->GetCodeTracer()); 1087 OFStream os(trace_scope.file()); 1088 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os); 1089 } 1090 #endif 1091 #ifdef DEBUG 1092 if (FLAG_trace_regexp_assembler) { 1093 delete macro_assembler_; 1094 } 1095 #endif 1096 return RegExpEngine::CompilationResult(*code, next_register_); 1097 } 1098 1099 1100 bool Trace::DeferredAction::Mentions(int that) { 1101 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1102 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1103 return range.Contains(that); 1104 } else { 1105 return reg() == that; 1106 } 1107 } 1108 1109 1110 bool Trace::mentions_reg(int reg) { 1111 for (DeferredAction* action = actions_; action != nullptr; 1112 action = action->next()) { 1113 if (action->Mentions(reg)) 1114 return true; 1115 } 1116 return false; 1117 } 1118 1119 1120 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1121 DCHECK_EQ(0, *cp_offset); 1122 for (DeferredAction* action = actions_; action != nullptr; 1123 action = action->next()) { 1124 if (action->Mentions(reg)) { 1125 if (action->action_type() == ActionNode::STORE_POSITION) { 1126 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1127 return true; 1128 } else { 1129 return false; 1130 } 1131 } 1132 } 1133 return false; 1134 } 1135 1136 1137 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1138 Zone* zone) { 1139 int max_register = RegExpCompiler::kNoRegister; 1140 for (DeferredAction* action = actions_; action != nullptr; 1141 action = action->next()) { 1142 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1143 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1144 for (int i = range.from(); i <= range.to(); i++) 1145 affected_registers->Set(i, zone); 1146 if (range.to() > max_register) max_register = range.to(); 1147 } else { 1148 affected_registers->Set(action->reg(), zone); 1149 if (action->reg() > max_register) max_register = action->reg(); 1150 } 1151 } 1152 return max_register; 1153 } 1154 1155 1156 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1157 int max_register, 1158 const OutSet& registers_to_pop, 1159 const OutSet& registers_to_clear) { 1160 for (int reg = max_register; reg >= 0; reg--) { 1161 if (registers_to_pop.Get(reg)) { 1162 assembler->PopRegister(reg); 1163 } else if (registers_to_clear.Get(reg)) { 1164 int clear_to = reg; 1165 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1166 reg--; 1167 } 1168 assembler->ClearRegisters(reg, clear_to); 1169 } 1170 } 1171 } 1172 1173 1174 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1175 int max_register, 1176 const OutSet& affected_registers, 1177 OutSet* registers_to_pop, 1178 OutSet* registers_to_clear, 1179 Zone* zone) { 1180 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1181 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1182 1183 // Count pushes performed to force a stack limit check occasionally. 1184 int pushes = 0; 1185 1186 for (int reg = 0; reg <= max_register; reg++) { 1187 if (!affected_registers.Get(reg)) { 1188 continue; 1189 } 1190 1191 // The chronologically first deferred action in the trace 1192 // is used to infer the action needed to restore a register 1193 // to its previous state (or not, if it's safe to ignore it). 1194 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1195 DeferredActionUndoType undo_action = IGNORE; 1196 1197 int value = 0; 1198 bool absolute = false; 1199 bool clear = false; 1200 static const int kNoStore = kMinInt; 1201 int store_position = kNoStore; 1202 // This is a little tricky because we are scanning the actions in reverse 1203 // historical order (newest first). 1204 for (DeferredAction* action = actions_; action != nullptr; 1205 action = action->next()) { 1206 if (action->Mentions(reg)) { 1207 switch (action->action_type()) { 1208 case ActionNode::SET_REGISTER: { 1209 Trace::DeferredSetRegister* psr = 1210 static_cast<Trace::DeferredSetRegister*>(action); 1211 if (!absolute) { 1212 value += psr->value(); 1213 absolute = true; 1214 } 1215 // SET_REGISTER is currently only used for newly introduced loop 1216 // counters. They can have a significant previous value if they 1217 // occur in a loop. TODO(lrn): Propagate this information, so 1218 // we can set undo_action to IGNORE if we know there is no value to 1219 // restore. 1220 undo_action = RESTORE; 1221 DCHECK_EQ(store_position, kNoStore); 1222 DCHECK(!clear); 1223 break; 1224 } 1225 case ActionNode::INCREMENT_REGISTER: 1226 if (!absolute) { 1227 value++; 1228 } 1229 DCHECK_EQ(store_position, kNoStore); 1230 DCHECK(!clear); 1231 undo_action = RESTORE; 1232 break; 1233 case ActionNode::STORE_POSITION: { 1234 Trace::DeferredCapture* pc = 1235 static_cast<Trace::DeferredCapture*>(action); 1236 if (!clear && store_position == kNoStore) { 1237 store_position = pc->cp_offset(); 1238 } 1239 1240 // For captures we know that stores and clears alternate. 1241 // Other register, are never cleared, and if the occur 1242 // inside a loop, they might be assigned more than once. 1243 if (reg <= 1) { 1244 // Registers zero and one, aka "capture zero", is 1245 // always set correctly if we succeed. There is no 1246 // need to undo a setting on backtrack, because we 1247 // will set it again or fail. 1248 undo_action = IGNORE; 1249 } else { 1250 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1251 } 1252 DCHECK(!absolute); 1253 DCHECK_EQ(value, 0); 1254 break; 1255 } 1256 case ActionNode::CLEAR_CAPTURES: { 1257 // Since we're scanning in reverse order, if we've already 1258 // set the position we have to ignore historically earlier 1259 // clearing operations. 1260 if (store_position == kNoStore) { 1261 clear = true; 1262 } 1263 undo_action = RESTORE; 1264 DCHECK(!absolute); 1265 DCHECK_EQ(value, 0); 1266 break; 1267 } 1268 default: 1269 UNREACHABLE(); 1270 break; 1271 } 1272 } 1273 } 1274 // Prepare for the undo-action (e.g., push if it's going to be popped). 1275 if (undo_action == RESTORE) { 1276 pushes++; 1277 RegExpMacroAssembler::StackCheckFlag stack_check = 1278 RegExpMacroAssembler::kNoStackLimitCheck; 1279 if (pushes == push_limit) { 1280 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1281 pushes = 0; 1282 } 1283 1284 assembler->PushRegister(reg, stack_check); 1285 registers_to_pop->Set(reg, zone); 1286 } else if (undo_action == CLEAR) { 1287 registers_to_clear->Set(reg, zone); 1288 } 1289 // Perform the chronologically last action (or accumulated increment) 1290 // for the register. 1291 if (store_position != kNoStore) { 1292 assembler->WriteCurrentPositionToRegister(reg, store_position); 1293 } else if (clear) { 1294 assembler->ClearRegisters(reg, reg); 1295 } else if (absolute) { 1296 assembler->SetRegister(reg, value); 1297 } else if (value != 0) { 1298 assembler->AdvanceRegister(reg, value); 1299 } 1300 } 1301 } 1302 1303 1304 // This is called as we come into a loop choice node and some other tricky 1305 // nodes. It normalizes the state of the code generator to ensure we can 1306 // generate generic code. 1307 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1308 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1309 1310 DCHECK(!is_trivial()); 1311 1312 if (actions_ == nullptr && backtrack() == nullptr) { 1313 // Here we just have some deferred cp advances to fix and we are back to 1314 // a normal situation. We may also have to forget some information gained 1315 // through a quick check that was already performed. 1316 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1317 // Create a new trivial state and generate the node with that. 1318 Trace new_state; 1319 successor->Emit(compiler, &new_state); 1320 return; 1321 } 1322 1323 // Generate deferred actions here along with code to undo them again. 1324 OutSet affected_registers; 1325 1326 if (backtrack() != nullptr) { 1327 // Here we have a concrete backtrack location. These are set up by choice 1328 // nodes and so they indicate that we have a deferred save of the current 1329 // position which we may need to emit here. 1330 assembler->PushCurrentPosition(); 1331 } 1332 1333 int max_register = FindAffectedRegisters(&affected_registers, 1334 compiler->zone()); 1335 OutSet registers_to_pop; 1336 OutSet registers_to_clear; 1337 PerformDeferredActions(assembler, 1338 max_register, 1339 affected_registers, 1340 ®isters_to_pop, 1341 ®isters_to_clear, 1342 compiler->zone()); 1343 if (cp_offset_ != 0) { 1344 assembler->AdvanceCurrentPosition(cp_offset_); 1345 } 1346 1347 // Create a new trivial state and generate the node with that. 1348 Label undo; 1349 assembler->PushBacktrack(&undo); 1350 if (successor->KeepRecursing(compiler)) { 1351 Trace new_state; 1352 successor->Emit(compiler, &new_state); 1353 } else { 1354 compiler->AddWork(successor); 1355 assembler->GoTo(successor->label()); 1356 } 1357 1358 // On backtrack we need to restore state. 1359 assembler->Bind(&undo); 1360 RestoreAffectedRegisters(assembler, 1361 max_register, 1362 registers_to_pop, 1363 registers_to_clear); 1364 if (backtrack() == nullptr) { 1365 assembler->Backtrack(); 1366 } else { 1367 assembler->PopCurrentPosition(); 1368 assembler->GoTo(backtrack()); 1369 } 1370 } 1371 1372 1373 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1374 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1375 1376 // Omit flushing the trace. We discard the entire stack frame anyway. 1377 1378 if (!label()->is_bound()) { 1379 // We are completely independent of the trace, since we ignore it, 1380 // so this code can be used as the generic version. 1381 assembler->Bind(label()); 1382 } 1383 1384 // Throw away everything on the backtrack stack since the start 1385 // of the negative submatch and restore the character position. 1386 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1387 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1388 if (clear_capture_count_ > 0) { 1389 // Clear any captures that might have been performed during the success 1390 // of the body of the negative look-ahead. 1391 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1392 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1393 } 1394 // Now that we have unwound the stack we find at the top of the stack the 1395 // backtrack that the BeginSubmatch node got. 1396 assembler->Backtrack(); 1397 } 1398 1399 1400 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1401 if (!trace->is_trivial()) { 1402 trace->Flush(compiler, this); 1403 return; 1404 } 1405 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1406 if (!label()->is_bound()) { 1407 assembler->Bind(label()); 1408 } 1409 switch (action_) { 1410 case ACCEPT: 1411 assembler->Succeed(); 1412 return; 1413 case BACKTRACK: 1414 assembler->GoTo(trace->backtrack()); 1415 return; 1416 case NEGATIVE_SUBMATCH_SUCCESS: 1417 // This case is handled in a different virtual method. 1418 UNREACHABLE(); 1419 } 1420 UNIMPLEMENTED(); 1421 } 1422 1423 1424 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1425 if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone); 1426 guards_->Add(guard, zone); 1427 } 1428 1429 1430 ActionNode* ActionNode::SetRegister(int reg, 1431 int val, 1432 RegExpNode* on_success) { 1433 ActionNode* result = 1434 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1435 result->data_.u_store_register.reg = reg; 1436 result->data_.u_store_register.value = val; 1437 return result; 1438 } 1439 1440 1441 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1442 ActionNode* result = 1443 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1444 result->data_.u_increment_register.reg = reg; 1445 return result; 1446 } 1447 1448 1449 ActionNode* ActionNode::StorePosition(int reg, 1450 bool is_capture, 1451 RegExpNode* on_success) { 1452 ActionNode* result = 1453 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1454 result->data_.u_position_register.reg = reg; 1455 result->data_.u_position_register.is_capture = is_capture; 1456 return result; 1457 } 1458 1459 1460 ActionNode* ActionNode::ClearCaptures(Interval range, 1461 RegExpNode* on_success) { 1462 ActionNode* result = 1463 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1464 result->data_.u_clear_captures.range_from = range.from(); 1465 result->data_.u_clear_captures.range_to = range.to(); 1466 return result; 1467 } 1468 1469 1470 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1471 int position_reg, 1472 RegExpNode* on_success) { 1473 ActionNode* result = 1474 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1475 result->data_.u_submatch.stack_pointer_register = stack_reg; 1476 result->data_.u_submatch.current_position_register = position_reg; 1477 return result; 1478 } 1479 1480 1481 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1482 int position_reg, 1483 int clear_register_count, 1484 int clear_register_from, 1485 RegExpNode* on_success) { 1486 ActionNode* result = 1487 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1488 result->data_.u_submatch.stack_pointer_register = stack_reg; 1489 result->data_.u_submatch.current_position_register = position_reg; 1490 result->data_.u_submatch.clear_register_count = clear_register_count; 1491 result->data_.u_submatch.clear_register_from = clear_register_from; 1492 return result; 1493 } 1494 1495 1496 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1497 int repetition_register, 1498 int repetition_limit, 1499 RegExpNode* on_success) { 1500 ActionNode* result = 1501 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1502 result->data_.u_empty_match_check.start_register = start_register; 1503 result->data_.u_empty_match_check.repetition_register = repetition_register; 1504 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1505 return result; 1506 } 1507 1508 1509 #define DEFINE_ACCEPT(Type) \ 1510 void Type##Node::Accept(NodeVisitor* visitor) { \ 1511 visitor->Visit##Type(this); \ 1512 } 1513 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1514 #undef DEFINE_ACCEPT 1515 1516 1517 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1518 visitor->VisitLoopChoice(this); 1519 } 1520 1521 1522 // ------------------------------------------------------------------- 1523 // Emit code. 1524 1525 1526 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1527 Guard* guard, 1528 Trace* trace) { 1529 switch (guard->op()) { 1530 case Guard::LT: 1531 DCHECK(!trace->mentions_reg(guard->reg())); 1532 macro_assembler->IfRegisterGE(guard->reg(), 1533 guard->value(), 1534 trace->backtrack()); 1535 break; 1536 case Guard::GEQ: 1537 DCHECK(!trace->mentions_reg(guard->reg())); 1538 macro_assembler->IfRegisterLT(guard->reg(), 1539 guard->value(), 1540 trace->backtrack()); 1541 break; 1542 } 1543 } 1544 1545 1546 // Returns the number of characters in the equivalence class, omitting those 1547 // that cannot occur in the source string because it is Latin1. 1548 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1549 bool one_byte_subject, 1550 unibrow::uchar* letters) { 1551 int length = 1552 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1553 // Unibrow returns 0 or 1 for characters where case independence is 1554 // trivial. 1555 if (length == 0) { 1556 letters[0] = character; 1557 length = 1; 1558 } 1559 1560 if (one_byte_subject) { 1561 int new_length = 0; 1562 for (int i = 0; i < length; i++) { 1563 if (letters[i] <= String::kMaxOneByteCharCode) { 1564 letters[new_length++] = letters[i]; 1565 } 1566 } 1567 length = new_length; 1568 } 1569 1570 return length; 1571 } 1572 1573 1574 static inline bool EmitSimpleCharacter(Isolate* isolate, 1575 RegExpCompiler* compiler, 1576 uc16 c, 1577 Label* on_failure, 1578 int cp_offset, 1579 bool check, 1580 bool preloaded) { 1581 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1582 bool bound_checked = false; 1583 if (!preloaded) { 1584 assembler->LoadCurrentCharacter( 1585 cp_offset, 1586 on_failure, 1587 check); 1588 bound_checked = true; 1589 } 1590 assembler->CheckNotCharacter(c, on_failure); 1591 return bound_checked; 1592 } 1593 1594 1595 // Only emits non-letters (things that don't have case). Only used for case 1596 // independent matches. 1597 static inline bool EmitAtomNonLetter(Isolate* isolate, 1598 RegExpCompiler* compiler, 1599 uc16 c, 1600 Label* on_failure, 1601 int cp_offset, 1602 bool check, 1603 bool preloaded) { 1604 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1605 bool one_byte = compiler->one_byte(); 1606 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1607 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1608 if (length < 1) { 1609 // This can't match. Must be an one-byte subject and a non-one-byte 1610 // character. We do not need to do anything since the one-byte pass 1611 // already handled this. 1612 return false; // Bounds not checked. 1613 } 1614 bool checked = false; 1615 // We handle the length > 1 case in a later pass. 1616 if (length == 1) { 1617 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1618 // Can't match - see above. 1619 return false; // Bounds not checked. 1620 } 1621 if (!preloaded) { 1622 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1623 checked = check; 1624 } 1625 macro_assembler->CheckNotCharacter(c, on_failure); 1626 } 1627 return checked; 1628 } 1629 1630 1631 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1632 bool one_byte, uc16 c1, uc16 c2, 1633 Label* on_failure) { 1634 uc16 char_mask; 1635 if (one_byte) { 1636 char_mask = String::kMaxOneByteCharCode; 1637 } else { 1638 char_mask = String::kMaxUtf16CodeUnit; 1639 } 1640 uc16 exor = c1 ^ c2; 1641 // Check whether exor has only one bit set. 1642 if (((exor - 1) & exor) == 0) { 1643 // If c1 and c2 differ only by one bit. 1644 // Ecma262UnCanonicalize always gives the highest number last. 1645 DCHECK(c2 > c1); 1646 uc16 mask = char_mask ^ exor; 1647 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1648 return true; 1649 } 1650 DCHECK(c2 > c1); 1651 uc16 diff = c2 - c1; 1652 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1653 // If the characters differ by 2^n but don't differ by one bit then 1654 // subtract the difference from the found character, then do the or 1655 // trick. We avoid the theoretical case where negative numbers are 1656 // involved in order to simplify code generation. 1657 uc16 mask = char_mask ^ diff; 1658 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1659 diff, 1660 mask, 1661 on_failure); 1662 return true; 1663 } 1664 return false; 1665 } 1666 1667 1668 typedef bool EmitCharacterFunction(Isolate* isolate, 1669 RegExpCompiler* compiler, 1670 uc16 c, 1671 Label* on_failure, 1672 int cp_offset, 1673 bool check, 1674 bool preloaded); 1675 1676 // Only emits letters (things that have case). Only used for case independent 1677 // matches. 1678 static inline bool EmitAtomLetter(Isolate* isolate, 1679 RegExpCompiler* compiler, 1680 uc16 c, 1681 Label* on_failure, 1682 int cp_offset, 1683 bool check, 1684 bool preloaded) { 1685 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1686 bool one_byte = compiler->one_byte(); 1687 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1688 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1689 if (length <= 1) return false; 1690 // We may not need to check against the end of the input string 1691 // if this character lies before a character that matched. 1692 if (!preloaded) { 1693 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1694 } 1695 Label ok; 1696 DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth); 1697 switch (length) { 1698 case 2: { 1699 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1700 chars[1], on_failure)) { 1701 } else { 1702 macro_assembler->CheckCharacter(chars[0], &ok); 1703 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1704 macro_assembler->Bind(&ok); 1705 } 1706 break; 1707 } 1708 case 4: 1709 macro_assembler->CheckCharacter(chars[3], &ok); 1710 V8_FALLTHROUGH; 1711 case 3: 1712 macro_assembler->CheckCharacter(chars[0], &ok); 1713 macro_assembler->CheckCharacter(chars[1], &ok); 1714 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1715 macro_assembler->Bind(&ok); 1716 break; 1717 default: 1718 UNREACHABLE(); 1719 break; 1720 } 1721 return true; 1722 } 1723 1724 1725 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1726 int border, 1727 Label* fall_through, 1728 Label* above_or_equal, 1729 Label* below) { 1730 if (below != fall_through) { 1731 masm->CheckCharacterLT(border, below); 1732 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1733 } else { 1734 masm->CheckCharacterGT(border - 1, above_or_equal); 1735 } 1736 } 1737 1738 1739 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1740 int first, 1741 int last, 1742 Label* fall_through, 1743 Label* in_range, 1744 Label* out_of_range) { 1745 if (in_range == fall_through) { 1746 if (first == last) { 1747 masm->CheckNotCharacter(first, out_of_range); 1748 } else { 1749 masm->CheckCharacterNotInRange(first, last, out_of_range); 1750 } 1751 } else { 1752 if (first == last) { 1753 masm->CheckCharacter(first, in_range); 1754 } else { 1755 masm->CheckCharacterInRange(first, last, in_range); 1756 } 1757 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1758 } 1759 } 1760 1761 1762 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1763 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1764 static void EmitUseLookupTable( 1765 RegExpMacroAssembler* masm, 1766 ZoneList<int>* ranges, 1767 int start_index, 1768 int end_index, 1769 int min_char, 1770 Label* fall_through, 1771 Label* even_label, 1772 Label* odd_label) { 1773 static const int kSize = RegExpMacroAssembler::kTableSize; 1774 static const int kMask = RegExpMacroAssembler::kTableMask; 1775 1776 int base = (min_char & ~kMask); 1777 USE(base); 1778 1779 // Assert that everything is on one kTableSize page. 1780 for (int i = start_index; i <= end_index; i++) { 1781 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1782 } 1783 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1784 1785 char templ[kSize]; 1786 Label* on_bit_set; 1787 Label* on_bit_clear; 1788 int bit; 1789 if (even_label == fall_through) { 1790 on_bit_set = odd_label; 1791 on_bit_clear = even_label; 1792 bit = 1; 1793 } else { 1794 on_bit_set = even_label; 1795 on_bit_clear = odd_label; 1796 bit = 0; 1797 } 1798 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1799 templ[i] = bit; 1800 } 1801 int j = 0; 1802 bit ^= 1; 1803 for (int i = start_index; i < end_index; i++) { 1804 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1805 templ[j] = bit; 1806 } 1807 bit ^= 1; 1808 } 1809 for (int i = j; i < kSize; i++) { 1810 templ[i] = bit; 1811 } 1812 Factory* factory = masm->isolate()->factory(); 1813 // TODO(erikcorry): Cache these. 1814 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1815 for (int i = 0; i < kSize; i++) { 1816 ba->set(i, templ[i]); 1817 } 1818 masm->CheckBitInTable(ba, on_bit_set); 1819 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1820 } 1821 1822 1823 static void CutOutRange(RegExpMacroAssembler* masm, 1824 ZoneList<int>* ranges, 1825 int start_index, 1826 int end_index, 1827 int cut_index, 1828 Label* even_label, 1829 Label* odd_label) { 1830 bool odd = (((cut_index - start_index) & 1) == 1); 1831 Label* in_range_label = odd ? odd_label : even_label; 1832 Label dummy; 1833 EmitDoubleBoundaryTest(masm, 1834 ranges->at(cut_index), 1835 ranges->at(cut_index + 1) - 1, 1836 &dummy, 1837 in_range_label, 1838 &dummy); 1839 DCHECK(!dummy.is_linked()); 1840 // Cut out the single range by rewriting the array. This creates a new 1841 // range that is a merger of the two ranges on either side of the one we 1842 // are cutting out. The oddity of the labels is preserved. 1843 for (int j = cut_index; j > start_index; j--) { 1844 ranges->at(j) = ranges->at(j - 1); 1845 } 1846 for (int j = cut_index + 1; j < end_index; j++) { 1847 ranges->at(j) = ranges->at(j + 1); 1848 } 1849 } 1850 1851 1852 // Unicode case. Split the search space into kSize spaces that are handled 1853 // with recursion. 1854 static void SplitSearchSpace(ZoneList<int>* ranges, 1855 int start_index, 1856 int end_index, 1857 int* new_start_index, 1858 int* new_end_index, 1859 int* border) { 1860 static const int kSize = RegExpMacroAssembler::kTableSize; 1861 static const int kMask = RegExpMacroAssembler::kTableMask; 1862 1863 int first = ranges->at(start_index); 1864 int last = ranges->at(end_index) - 1; 1865 1866 *new_start_index = start_index; 1867 *border = (ranges->at(start_index) & ~kMask) + kSize; 1868 while (*new_start_index < end_index) { 1869 if (ranges->at(*new_start_index) > *border) break; 1870 (*new_start_index)++; 1871 } 1872 // new_start_index is the index of the first edge that is beyond the 1873 // current kSize space. 1874 1875 // For very large search spaces we do a binary chop search of the non-Latin1 1876 // space instead of just going to the end of the current kSize space. The 1877 // heuristics are complicated a little by the fact that any 128-character 1878 // encoding space can be quickly tested with a table lookup, so we don't 1879 // wish to do binary chop search at a smaller granularity than that. A 1880 // 128-character space can take up a lot of space in the ranges array if, 1881 // for example, we only want to match every second character (eg. the lower 1882 // case characters on some Unicode pages). 1883 int binary_chop_index = (end_index + start_index) / 2; 1884 // The first test ensures that we get to the code that handles the Latin1 1885 // range with a single not-taken branch, speeding up this important 1886 // character range (even non-Latin1 charset-based text has spaces and 1887 // punctuation). 1888 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1889 end_index - start_index > (*new_start_index - start_index) * 2 && 1890 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1891 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1892 int scan_forward_for_section_border = binary_chop_index;; 1893 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1894 1895 while (scan_forward_for_section_border < end_index) { 1896 if (ranges->at(scan_forward_for_section_border) > new_border) { 1897 *new_start_index = scan_forward_for_section_border; 1898 *border = new_border; 1899 break; 1900 } 1901 scan_forward_for_section_border++; 1902 } 1903 } 1904 1905 DCHECK(*new_start_index > start_index); 1906 *new_end_index = *new_start_index - 1; 1907 if (ranges->at(*new_end_index) == *border) { 1908 (*new_end_index)--; 1909 } 1910 if (*border >= ranges->at(end_index)) { 1911 *border = ranges->at(end_index); 1912 *new_start_index = end_index; // Won't be used. 1913 *new_end_index = end_index - 1; 1914 } 1915 } 1916 1917 // Gets a series of segment boundaries representing a character class. If the 1918 // character is in the range between an even and an odd boundary (counting from 1919 // start_index) then go to even_label, otherwise go to odd_label. We already 1920 // know that the character is in the range of min_char to max_char inclusive. 1921 // Either label can be nullptr indicating backtracking. Either label can also 1922 // be equal to the fall_through label. 1923 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges, 1924 int start_index, int end_index, uc32 min_char, 1925 uc32 max_char, Label* fall_through, 1926 Label* even_label, Label* odd_label) { 1927 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); 1928 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); 1929 1930 int first = ranges->at(start_index); 1931 int last = ranges->at(end_index) - 1; 1932 1933 DCHECK_LT(min_char, first); 1934 1935 // Just need to test if the character is before or on-or-after 1936 // a particular character. 1937 if (start_index == end_index) { 1938 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1939 return; 1940 } 1941 1942 // Another almost trivial case: There is one interval in the middle that is 1943 // different from the end intervals. 1944 if (start_index + 1 == end_index) { 1945 EmitDoubleBoundaryTest( 1946 masm, first, last, fall_through, even_label, odd_label); 1947 return; 1948 } 1949 1950 // It's not worth using table lookup if there are very few intervals in the 1951 // character class. 1952 if (end_index - start_index <= 6) { 1953 // It is faster to test for individual characters, so we look for those 1954 // first, then try arbitrary ranges in the second round. 1955 static int kNoCutIndex = -1; 1956 int cut = kNoCutIndex; 1957 for (int i = start_index; i < end_index; i++) { 1958 if (ranges->at(i) == ranges->at(i + 1) - 1) { 1959 cut = i; 1960 break; 1961 } 1962 } 1963 if (cut == kNoCutIndex) cut = start_index; 1964 CutOutRange( 1965 masm, ranges, start_index, end_index, cut, even_label, odd_label); 1966 DCHECK_GE(end_index - start_index, 2); 1967 GenerateBranches(masm, 1968 ranges, 1969 start_index + 1, 1970 end_index - 1, 1971 min_char, 1972 max_char, 1973 fall_through, 1974 even_label, 1975 odd_label); 1976 return; 1977 } 1978 1979 // If there are a lot of intervals in the regexp, then we will use tables to 1980 // determine whether the character is inside or outside the character class. 1981 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 1982 1983 if ((max_char >> kBits) == (min_char >> kBits)) { 1984 EmitUseLookupTable(masm, 1985 ranges, 1986 start_index, 1987 end_index, 1988 min_char, 1989 fall_through, 1990 even_label, 1991 odd_label); 1992 return; 1993 } 1994 1995 if ((min_char >> kBits) != (first >> kBits)) { 1996 masm->CheckCharacterLT(first, odd_label); 1997 GenerateBranches(masm, 1998 ranges, 1999 start_index + 1, 2000 end_index, 2001 first, 2002 max_char, 2003 fall_through, 2004 odd_label, 2005 even_label); 2006 return; 2007 } 2008 2009 int new_start_index = 0; 2010 int new_end_index = 0; 2011 int border = 0; 2012 2013 SplitSearchSpace(ranges, 2014 start_index, 2015 end_index, 2016 &new_start_index, 2017 &new_end_index, 2018 &border); 2019 2020 Label handle_rest; 2021 Label* above = &handle_rest; 2022 if (border == last + 1) { 2023 // We didn't find any section that started after the limit, so everything 2024 // above the border is one of the terminal labels. 2025 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2026 DCHECK(new_end_index == end_index - 1); 2027 } 2028 2029 DCHECK_LE(start_index, new_end_index); 2030 DCHECK_LE(new_start_index, end_index); 2031 DCHECK_LT(start_index, new_start_index); 2032 DCHECK_LT(new_end_index, end_index); 2033 DCHECK(new_end_index + 1 == new_start_index || 2034 (new_end_index + 2 == new_start_index && 2035 border == ranges->at(new_end_index + 1))); 2036 DCHECK_LT(min_char, border - 1); 2037 DCHECK_LT(border, max_char); 2038 DCHECK_LT(ranges->at(new_end_index), border); 2039 DCHECK(border < ranges->at(new_start_index) || 2040 (border == ranges->at(new_start_index) && 2041 new_start_index == end_index && 2042 new_end_index == end_index - 1 && 2043 border == last + 1)); 2044 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2045 2046 masm->CheckCharacterGT(border - 1, above); 2047 Label dummy; 2048 GenerateBranches(masm, 2049 ranges, 2050 start_index, 2051 new_end_index, 2052 min_char, 2053 border - 1, 2054 &dummy, 2055 even_label, 2056 odd_label); 2057 if (handle_rest.is_linked()) { 2058 masm->Bind(&handle_rest); 2059 bool flip = (new_start_index & 1) != (start_index & 1); 2060 GenerateBranches(masm, 2061 ranges, 2062 new_start_index, 2063 end_index, 2064 border, 2065 max_char, 2066 &dummy, 2067 flip ? odd_label : even_label, 2068 flip ? even_label : odd_label); 2069 } 2070 } 2071 2072 2073 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2074 RegExpCharacterClass* cc, bool one_byte, 2075 Label* on_failure, int cp_offset, bool check_offset, 2076 bool preloaded, Zone* zone) { 2077 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2078 CharacterRange::Canonicalize(ranges); 2079 2080 int max_char; 2081 if (one_byte) { 2082 max_char = String::kMaxOneByteCharCode; 2083 } else { 2084 max_char = String::kMaxUtf16CodeUnit; 2085 } 2086 2087 int range_count = ranges->length(); 2088 2089 int last_valid_range = range_count - 1; 2090 while (last_valid_range >= 0) { 2091 CharacterRange& range = ranges->at(last_valid_range); 2092 if (range.from() <= max_char) { 2093 break; 2094 } 2095 last_valid_range--; 2096 } 2097 2098 if (last_valid_range < 0) { 2099 if (!cc->is_negated()) { 2100 macro_assembler->GoTo(on_failure); 2101 } 2102 if (check_offset) { 2103 macro_assembler->CheckPosition(cp_offset, on_failure); 2104 } 2105 return; 2106 } 2107 2108 if (last_valid_range == 0 && 2109 ranges->at(0).IsEverything(max_char)) { 2110 if (cc->is_negated()) { 2111 macro_assembler->GoTo(on_failure); 2112 } else { 2113 // This is a common case hit by non-anchored expressions. 2114 if (check_offset) { 2115 macro_assembler->CheckPosition(cp_offset, on_failure); 2116 } 2117 } 2118 return; 2119 } 2120 2121 if (!preloaded) { 2122 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2123 } 2124 2125 if (cc->is_standard(zone) && 2126 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2127 on_failure)) { 2128 return; 2129 } 2130 2131 2132 // A new list with ascending entries. Each entry is a code unit 2133 // where there is a boundary between code units that are part of 2134 // the class and code units that are not. Normally we insert an 2135 // entry at zero which goes to the failure label, but if there 2136 // was already one there we fall through for success on that entry. 2137 // Subsequent entries have alternating meaning (success/failure). 2138 ZoneList<int>* range_boundaries = 2139 new(zone) ZoneList<int>(last_valid_range, zone); 2140 2141 bool zeroth_entry_is_failure = !cc->is_negated(); 2142 2143 for (int i = 0; i <= last_valid_range; i++) { 2144 CharacterRange& range = ranges->at(i); 2145 if (range.from() == 0) { 2146 DCHECK_EQ(i, 0); 2147 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2148 } else { 2149 range_boundaries->Add(range.from(), zone); 2150 } 2151 range_boundaries->Add(range.to() + 1, zone); 2152 } 2153 int end_index = range_boundaries->length() - 1; 2154 if (range_boundaries->at(end_index) > max_char) { 2155 end_index--; 2156 } 2157 2158 Label fall_through; 2159 GenerateBranches(macro_assembler, 2160 range_boundaries, 2161 0, // start_index. 2162 end_index, 2163 0, // min_char. 2164 max_char, 2165 &fall_through, 2166 zeroth_entry_is_failure ? &fall_through : on_failure, 2167 zeroth_entry_is_failure ? on_failure : &fall_through); 2168 macro_assembler->Bind(&fall_through); 2169 } 2170 2171 2172 RegExpNode::~RegExpNode() { 2173 } 2174 2175 2176 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2177 Trace* trace) { 2178 // If we are generating a greedy loop then don't stop and don't reuse code. 2179 if (trace->stop_node() != nullptr) { 2180 return CONTINUE; 2181 } 2182 2183 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2184 if (trace->is_trivial()) { 2185 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 2186 // If a generic version is already scheduled to be generated or we have 2187 // recursed too deeply then just generate a jump to that code. 2188 macro_assembler->GoTo(&label_); 2189 // This will queue it up for generation of a generic version if it hasn't 2190 // already been queued. 2191 compiler->AddWork(this); 2192 return DONE; 2193 } 2194 // Generate generic version of the node and bind the label for later use. 2195 macro_assembler->Bind(&label_); 2196 return CONTINUE; 2197 } 2198 2199 // We are being asked to make a non-generic version. Keep track of how many 2200 // non-generic versions we generate so as not to overdo it. 2201 trace_count_++; 2202 if (KeepRecursing(compiler) && compiler->optimize() && 2203 trace_count_ < kMaxCopiesCodeGenerated) { 2204 return CONTINUE; 2205 } 2206 2207 // If we get here code has been generated for this node too many times or 2208 // recursion is too deep. Time to switch to a generic version. The code for 2209 // generic versions above can handle deep recursion properly. 2210 bool was_limiting = compiler->limiting_recursion(); 2211 compiler->set_limiting_recursion(true); 2212 trace->Flush(compiler, this); 2213 compiler->set_limiting_recursion(was_limiting); 2214 return DONE; 2215 } 2216 2217 2218 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 2219 return !compiler->limiting_recursion() && 2220 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 2221 } 2222 2223 2224 int ActionNode::EatsAtLeast(int still_to_find, 2225 int budget, 2226 bool not_at_start) { 2227 if (budget <= 0) return 0; 2228 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2229 return on_success()->EatsAtLeast(still_to_find, 2230 budget - 1, 2231 not_at_start); 2232 } 2233 2234 2235 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2236 BoyerMooreLookahead* bm, bool not_at_start) { 2237 if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2238 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2239 } 2240 SaveBMInfo(bm, not_at_start, offset); 2241 } 2242 2243 2244 int AssertionNode::EatsAtLeast(int still_to_find, 2245 int budget, 2246 bool not_at_start) { 2247 if (budget <= 0) return 0; 2248 // If we know we are not at the start and we are asked "how many characters 2249 // will you match if you succeed?" then we can answer anything since false 2250 // implies false. So lets just return the max answer (still_to_find) since 2251 // that won't prevent us from preloading a lot of characters for the other 2252 // branches in the node graph. 2253 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2254 return on_success()->EatsAtLeast(still_to_find, 2255 budget - 1, 2256 not_at_start); 2257 } 2258 2259 2260 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2261 BoyerMooreLookahead* bm, bool not_at_start) { 2262 // Match the behaviour of EatsAtLeast on this node. 2263 if (assertion_type() == AT_START && not_at_start) return; 2264 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2265 SaveBMInfo(bm, not_at_start, offset); 2266 } 2267 2268 2269 int BackReferenceNode::EatsAtLeast(int still_to_find, 2270 int budget, 2271 bool not_at_start) { 2272 if (read_backward()) return 0; 2273 if (budget <= 0) return 0; 2274 return on_success()->EatsAtLeast(still_to_find, 2275 budget - 1, 2276 not_at_start); 2277 } 2278 2279 2280 int TextNode::EatsAtLeast(int still_to_find, 2281 int budget, 2282 bool not_at_start) { 2283 if (read_backward()) return 0; 2284 int answer = Length(); 2285 if (answer >= still_to_find) return answer; 2286 if (budget <= 0) return answer; 2287 // We are not at start after this node so we set the last argument to 'true'. 2288 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2289 budget - 1, 2290 true); 2291 } 2292 2293 2294 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget, 2295 bool not_at_start) { 2296 if (budget <= 0) return 0; 2297 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2298 // afterwards. 2299 RegExpNode* node = alternatives_->at(1).node(); 2300 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2301 } 2302 2303 2304 void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 2305 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 2306 bool not_at_start) { 2307 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2308 // afterwards. 2309 RegExpNode* node = alternatives_->at(1).node(); 2310 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2311 } 2312 2313 2314 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2315 int budget, 2316 RegExpNode* ignore_this_node, 2317 bool not_at_start) { 2318 if (budget <= 0) return 0; 2319 int min = 100; 2320 int choice_count = alternatives_->length(); 2321 budget = (budget - 1) / choice_count; 2322 for (int i = 0; i < choice_count; i++) { 2323 RegExpNode* node = alternatives_->at(i).node(); 2324 if (node == ignore_this_node) continue; 2325 int node_eats_at_least = 2326 node->EatsAtLeast(still_to_find, budget, not_at_start); 2327 if (node_eats_at_least < min) min = node_eats_at_least; 2328 if (min == 0) return 0; 2329 } 2330 return min; 2331 } 2332 2333 2334 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2335 int budget, 2336 bool not_at_start) { 2337 return EatsAtLeastHelper(still_to_find, 2338 budget - 1, 2339 loop_node_, 2340 not_at_start); 2341 } 2342 2343 2344 int ChoiceNode::EatsAtLeast(int still_to_find, 2345 int budget, 2346 bool not_at_start) { 2347 return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start); 2348 } 2349 2350 2351 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2352 static inline uint32_t SmearBitsRight(uint32_t v) { 2353 v |= v >> 1; 2354 v |= v >> 2; 2355 v |= v >> 4; 2356 v |= v >> 8; 2357 v |= v >> 16; 2358 return v; 2359 } 2360 2361 2362 bool QuickCheckDetails::Rationalize(bool asc) { 2363 bool found_useful_op = false; 2364 uint32_t char_mask; 2365 if (asc) { 2366 char_mask = String::kMaxOneByteCharCode; 2367 } else { 2368 char_mask = String::kMaxUtf16CodeUnit; 2369 } 2370 mask_ = 0; 2371 value_ = 0; 2372 int char_shift = 0; 2373 for (int i = 0; i < characters_; i++) { 2374 Position* pos = &positions_[i]; 2375 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2376 found_useful_op = true; 2377 } 2378 mask_ |= (pos->mask & char_mask) << char_shift; 2379 value_ |= (pos->value & char_mask) << char_shift; 2380 char_shift += asc ? 8 : 16; 2381 } 2382 return found_useful_op; 2383 } 2384 2385 2386 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2387 Trace* bounds_check_trace, 2388 Trace* trace, 2389 bool preload_has_checked_bounds, 2390 Label* on_possible_success, 2391 QuickCheckDetails* details, 2392 bool fall_through_on_failure) { 2393 if (details->characters() == 0) return false; 2394 GetQuickCheckDetails( 2395 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2396 if (details->cannot_match()) return false; 2397 if (!details->Rationalize(compiler->one_byte())) return false; 2398 DCHECK(details->characters() == 1 || 2399 compiler->macro_assembler()->CanReadUnaligned()); 2400 uint32_t mask = details->mask(); 2401 uint32_t value = details->value(); 2402 2403 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2404 2405 if (trace->characters_preloaded() != details->characters()) { 2406 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 2407 // We are attempting to preload the minimum number of characters 2408 // any choice would eat, so if the bounds check fails, then none of the 2409 // choices can succeed, so we can just immediately backtrack, rather 2410 // than go to the next choice. 2411 assembler->LoadCurrentCharacter(trace->cp_offset(), 2412 bounds_check_trace->backtrack(), 2413 !preload_has_checked_bounds, 2414 details->characters()); 2415 } 2416 2417 2418 bool need_mask = true; 2419 2420 if (details->characters() == 1) { 2421 // If number of characters preloaded is 1 then we used a byte or 16 bit 2422 // load so the value is already masked down. 2423 uint32_t char_mask; 2424 if (compiler->one_byte()) { 2425 char_mask = String::kMaxOneByteCharCode; 2426 } else { 2427 char_mask = String::kMaxUtf16CodeUnit; 2428 } 2429 if ((mask & char_mask) == char_mask) need_mask = false; 2430 mask &= char_mask; 2431 } else { 2432 // For 2-character preloads in one-byte mode or 1-character preloads in 2433 // two-byte mode we also use a 16 bit load with zero extend. 2434 static const uint32_t kTwoByteMask = 0xFFFF; 2435 static const uint32_t kFourByteMask = 0xFFFFFFFF; 2436 if (details->characters() == 2 && compiler->one_byte()) { 2437 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2438 } else if (details->characters() == 1 && !compiler->one_byte()) { 2439 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2440 } else { 2441 if (mask == kFourByteMask) need_mask = false; 2442 } 2443 } 2444 2445 if (fall_through_on_failure) { 2446 if (need_mask) { 2447 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2448 } else { 2449 assembler->CheckCharacter(value, on_possible_success); 2450 } 2451 } else { 2452 if (need_mask) { 2453 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2454 } else { 2455 assembler->CheckNotCharacter(value, trace->backtrack()); 2456 } 2457 } 2458 return true; 2459 } 2460 2461 2462 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2463 // super-class in the .h file). 2464 // 2465 // We iterate along the text object, building up for each character a 2466 // mask and value that can be used to test for a quick failure to match. 2467 // The masks and values for the positions will be combined into a single 2468 // machine word for the current character width in order to be used in 2469 // generating a quick check. 2470 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2471 RegExpCompiler* compiler, 2472 int characters_filled_in, 2473 bool not_at_start) { 2474 // Do not collect any quick check details if the text node reads backward, 2475 // since it reads in the opposite direction than we use for quick checks. 2476 if (read_backward()) return; 2477 Isolate* isolate = compiler->macro_assembler()->isolate(); 2478 DCHECK(characters_filled_in < details->characters()); 2479 int characters = details->characters(); 2480 int char_mask; 2481 if (compiler->one_byte()) { 2482 char_mask = String::kMaxOneByteCharCode; 2483 } else { 2484 char_mask = String::kMaxUtf16CodeUnit; 2485 } 2486 for (int k = 0; k < elements()->length(); k++) { 2487 TextElement elm = elements()->at(k); 2488 if (elm.text_type() == TextElement::ATOM) { 2489 Vector<const uc16> quarks = elm.atom()->data(); 2490 for (int i = 0; i < characters && i < quarks.length(); i++) { 2491 QuickCheckDetails::Position* pos = 2492 details->positions(characters_filled_in); 2493 uc16 c = quarks[i]; 2494 if (elm.atom()->ignore_case()) { 2495 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2496 int length = GetCaseIndependentLetters(isolate, c, 2497 compiler->one_byte(), chars); 2498 if (length == 0) { 2499 // This can happen because all case variants are non-Latin1, but we 2500 // know the input is Latin1. 2501 details->set_cannot_match(); 2502 pos->determines_perfectly = false; 2503 return; 2504 } 2505 if (length == 1) { 2506 // This letter has no case equivalents, so it's nice and simple 2507 // and the mask-compare will determine definitely whether we have 2508 // a match at this character position. 2509 pos->mask = char_mask; 2510 pos->value = c; 2511 pos->determines_perfectly = true; 2512 } else { 2513 uint32_t common_bits = char_mask; 2514 uint32_t bits = chars[0]; 2515 for (int j = 1; j < length; j++) { 2516 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2517 common_bits ^= differing_bits; 2518 bits &= common_bits; 2519 } 2520 // If length is 2 and common bits has only one zero in it then 2521 // our mask and compare instruction will determine definitely 2522 // whether we have a match at this character position. Otherwise 2523 // it can only be an approximate check. 2524 uint32_t one_zero = (common_bits | ~char_mask); 2525 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2526 pos->determines_perfectly = true; 2527 } 2528 pos->mask = common_bits; 2529 pos->value = bits; 2530 } 2531 } else { 2532 // Don't ignore case. Nice simple case where the mask-compare will 2533 // determine definitely whether we have a match at this character 2534 // position. 2535 if (c > char_mask) { 2536 details->set_cannot_match(); 2537 pos->determines_perfectly = false; 2538 return; 2539 } 2540 pos->mask = char_mask; 2541 pos->value = c; 2542 pos->determines_perfectly = true; 2543 } 2544 characters_filled_in++; 2545 DCHECK(characters_filled_in <= details->characters()); 2546 if (characters_filled_in == details->characters()) { 2547 return; 2548 } 2549 } 2550 } else { 2551 QuickCheckDetails::Position* pos = 2552 details->positions(characters_filled_in); 2553 RegExpCharacterClass* tree = elm.char_class(); 2554 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2555 DCHECK(!ranges->is_empty()); 2556 if (tree->is_negated()) { 2557 // A quick check uses multi-character mask and compare. There is no 2558 // useful way to incorporate a negative char class into this scheme 2559 // so we just conservatively create a mask and value that will always 2560 // succeed. 2561 pos->mask = 0; 2562 pos->value = 0; 2563 } else { 2564 int first_range = 0; 2565 while (ranges->at(first_range).from() > char_mask) { 2566 first_range++; 2567 if (first_range == ranges->length()) { 2568 details->set_cannot_match(); 2569 pos->determines_perfectly = false; 2570 return; 2571 } 2572 } 2573 CharacterRange range = ranges->at(first_range); 2574 uc16 from = range.from(); 2575 uc16 to = range.to(); 2576 if (to > char_mask) { 2577 to = char_mask; 2578 } 2579 uint32_t differing_bits = (from ^ to); 2580 // A mask and compare is only perfect if the differing bits form a 2581 // number like 00011111 with one single block of trailing 1s. 2582 if ((differing_bits & (differing_bits + 1)) == 0 && 2583 from + differing_bits == to) { 2584 pos->determines_perfectly = true; 2585 } 2586 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2587 uint32_t bits = (from & common_bits); 2588 for (int i = first_range + 1; i < ranges->length(); i++) { 2589 CharacterRange range = ranges->at(i); 2590 uc16 from = range.from(); 2591 uc16 to = range.to(); 2592 if (from > char_mask) continue; 2593 if (to > char_mask) to = char_mask; 2594 // Here we are combining more ranges into the mask and compare 2595 // value. With each new range the mask becomes more sparse and 2596 // so the chances of a false positive rise. A character class 2597 // with multiple ranges is assumed never to be equivalent to a 2598 // mask and compare operation. 2599 pos->determines_perfectly = false; 2600 uint32_t new_common_bits = (from ^ to); 2601 new_common_bits = ~SmearBitsRight(new_common_bits); 2602 common_bits &= new_common_bits; 2603 bits &= new_common_bits; 2604 uint32_t differing_bits = (from & common_bits) ^ bits; 2605 common_bits ^= differing_bits; 2606 bits &= common_bits; 2607 } 2608 pos->mask = common_bits; 2609 pos->value = bits; 2610 } 2611 characters_filled_in++; 2612 DCHECK(characters_filled_in <= details->characters()); 2613 if (characters_filled_in == details->characters()) { 2614 return; 2615 } 2616 } 2617 } 2618 DCHECK(characters_filled_in != details->characters()); 2619 if (!details->cannot_match()) { 2620 on_success()-> GetQuickCheckDetails(details, 2621 compiler, 2622 characters_filled_in, 2623 true); 2624 } 2625 } 2626 2627 2628 void QuickCheckDetails::Clear() { 2629 for (int i = 0; i < characters_; i++) { 2630 positions_[i].mask = 0; 2631 positions_[i].value = 0; 2632 positions_[i].determines_perfectly = false; 2633 } 2634 characters_ = 0; 2635 } 2636 2637 2638 void QuickCheckDetails::Advance(int by, bool one_byte) { 2639 if (by >= characters_ || by < 0) { 2640 DCHECK_IMPLIES(by < 0, characters_ == 0); 2641 Clear(); 2642 return; 2643 } 2644 DCHECK_LE(characters_ - by, 4); 2645 DCHECK_LE(characters_, 4); 2646 for (int i = 0; i < characters_ - by; i++) { 2647 positions_[i] = positions_[by + i]; 2648 } 2649 for (int i = characters_ - by; i < characters_; i++) { 2650 positions_[i].mask = 0; 2651 positions_[i].value = 0; 2652 positions_[i].determines_perfectly = false; 2653 } 2654 characters_ -= by; 2655 // We could change mask_ and value_ here but we would never advance unless 2656 // they had already been used in a check and they won't be used again because 2657 // it would gain us nothing. So there's no point. 2658 } 2659 2660 2661 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2662 DCHECK(characters_ == other->characters_); 2663 if (other->cannot_match_) { 2664 return; 2665 } 2666 if (cannot_match_) { 2667 *this = *other; 2668 return; 2669 } 2670 for (int i = from_index; i < characters_; i++) { 2671 QuickCheckDetails::Position* pos = positions(i); 2672 QuickCheckDetails::Position* other_pos = other->positions(i); 2673 if (pos->mask != other_pos->mask || 2674 pos->value != other_pos->value || 2675 !other_pos->determines_perfectly) { 2676 // Our mask-compare operation will be approximate unless we have the 2677 // exact same operation on both sides of the alternation. 2678 pos->determines_perfectly = false; 2679 } 2680 pos->mask &= other_pos->mask; 2681 pos->value &= pos->mask; 2682 other_pos->value &= pos->mask; 2683 uc16 differing_bits = (pos->value ^ other_pos->value); 2684 pos->mask &= ~differing_bits; 2685 pos->value &= pos->mask; 2686 } 2687 } 2688 2689 2690 class VisitMarker { 2691 public: 2692 explicit VisitMarker(NodeInfo* info) : info_(info) { 2693 DCHECK(!info->visited); 2694 info->visited = true; 2695 } 2696 ~VisitMarker() { 2697 info_->visited = false; 2698 } 2699 private: 2700 NodeInfo* info_; 2701 }; 2702 2703 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) { 2704 if (info()->replacement_calculated) return replacement(); 2705 if (depth < 0) return this; 2706 DCHECK(!info()->visited); 2707 VisitMarker marker(info()); 2708 return FilterSuccessor(depth - 1); 2709 } 2710 2711 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { 2712 RegExpNode* next = on_success_->FilterOneByte(depth - 1); 2713 if (next == nullptr) return set_replacement(nullptr); 2714 on_success_ = next; 2715 return set_replacement(this); 2716 } 2717 2718 // We need to check for the following characters: 0x39C 0x3BC 0x178. 2719 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2720 // TODO(dcarney): this could be a lot more efficient. 2721 return range.Contains(0x039C) || range.Contains(0x03BC) || 2722 range.Contains(0x0178); 2723 } 2724 2725 2726 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2727 for (int i = 0; i < ranges->length(); i++) { 2728 // TODO(dcarney): this could be a lot more efficient. 2729 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2730 } 2731 return false; 2732 } 2733 2734 RegExpNode* TextNode::FilterOneByte(int depth) { 2735 if (info()->replacement_calculated) return replacement(); 2736 if (depth < 0) return this; 2737 DCHECK(!info()->visited); 2738 VisitMarker marker(info()); 2739 int element_count = elements()->length(); 2740 for (int i = 0; i < element_count; i++) { 2741 TextElement elm = elements()->at(i); 2742 if (elm.text_type() == TextElement::ATOM) { 2743 Vector<const uc16> quarks = elm.atom()->data(); 2744 for (int j = 0; j < quarks.length(); j++) { 2745 uint16_t c = quarks[j]; 2746 if (elm.atom()->ignore_case()) { 2747 c = unibrow::Latin1::TryConvertToLatin1(c); 2748 } 2749 if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); 2750 // Replace quark in case we converted to Latin-1. 2751 uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start()); 2752 writable_quarks[j] = c; 2753 } 2754 } else { 2755 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2756 RegExpCharacterClass* cc = elm.char_class(); 2757 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2758 CharacterRange::Canonicalize(ranges); 2759 // Now they are in order so we only need to look at the first. 2760 int range_count = ranges->length(); 2761 if (cc->is_negated()) { 2762 if (range_count != 0 && 2763 ranges->at(0).from() == 0 && 2764 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2765 // This will be handled in a later filter. 2766 if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) 2767 continue; 2768 return set_replacement(nullptr); 2769 } 2770 } else { 2771 if (range_count == 0 || 2772 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2773 // This will be handled in a later filter. 2774 if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) 2775 continue; 2776 return set_replacement(nullptr); 2777 } 2778 } 2779 } 2780 } 2781 return FilterSuccessor(depth - 1); 2782 } 2783 2784 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) { 2785 if (info()->replacement_calculated) return replacement(); 2786 if (depth < 0) return this; 2787 if (info()->visited) return this; 2788 { 2789 VisitMarker marker(info()); 2790 2791 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); 2792 // If we can't continue after the loop then there is no sense in doing the 2793 // loop. 2794 if (continue_replacement == nullptr) return set_replacement(nullptr); 2795 } 2796 2797 return ChoiceNode::FilterOneByte(depth - 1); 2798 } 2799 2800 RegExpNode* ChoiceNode::FilterOneByte(int depth) { 2801 if (info()->replacement_calculated) return replacement(); 2802 if (depth < 0) return this; 2803 if (info()->visited) return this; 2804 VisitMarker marker(info()); 2805 int choice_count = alternatives_->length(); 2806 2807 for (int i = 0; i < choice_count; i++) { 2808 GuardedAlternative alternative = alternatives_->at(i); 2809 if (alternative.guards() != nullptr && 2810 alternative.guards()->length() != 0) { 2811 set_replacement(this); 2812 return this; 2813 } 2814 } 2815 2816 int surviving = 0; 2817 RegExpNode* survivor = nullptr; 2818 for (int i = 0; i < choice_count; i++) { 2819 GuardedAlternative alternative = alternatives_->at(i); 2820 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); 2821 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2822 if (replacement != nullptr) { 2823 alternatives_->at(i).set_node(replacement); 2824 surviving++; 2825 survivor = replacement; 2826 } 2827 } 2828 if (surviving < 2) return set_replacement(survivor); 2829 2830 set_replacement(this); 2831 if (surviving == choice_count) { 2832 return this; 2833 } 2834 // Only some of the nodes survived the filtering. We need to rebuild the 2835 // alternatives list. 2836 ZoneList<GuardedAlternative>* new_alternatives = 2837 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2838 for (int i = 0; i < choice_count; i++) { 2839 RegExpNode* replacement = 2840 alternatives_->at(i).node()->FilterOneByte(depth - 1); 2841 if (replacement != nullptr) { 2842 alternatives_->at(i).set_node(replacement); 2843 new_alternatives->Add(alternatives_->at(i), zone()); 2844 } 2845 } 2846 alternatives_ = new_alternatives; 2847 return this; 2848 } 2849 2850 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { 2851 if (info()->replacement_calculated) return replacement(); 2852 if (depth < 0) return this; 2853 if (info()->visited) return this; 2854 VisitMarker marker(info()); 2855 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2856 // afterwards. 2857 RegExpNode* node = alternatives_->at(1).node(); 2858 RegExpNode* replacement = node->FilterOneByte(depth - 1); 2859 if (replacement == nullptr) return set_replacement(nullptr); 2860 alternatives_->at(1).set_node(replacement); 2861 2862 RegExpNode* neg_node = alternatives_->at(0).node(); 2863 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); 2864 // If the negative lookahead is always going to fail then 2865 // we don't need to check it. 2866 if (neg_replacement == nullptr) return set_replacement(replacement); 2867 alternatives_->at(0).set_node(neg_replacement); 2868 return set_replacement(this); 2869 } 2870 2871 2872 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2873 RegExpCompiler* compiler, 2874 int characters_filled_in, 2875 bool not_at_start) { 2876 if (body_can_be_zero_length_ || info()->visited) return; 2877 VisitMarker marker(info()); 2878 return ChoiceNode::GetQuickCheckDetails(details, 2879 compiler, 2880 characters_filled_in, 2881 not_at_start); 2882 } 2883 2884 2885 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2886 BoyerMooreLookahead* bm, bool not_at_start) { 2887 if (body_can_be_zero_length_ || budget <= 0) { 2888 bm->SetRest(offset); 2889 SaveBMInfo(bm, not_at_start, offset); 2890 return; 2891 } 2892 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2893 SaveBMInfo(bm, not_at_start, offset); 2894 } 2895 2896 2897 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2898 RegExpCompiler* compiler, 2899 int characters_filled_in, 2900 bool not_at_start) { 2901 not_at_start = (not_at_start || not_at_start_); 2902 int choice_count = alternatives_->length(); 2903 DCHECK_LT(0, choice_count); 2904 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2905 compiler, 2906 characters_filled_in, 2907 not_at_start); 2908 for (int i = 1; i < choice_count; i++) { 2909 QuickCheckDetails new_details(details->characters()); 2910 RegExpNode* node = alternatives_->at(i).node(); 2911 node->GetQuickCheckDetails(&new_details, compiler, 2912 characters_filled_in, 2913 not_at_start); 2914 // Here we merge the quick match details of the two branches. 2915 details->Merge(&new_details, characters_filled_in); 2916 } 2917 } 2918 2919 2920 // Check for [0-9A-Z_a-z]. 2921 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2922 Label* word, 2923 Label* non_word, 2924 bool fall_through_on_word) { 2925 if (assembler->CheckSpecialCharacterClass( 2926 fall_through_on_word ? 'w' : 'W', 2927 fall_through_on_word ? non_word : word)) { 2928 // Optimized implementation available. 2929 return; 2930 } 2931 assembler->CheckCharacterGT('z', non_word); 2932 assembler->CheckCharacterLT('0', non_word); 2933 assembler->CheckCharacterGT('a' - 1, word); 2934 assembler->CheckCharacterLT('9' + 1, word); 2935 assembler->CheckCharacterLT('A', non_word); 2936 assembler->CheckCharacterLT('Z' + 1, word); 2937 if (fall_through_on_word) { 2938 assembler->CheckNotCharacter('_', non_word); 2939 } else { 2940 assembler->CheckCharacter('_', word); 2941 } 2942 } 2943 2944 2945 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2946 // that matches newline or the start of input). 2947 static void EmitHat(RegExpCompiler* compiler, 2948 RegExpNode* on_success, 2949 Trace* trace) { 2950 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2951 // We will be loading the previous character into the current character 2952 // register. 2953 Trace new_trace(*trace); 2954 new_trace.InvalidateCurrentCharacter(); 2955 2956 Label ok; 2957 if (new_trace.cp_offset() == 0) { 2958 // The start of input counts as a newline in this context, so skip to 2959 // ok if we are at the start. 2960 assembler->CheckAtStart(&ok); 2961 } 2962 // We already checked that we are not at the start of input so it must be 2963 // OK to load the previous character. 2964 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 2965 new_trace.backtrack(), 2966 false); 2967 if (!assembler->CheckSpecialCharacterClass('n', 2968 new_trace.backtrack())) { 2969 // Newline means \n, \r, 0x2028 or 0x2029. 2970 if (!compiler->one_byte()) { 2971 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok); 2972 } 2973 assembler->CheckCharacter('\n', &ok); 2974 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 2975 } 2976 assembler->Bind(&ok); 2977 on_success->Emit(compiler, &new_trace); 2978 } 2979 2980 2981 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 2982 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 2983 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2984 Isolate* isolate = assembler->isolate(); 2985 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 2986 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 2987 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 2988 if (lookahead == nullptr) { 2989 int eats_at_least = 2990 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 2991 kRecursionBudget, 2992 not_at_start)); 2993 if (eats_at_least >= 1) { 2994 BoyerMooreLookahead* bm = 2995 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 2996 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 2997 if (bm->at(0)->is_non_word()) 2998 next_is_word_character = Trace::FALSE_VALUE; 2999 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3000 } 3001 } else { 3002 if (lookahead->at(0)->is_non_word()) 3003 next_is_word_character = Trace::FALSE_VALUE; 3004 if (lookahead->at(0)->is_word()) 3005 next_is_word_character = Trace::TRUE_VALUE; 3006 } 3007 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3008 if (next_is_word_character == Trace::UNKNOWN) { 3009 Label before_non_word; 3010 Label before_word; 3011 if (trace->characters_preloaded() != 1) { 3012 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3013 } 3014 // Fall through on non-word. 3015 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3016 // Next character is not a word character. 3017 assembler->Bind(&before_non_word); 3018 Label ok; 3019 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3020 assembler->GoTo(&ok); 3021 3022 assembler->Bind(&before_word); 3023 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3024 assembler->Bind(&ok); 3025 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3026 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3027 } else { 3028 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 3029 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3030 } 3031 } 3032 3033 3034 void AssertionNode::BacktrackIfPrevious( 3035 RegExpCompiler* compiler, 3036 Trace* trace, 3037 AssertionNode::IfPrevious backtrack_if_previous) { 3038 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3039 Trace new_trace(*trace); 3040 new_trace.InvalidateCurrentCharacter(); 3041 3042 Label fall_through, dummy; 3043 3044 Label* non_word = backtrack_if_previous == kIsNonWord ? 3045 new_trace.backtrack() : 3046 &fall_through; 3047 Label* word = backtrack_if_previous == kIsNonWord ? 3048 &fall_through : 3049 new_trace.backtrack(); 3050 3051 if (new_trace.cp_offset() == 0) { 3052 // The start of input counts as a non-word character, so the question is 3053 // decided if we are at the start. 3054 assembler->CheckAtStart(non_word); 3055 } 3056 // We already checked that we are not at the start of input so it must be 3057 // OK to load the previous character. 3058 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3059 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3060 3061 assembler->Bind(&fall_through); 3062 on_success()->Emit(compiler, &new_trace); 3063 } 3064 3065 3066 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3067 RegExpCompiler* compiler, 3068 int filled_in, 3069 bool not_at_start) { 3070 if (assertion_type_ == AT_START && not_at_start) { 3071 details->set_cannot_match(); 3072 return; 3073 } 3074 return on_success()->GetQuickCheckDetails(details, 3075 compiler, 3076 filled_in, 3077 not_at_start); 3078 } 3079 3080 3081 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3082 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3083 switch (assertion_type_) { 3084 case AT_END: { 3085 Label ok; 3086 assembler->CheckPosition(trace->cp_offset(), &ok); 3087 assembler->GoTo(trace->backtrack()); 3088 assembler->Bind(&ok); 3089 break; 3090 } 3091 case AT_START: { 3092 if (trace->at_start() == Trace::FALSE_VALUE) { 3093 assembler->GoTo(trace->backtrack()); 3094 return; 3095 } 3096 if (trace->at_start() == Trace::UNKNOWN) { 3097 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 3098 Trace at_start_trace = *trace; 3099 at_start_trace.set_at_start(Trace::TRUE_VALUE); 3100 on_success()->Emit(compiler, &at_start_trace); 3101 return; 3102 } 3103 } 3104 break; 3105 case AFTER_NEWLINE: 3106 EmitHat(compiler, on_success(), trace); 3107 return; 3108 case AT_BOUNDARY: 3109 case AT_NON_BOUNDARY: { 3110 EmitBoundaryCheck(compiler, trace); 3111 return; 3112 } 3113 } 3114 on_success()->Emit(compiler, trace); 3115 } 3116 3117 3118 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3119 if (quick_check == nullptr) return false; 3120 if (offset >= quick_check->characters()) return false; 3121 return quick_check->positions(offset)->determines_perfectly; 3122 } 3123 3124 3125 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3126 if (index > *checked_up_to) { 3127 *checked_up_to = index; 3128 } 3129 } 3130 3131 3132 // We call this repeatedly to generate code for each pass over the text node. 3133 // The passes are in increasing order of difficulty because we hope one 3134 // of the first passes will fail in which case we are saved the work of the 3135 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3136 // we will check the '%' in the first pass, the case independent 'a' in the 3137 // second pass and the character class in the last pass. 3138 // 3139 // The passes are done from right to left, so for example to test for /bar/ 3140 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3141 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3142 // bounds check most of the time. In the example we only need to check for 3143 // end-of-input when loading the putative 'r'. 3144 // 3145 // A slight complication involves the fact that the first character may already 3146 // be fetched into a register by the previous node. In this case we want to 3147 // do the test for that character first. We do this in separate passes. The 3148 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3149 // pass has been performed then subsequent passes will have true in 3150 // first_element_checked to indicate that that character does not need to be 3151 // checked again. 3152 // 3153 // In addition to all this we are passed a Trace, which can 3154 // contain an AlternativeGeneration object. In this AlternativeGeneration 3155 // object we can see details of any quick check that was already passed in 3156 // order to get to the code we are now generating. The quick check can involve 3157 // loading characters, which means we do not need to recheck the bounds 3158 // up to the limit the quick check already checked. In addition the quick 3159 // check can have involved a mask and compare operation which may simplify 3160 // or obviate the need for further checks at some character positions. 3161 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3162 TextEmitPassType pass, 3163 bool preloaded, 3164 Trace* trace, 3165 bool first_element_checked, 3166 int* checked_up_to) { 3167 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3168 Isolate* isolate = assembler->isolate(); 3169 bool one_byte = compiler->one_byte(); 3170 Label* backtrack = trace->backtrack(); 3171 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3172 int element_count = elements()->length(); 3173 int backward_offset = read_backward() ? -Length() : 0; 3174 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3175 TextElement elm = elements()->at(i); 3176 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3177 if (elm.text_type() == TextElement::ATOM) { 3178 if (SkipPass(pass, elm.atom()->ignore_case())) continue; 3179 Vector<const uc16> quarks = elm.atom()->data(); 3180 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3181 if (first_element_checked && i == 0 && j == 0) continue; 3182 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3183 EmitCharacterFunction* emit_function = nullptr; 3184 uc16 quark = quarks[j]; 3185 if (elm.atom()->ignore_case()) { 3186 // Everywhere else we assume that a non-Latin-1 character cannot match 3187 // a Latin-1 character. Avoid the cases where this is assumption is 3188 // invalid by using the Latin1 equivalent instead. 3189 quark = unibrow::Latin1::TryConvertToLatin1(quark); 3190 } 3191 switch (pass) { 3192 case NON_LATIN1_MATCH: 3193 DCHECK(one_byte); 3194 if (quark > String::kMaxOneByteCharCode) { 3195 assembler->GoTo(backtrack); 3196 return; 3197 } 3198 break; 3199 case NON_LETTER_CHARACTER_MATCH: 3200 emit_function = &EmitAtomNonLetter; 3201 break; 3202 case SIMPLE_CHARACTER_MATCH: 3203 emit_function = &EmitSimpleCharacter; 3204 break; 3205 case CASE_CHARACTER_MATCH: 3206 emit_function = &EmitAtomLetter; 3207 break; 3208 default: 3209 break; 3210 } 3211 if (emit_function != nullptr) { 3212 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3213 bool bound_checked = 3214 emit_function(isolate, compiler, quark, backtrack, cp_offset + j, 3215 bounds_check, preloaded); 3216 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3217 } 3218 } 3219 } else { 3220 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3221 if (pass == CHARACTER_CLASS_MATCH) { 3222 if (first_element_checked && i == 0) continue; 3223 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3224 RegExpCharacterClass* cc = elm.char_class(); 3225 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3226 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 3227 bounds_check, preloaded, zone()); 3228 UpdateBoundsCheck(cp_offset, checked_up_to); 3229 } 3230 } 3231 } 3232 } 3233 3234 3235 int TextNode::Length() { 3236 TextElement elm = elements()->last(); 3237 DCHECK_LE(0, elm.cp_offset()); 3238 return elm.cp_offset() + elm.length(); 3239 } 3240 3241 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { 3242 if (ignore_case) { 3243 return pass == SIMPLE_CHARACTER_MATCH; 3244 } else { 3245 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3246 } 3247 } 3248 3249 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 3250 ZoneList<CharacterRange>* ranges, 3251 bool read_backward, 3252 RegExpNode* on_success, 3253 JSRegExp::Flags flags) { 3254 DCHECK_NOT_NULL(ranges); 3255 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); 3256 elms->Add(TextElement::CharClass( 3257 new (zone) RegExpCharacterClass(zone, ranges, flags)), 3258 zone); 3259 return new (zone) TextNode(elms, read_backward, on_success); 3260 } 3261 3262 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, 3263 CharacterRange trail, 3264 bool read_backward, 3265 RegExpNode* on_success, 3266 JSRegExp::Flags flags) { 3267 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 3268 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 3269 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); 3270 elms->Add(TextElement::CharClass( 3271 new (zone) RegExpCharacterClass(zone, lead_ranges, flags)), 3272 zone); 3273 elms->Add(TextElement::CharClass( 3274 new (zone) RegExpCharacterClass(zone, trail_ranges, flags)), 3275 zone); 3276 return new (zone) TextNode(elms, read_backward, on_success); 3277 } 3278 3279 3280 // This generates the code to match a text node. A text node can contain 3281 // straight character sequences (possibly to be matched in a case-independent 3282 // way) and character classes. For efficiency we do not do this in a single 3283 // pass from left to right. Instead we pass over the text node several times, 3284 // emitting code for some character positions every time. See the comment on 3285 // TextEmitPass for details. 3286 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3287 LimitResult limit_result = LimitVersions(compiler, trace); 3288 if (limit_result == DONE) return; 3289 DCHECK(limit_result == CONTINUE); 3290 3291 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3292 compiler->SetRegExpTooBig(); 3293 return; 3294 } 3295 3296 if (compiler->one_byte()) { 3297 int dummy = 0; 3298 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3299 } 3300 3301 bool first_elt_done = false; 3302 int bound_checked_to = trace->cp_offset() - 1; 3303 bound_checked_to += trace->bound_checked_up_to(); 3304 3305 // If a character is preloaded into the current character register then 3306 // check that now. 3307 if (trace->characters_preloaded() == 1) { 3308 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3309 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace, 3310 false, &bound_checked_to); 3311 } 3312 first_elt_done = true; 3313 } 3314 3315 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3316 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace, 3317 first_elt_done, &bound_checked_to); 3318 } 3319 3320 Trace successor_trace(*trace); 3321 // If we advance backward, we may end up at the start. 3322 successor_trace.AdvanceCurrentPositionInTrace( 3323 read_backward() ? -Length() : Length(), compiler); 3324 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 3325 : Trace::FALSE_VALUE); 3326 RecursionCheck rc(compiler); 3327 on_success()->Emit(compiler, &successor_trace); 3328 } 3329 3330 3331 void Trace::InvalidateCurrentCharacter() { 3332 characters_preloaded_ = 0; 3333 } 3334 3335 3336 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3337 // We don't have an instruction for shifting the current character register 3338 // down or for using a shifted value for anything so lets just forget that 3339 // we preloaded any characters into it. 3340 characters_preloaded_ = 0; 3341 // Adjust the offsets of the quick check performed information. This 3342 // information is used to find out what we already determined about the 3343 // characters by means of mask and compare. 3344 quick_check_performed_.Advance(by, compiler->one_byte()); 3345 cp_offset_ += by; 3346 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3347 compiler->SetRegExpTooBig(); 3348 cp_offset_ = 0; 3349 } 3350 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3351 } 3352 3353 3354 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { 3355 int element_count = elements()->length(); 3356 for (int i = 0; i < element_count; i++) { 3357 TextElement elm = elements()->at(i); 3358 if (elm.text_type() == TextElement::CHAR_CLASS) { 3359 RegExpCharacterClass* cc = elm.char_class(); 3360 #ifdef V8_INTL_SUPPORT 3361 bool case_equivalents_already_added = 3362 NeedsUnicodeCaseEquivalents(cc->flags()); 3363 #else 3364 bool case_equivalents_already_added = false; 3365 #endif 3366 if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) { 3367 // None of the standard character classes is different in the case 3368 // independent case and it slows us down if we don't know that. 3369 if (cc->is_standard(zone())) continue; 3370 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3371 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, 3372 is_one_byte); 3373 } 3374 } 3375 } 3376 } 3377 3378 3379 int TextNode::GreedyLoopTextLength() { return Length(); } 3380 3381 3382 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3383 RegExpCompiler* compiler) { 3384 if (read_backward()) return nullptr; 3385 if (elements()->length() != 1) return nullptr; 3386 TextElement elm = elements()->at(0); 3387 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr; 3388 RegExpCharacterClass* node = elm.char_class(); 3389 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3390 CharacterRange::Canonicalize(ranges); 3391 if (node->is_negated()) { 3392 return ranges->length() == 0 ? on_success() : nullptr; 3393 } 3394 if (ranges->length() != 1) return nullptr; 3395 uint32_t max_char; 3396 if (compiler->one_byte()) { 3397 max_char = String::kMaxOneByteCharCode; 3398 } else { 3399 max_char = String::kMaxUtf16CodeUnit; 3400 } 3401 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; 3402 } 3403 3404 3405 // Finds the fixed match length of a sequence of nodes that goes from 3406 // this alternative and back to this choice node. If there are variable 3407 // length nodes or other complications in the way then return a sentinel 3408 // value indicating that a greedy loop cannot be constructed. 3409 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3410 GuardedAlternative* alternative) { 3411 int length = 0; 3412 RegExpNode* node = alternative->node(); 3413 // Later we will generate code for all these text nodes using recursion 3414 // so we have to limit the max number. 3415 int recursion_depth = 0; 3416 while (node != this) { 3417 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3418 return kNodeIsTooComplexForGreedyLoops; 3419 } 3420 int node_length = node->GreedyLoopTextLength(); 3421 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3422 return kNodeIsTooComplexForGreedyLoops; 3423 } 3424 length += node_length; 3425 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3426 node = seq_node->on_success(); 3427 } 3428 return read_backward() ? -length : length; 3429 } 3430 3431 3432 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3433 DCHECK_NULL(loop_node_); 3434 AddAlternative(alt); 3435 loop_node_ = alt.node(); 3436 } 3437 3438 3439 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3440 DCHECK_NULL(continue_node_); 3441 AddAlternative(alt); 3442 continue_node_ = alt.node(); 3443 } 3444 3445 3446 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3447 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3448 if (trace->stop_node() == this) { 3449 // Back edge of greedy optimized loop node graph. 3450 int text_length = 3451 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3452 DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length); 3453 // Update the counter-based backtracking info on the stack. This is an 3454 // optimization for greedy loops (see below). 3455 DCHECK(trace->cp_offset() == text_length); 3456 macro_assembler->AdvanceCurrentPosition(text_length); 3457 macro_assembler->GoTo(trace->loop_label()); 3458 return; 3459 } 3460 DCHECK_NULL(trace->stop_node()); 3461 if (!trace->is_trivial()) { 3462 trace->Flush(compiler, this); 3463 return; 3464 } 3465 ChoiceNode::Emit(compiler, trace); 3466 } 3467 3468 3469 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3470 int eats_at_least) { 3471 int preload_characters = Min(4, eats_at_least); 3472 DCHECK_LE(preload_characters, 4); 3473 if (compiler->macro_assembler()->CanReadUnaligned()) { 3474 bool one_byte = compiler->one_byte(); 3475 if (one_byte) { 3476 // We can't preload 3 characters because there is no machine instruction 3477 // to do that. We can't just load 4 because we could be reading 3478 // beyond the end of the string, which could cause a memory fault. 3479 if (preload_characters == 3) preload_characters = 2; 3480 } else { 3481 if (preload_characters > 2) preload_characters = 2; 3482 } 3483 } else { 3484 if (preload_characters > 1) preload_characters = 1; 3485 } 3486 return preload_characters; 3487 } 3488 3489 3490 // This class is used when generating the alternatives in a choice node. It 3491 // records the way the alternative is being code generated. 3492 class AlternativeGeneration: public Malloced { 3493 public: 3494 AlternativeGeneration() 3495 : possible_success(), 3496 expects_preload(false), 3497 after(), 3498 quick_check_details() { } 3499 Label possible_success; 3500 bool expects_preload; 3501 Label after; 3502 QuickCheckDetails quick_check_details; 3503 }; 3504 3505 3506 // Creates a list of AlternativeGenerations. If the list has a reasonable 3507 // size then it is on the stack, otherwise the excess is on the heap. 3508 class AlternativeGenerationList { 3509 public: 3510 AlternativeGenerationList(int count, Zone* zone) 3511 : alt_gens_(count, zone) { 3512 for (int i = 0; i < count && i < kAFew; i++) { 3513 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3514 } 3515 for (int i = kAFew; i < count; i++) { 3516 alt_gens_.Add(new AlternativeGeneration(), zone); 3517 } 3518 } 3519 ~AlternativeGenerationList() { 3520 for (int i = kAFew; i < alt_gens_.length(); i++) { 3521 delete alt_gens_[i]; 3522 alt_gens_[i] = nullptr; 3523 } 3524 } 3525 3526 AlternativeGeneration* at(int i) { 3527 return alt_gens_[i]; 3528 } 3529 3530 private: 3531 static const int kAFew = 10; 3532 ZoneList<AlternativeGeneration*> alt_gens_; 3533 AlternativeGeneration a_few_alt_gens_[kAFew]; 3534 }; 3535 3536 3537 static const uc32 kRangeEndMarker = 0x110000; 3538 3539 // The '2' variant is has inclusive from and exclusive to. 3540 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3541 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3542 static const int kSpaceRanges[] = { 3543 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 3544 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 3545 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; 3546 static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3547 3548 static const int kWordRanges[] = { 3549 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; 3550 static const int kWordRangeCount = arraysize(kWordRanges); 3551 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; 3552 static const int kDigitRangeCount = arraysize(kDigitRanges); 3553 static const int kSurrogateRanges[] = { 3554 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker}; 3555 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3556 static const int kLineTerminatorRanges[] = { 3557 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; 3558 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3559 3560 void BoyerMoorePositionInfo::Set(int character) { 3561 SetInterval(Interval(character, character)); 3562 } 3563 3564 3565 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3566 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3567 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3568 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3569 surrogate_ = 3570 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3571 if (interval.to() - interval.from() >= kMapSize - 1) { 3572 if (map_count_ != kMapSize) { 3573 map_count_ = kMapSize; 3574 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3575 } 3576 return; 3577 } 3578 for (int i = interval.from(); i <= interval.to(); i++) { 3579 int mod_character = (i & kMask); 3580 if (!map_->at(mod_character)) { 3581 map_count_++; 3582 map_->at(mod_character) = true; 3583 } 3584 if (map_count_ == kMapSize) return; 3585 } 3586 } 3587 3588 3589 void BoyerMoorePositionInfo::SetAll() { 3590 s_ = w_ = d_ = kLatticeUnknown; 3591 if (map_count_ != kMapSize) { 3592 map_count_ = kMapSize; 3593 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3594 } 3595 } 3596 3597 3598 BoyerMooreLookahead::BoyerMooreLookahead( 3599 int length, RegExpCompiler* compiler, Zone* zone) 3600 : length_(length), 3601 compiler_(compiler) { 3602 if (compiler->one_byte()) { 3603 max_char_ = String::kMaxOneByteCharCode; 3604 } else { 3605 max_char_ = String::kMaxUtf16CodeUnit; 3606 } 3607 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3608 for (int i = 0; i < length; i++) { 3609 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3610 } 3611 } 3612 3613 3614 // Find the longest range of lookahead that has the fewest number of different 3615 // characters that can occur at a given position. Since we are optimizing two 3616 // different parameters at once this is a tradeoff. 3617 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3618 int biggest_points = 0; 3619 // If more than 32 characters out of 128 can occur it is unlikely that we can 3620 // be lucky enough to step forwards much of the time. 3621 const int kMaxMax = 32; 3622 for (int max_number_of_chars = 4; 3623 max_number_of_chars < kMaxMax; 3624 max_number_of_chars *= 2) { 3625 biggest_points = 3626 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3627 } 3628 if (biggest_points == 0) return false; 3629 return true; 3630 } 3631 3632 3633 // Find the highest-points range between 0 and length_ where the character 3634 // information is not too vague. 'Too vague' means that there are more than 3635 // max_number_of_chars that can occur at this position. Calculates the number 3636 // of points as the product of width-of-the-range and 3637 // probability-of-finding-one-of-the-characters, where the probability is 3638 // calculated using the frequency distribution of the sample subject string. 3639 int BoyerMooreLookahead::FindBestInterval( 3640 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3641 int biggest_points = old_biggest_points; 3642 static const int kSize = RegExpMacroAssembler::kTableSize; 3643 for (int i = 0; i < length_; ) { 3644 while (i < length_ && Count(i) > max_number_of_chars) i++; 3645 if (i == length_) break; 3646 int remembered_from = i; 3647 bool union_map[kSize]; 3648 for (int j = 0; j < kSize; j++) union_map[j] = false; 3649 while (i < length_ && Count(i) <= max_number_of_chars) { 3650 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3651 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3652 i++; 3653 } 3654 int frequency = 0; 3655 for (int j = 0; j < kSize; j++) { 3656 if (union_map[j]) { 3657 // Add 1 to the frequency to give a small per-character boost for 3658 // the cases where our sampling is not good enough and many 3659 // characters have a frequency of zero. This means the frequency 3660 // can theoretically be up to 2*kSize though we treat it mostly as 3661 // a fraction of kSize. 3662 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3663 } 3664 } 3665 // We use the probability of skipping times the distance we are skipping to 3666 // judge the effectiveness of this. Actually we have a cut-off: By 3667 // dividing by 2 we switch off the skipping if the probability of skipping 3668 // is less than 50%. This is because the multibyte mask-and-compare 3669 // skipping in quickcheck is more likely to do well on this case. 3670 bool in_quickcheck_range = 3671 ((i - remembered_from < 4) || 3672 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 3673 // Called 'probability' but it is only a rough estimate and can actually 3674 // be outside the 0-kSize range. 3675 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3676 int points = (i - remembered_from) * probability; 3677 if (points > biggest_points) { 3678 *from = remembered_from; 3679 *to = i - 1; 3680 biggest_points = points; 3681 } 3682 } 3683 return biggest_points; 3684 } 3685 3686 3687 // Take all the characters that will not prevent a successful match if they 3688 // occur in the subject string in the range between min_lookahead and 3689 // max_lookahead (inclusive) measured from the current position. If the 3690 // character at max_lookahead offset is not one of these characters, then we 3691 // can safely skip forwards by the number of characters in the range. 3692 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3693 int max_lookahead, 3694 Handle<ByteArray> boolean_skip_table) { 3695 const int kSize = RegExpMacroAssembler::kTableSize; 3696 3697 const int kSkipArrayEntry = 0; 3698 const int kDontSkipArrayEntry = 1; 3699 3700 for (int i = 0; i < kSize; i++) { 3701 boolean_skip_table->set(i, kSkipArrayEntry); 3702 } 3703 int skip = max_lookahead + 1 - min_lookahead; 3704 3705 for (int i = max_lookahead; i >= min_lookahead; i--) { 3706 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3707 for (int j = 0; j < kSize; j++) { 3708 if (map->at(j)) { 3709 boolean_skip_table->set(j, kDontSkipArrayEntry); 3710 } 3711 } 3712 } 3713 3714 return skip; 3715 } 3716 3717 3718 // See comment above on the implementation of GetSkipTable. 3719 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3720 const int kSize = RegExpMacroAssembler::kTableSize; 3721 3722 int min_lookahead = 0; 3723 int max_lookahead = 0; 3724 3725 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3726 3727 bool found_single_character = false; 3728 int single_character = 0; 3729 for (int i = max_lookahead; i >= min_lookahead; i--) { 3730 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3731 if (map->map_count() > 1 || 3732 (found_single_character && map->map_count() != 0)) { 3733 found_single_character = false; 3734 break; 3735 } 3736 for (int j = 0; j < kSize; j++) { 3737 if (map->at(j)) { 3738 found_single_character = true; 3739 single_character = j; 3740 break; 3741 } 3742 } 3743 } 3744 3745 int lookahead_width = max_lookahead + 1 - min_lookahead; 3746 3747 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3748 // The mask-compare can probably handle this better. 3749 return; 3750 } 3751 3752 if (found_single_character) { 3753 Label cont, again; 3754 masm->Bind(&again); 3755 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3756 if (max_char_ > kSize) { 3757 masm->CheckCharacterAfterAnd(single_character, 3758 RegExpMacroAssembler::kTableMask, 3759 &cont); 3760 } else { 3761 masm->CheckCharacter(single_character, &cont); 3762 } 3763 masm->AdvanceCurrentPosition(lookahead_width); 3764 masm->GoTo(&again); 3765 masm->Bind(&cont); 3766 return; 3767 } 3768 3769 Factory* factory = masm->isolate()->factory(); 3770 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3771 int skip_distance = GetSkipTable( 3772 min_lookahead, max_lookahead, boolean_skip_table); 3773 DCHECK_NE(0, skip_distance); 3774 3775 Label cont, again; 3776 masm->Bind(&again); 3777 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3778 masm->CheckBitInTable(boolean_skip_table, &cont); 3779 masm->AdvanceCurrentPosition(skip_distance); 3780 masm->GoTo(&again); 3781 masm->Bind(&cont); 3782 } 3783 3784 3785 /* Code generation for choice nodes. 3786 * 3787 * We generate quick checks that do a mask and compare to eliminate a 3788 * choice. If the quick check succeeds then it jumps to the continuation to 3789 * do slow checks and check subsequent nodes. If it fails (the common case) 3790 * it falls through to the next choice. 3791 * 3792 * Here is the desired flow graph. Nodes directly below each other imply 3793 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3794 * 3 doesn't have a quick check so we have to call the slow check. 3795 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3796 * regexp continuation is generated directly after the Sn node, up to the 3797 * next GoTo if we decide to reuse some already generated code. Some 3798 * nodes expect preload_characters to be preloaded into the current 3799 * character register. R nodes do this preloading. Vertices are marked 3800 * F for failures and S for success (possible success in the case of quick 3801 * nodes). L, V, < and > are used as arrow heads. 3802 * 3803 * ----------> R 3804 * | 3805 * V 3806 * Q1 -----> S1 3807 * | S / 3808 * F| / 3809 * | F/ 3810 * | / 3811 * | R 3812 * | / 3813 * V L 3814 * Q2 -----> S2 3815 * | S / 3816 * F| / 3817 * | F/ 3818 * | / 3819 * | R 3820 * | / 3821 * V L 3822 * S3 3823 * | 3824 * F| 3825 * | 3826 * R 3827 * | 3828 * backtrack V 3829 * <----------Q4 3830 * \ F | 3831 * \ |S 3832 * \ F V 3833 * \-----S4 3834 * 3835 * For greedy loops we push the current position, then generate the code that 3836 * eats the input specially in EmitGreedyLoop. The other choice (the 3837 * continuation) is generated by the normal code in EmitChoices, and steps back 3838 * in the input to the starting position when it fails to match. The loop code 3839 * looks like this (U is the unwind code that steps back in the greedy loop). 3840 * 3841 * _____ 3842 * / \ 3843 * V | 3844 * ----------> S1 | 3845 * /| | 3846 * / |S | 3847 * F/ \_____/ 3848 * / 3849 * |<----- 3850 * | \ 3851 * V |S 3852 * Q2 ---> U----->backtrack 3853 * | F / 3854 * S| / 3855 * V F / 3856 * S2--/ 3857 */ 3858 3859 GreedyLoopState::GreedyLoopState(bool not_at_start) { 3860 counter_backtrack_trace_.set_backtrack(&label_); 3861 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3862 } 3863 3864 3865 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3866 #ifdef DEBUG 3867 int choice_count = alternatives_->length(); 3868 for (int i = 0; i < choice_count - 1; i++) { 3869 GuardedAlternative alternative = alternatives_->at(i); 3870 ZoneList<Guard*>* guards = alternative.guards(); 3871 int guard_count = (guards == nullptr) ? 0 : guards->length(); 3872 for (int j = 0; j < guard_count; j++) { 3873 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3874 } 3875 } 3876 #endif 3877 } 3878 3879 3880 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, 3881 Trace* current_trace, 3882 PreloadState* state) { 3883 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3884 // Save some time by looking at most one machine word ahead. 3885 state->eats_at_least_ = 3886 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, 3887 current_trace->at_start() == Trace::FALSE_VALUE); 3888 } 3889 state->preload_characters_ = 3890 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3891 3892 state->preload_is_current_ = 3893 (current_trace->characters_preloaded() == state->preload_characters_); 3894 state->preload_has_checked_bounds_ = state->preload_is_current_; 3895 } 3896 3897 3898 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3899 int choice_count = alternatives_->length(); 3900 3901 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) { 3902 alternatives_->at(0).node()->Emit(compiler, trace); 3903 return; 3904 } 3905 3906 AssertGuardsMentionRegisters(trace); 3907 3908 LimitResult limit_result = LimitVersions(compiler, trace); 3909 if (limit_result == DONE) return; 3910 DCHECK(limit_result == CONTINUE); 3911 3912 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3913 // other choice nodes we only flush if we are out of code size budget. 3914 if (trace->flush_budget() == 0 && trace->actions() != nullptr) { 3915 trace->Flush(compiler, this); 3916 return; 3917 } 3918 3919 RecursionCheck rc(compiler); 3920 3921 PreloadState preload; 3922 preload.init(); 3923 GreedyLoopState greedy_loop_state(not_at_start()); 3924 3925 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 3926 AlternativeGenerationList alt_gens(choice_count, zone()); 3927 3928 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3929 trace = EmitGreedyLoop(compiler, 3930 trace, 3931 &alt_gens, 3932 &preload, 3933 &greedy_loop_state, 3934 text_length); 3935 } else { 3936 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 3937 // match the traces produced pre-cleanup. 3938 Label second_choice; 3939 compiler->macro_assembler()->Bind(&second_choice); 3940 3941 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 3942 3943 EmitChoices(compiler, 3944 &alt_gens, 3945 0, 3946 trace, 3947 &preload); 3948 } 3949 3950 // At this point we need to generate slow checks for the alternatives where 3951 // the quick check was inlined. We can recognize these because the associated 3952 // label was bound. 3953 int new_flush_budget = trace->flush_budget() / choice_count; 3954 for (int i = 0; i < choice_count; i++) { 3955 AlternativeGeneration* alt_gen = alt_gens.at(i); 3956 Trace new_trace(*trace); 3957 // If there are actions to be flushed we have to limit how many times 3958 // they are flushed. Take the budget of the parent trace and distribute 3959 // it fairly amongst the children. 3960 if (new_trace.actions() != nullptr) { 3961 new_trace.set_flush_budget(new_flush_budget); 3962 } 3963 bool next_expects_preload = 3964 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 3965 EmitOutOfLineContinuation(compiler, 3966 &new_trace, 3967 alternatives_->at(i), 3968 alt_gen, 3969 preload.preload_characters_, 3970 next_expects_preload); 3971 } 3972 } 3973 3974 3975 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, 3976 Trace* trace, 3977 AlternativeGenerationList* alt_gens, 3978 PreloadState* preload, 3979 GreedyLoopState* greedy_loop_state, 3980 int text_length) { 3981 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3982 // Here we have special handling for greedy loops containing only text nodes 3983 // and other simple nodes. These are handled by pushing the current 3984 // position on the stack and then incrementing the current position each 3985 // time around the switch. On backtrack we decrement the current position 3986 // and check it against the pushed value. This avoids pushing backtrack 3987 // information for each iteration of the loop, which could take up a lot of 3988 // space. 3989 DCHECK(trace->stop_node() == nullptr); 3990 macro_assembler->PushCurrentPosition(); 3991 Label greedy_match_failed; 3992 Trace greedy_match_trace; 3993 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); 3994 greedy_match_trace.set_backtrack(&greedy_match_failed); 3995 Label loop_label; 3996 macro_assembler->Bind(&loop_label); 3997 greedy_match_trace.set_stop_node(this); 3998 greedy_match_trace.set_loop_label(&loop_label); 3999 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 4000 macro_assembler->Bind(&greedy_match_failed); 4001 4002 Label second_choice; // For use in greedy matches. 4003 macro_assembler->Bind(&second_choice); 4004 4005 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 4006 4007 EmitChoices(compiler, 4008 alt_gens, 4009 1, 4010 new_trace, 4011 preload); 4012 4013 macro_assembler->Bind(greedy_loop_state->label()); 4014 // If we have unwound to the bottom then backtrack. 4015 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4016 // Otherwise try the second priority at an earlier position. 4017 macro_assembler->AdvanceCurrentPosition(-text_length); 4018 macro_assembler->GoTo(&second_choice); 4019 return new_trace; 4020 } 4021 4022 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 4023 Trace* trace) { 4024 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 4025 if (alternatives_->length() != 2) return eats_at_least; 4026 4027 GuardedAlternative alt1 = alternatives_->at(1); 4028 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) { 4029 return eats_at_least; 4030 } 4031 RegExpNode* eats_anything_node = alt1.node(); 4032 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 4033 return eats_at_least; 4034 } 4035 4036 // Really we should be creating a new trace when we execute this function, 4037 // but there is no need, because the code it generates cannot backtrack, and 4038 // we always arrive here with a trivial trace (since it's the entry to a 4039 // loop. That also implies that there are no preloaded characters, which is 4040 // good, because it means we won't be violating any assumptions by 4041 // overwriting those characters with new load instructions. 4042 DCHECK(trace->is_trivial()); 4043 4044 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4045 Isolate* isolate = macro_assembler->isolate(); 4046 // At this point we know that we are at a non-greedy loop that will eat 4047 // any character one at a time. Any non-anchored regexp has such a 4048 // loop prepended to it in order to find where it starts. We look for 4049 // a pattern of the form ...abc... where we can look 6 characters ahead 4050 // and step forwards 3 if the character is not one of abc. Abc need 4051 // not be atoms, they can be any reasonably limited character class or 4052 // small alternation. 4053 BoyerMooreLookahead* bm = bm_info(false); 4054 if (bm == nullptr) { 4055 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4056 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4057 kRecursionBudget, 4058 false)); 4059 if (eats_at_least >= 1) { 4060 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4061 compiler, 4062 zone()); 4063 GuardedAlternative alt0 = alternatives_->at(0); 4064 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4065 } 4066 } 4067 if (bm != nullptr) { 4068 bm->EmitSkipInstructions(macro_assembler); 4069 } 4070 return eats_at_least; 4071 } 4072 4073 4074 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4075 AlternativeGenerationList* alt_gens, 4076 int first_choice, 4077 Trace* trace, 4078 PreloadState* preload) { 4079 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4080 SetUpPreLoad(compiler, trace, preload); 4081 4082 // For now we just call all choices one after the other. The idea ultimately 4083 // is to use the Dispatch table to try only the relevant ones. 4084 int choice_count = alternatives_->length(); 4085 4086 int new_flush_budget = trace->flush_budget() / choice_count; 4087 4088 for (int i = first_choice; i < choice_count; i++) { 4089 bool is_last = i == choice_count - 1; 4090 bool fall_through_on_failure = !is_last; 4091 GuardedAlternative alternative = alternatives_->at(i); 4092 AlternativeGeneration* alt_gen = alt_gens->at(i); 4093 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 4094 ZoneList<Guard*>* guards = alternative.guards(); 4095 int guard_count = (guards == nullptr) ? 0 : guards->length(); 4096 Trace new_trace(*trace); 4097 new_trace.set_characters_preloaded(preload->preload_is_current_ ? 4098 preload->preload_characters_ : 4099 0); 4100 if (preload->preload_has_checked_bounds_) { 4101 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4102 } 4103 new_trace.quick_check_performed()->Clear(); 4104 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4105 if (!is_last) { 4106 new_trace.set_backtrack(&alt_gen->after); 4107 } 4108 alt_gen->expects_preload = preload->preload_is_current_; 4109 bool generate_full_check_inline = false; 4110 if (compiler->optimize() && 4111 try_to_emit_quick_check_for_alternative(i == 0) && 4112 alternative.node()->EmitQuickCheck( 4113 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 4114 &alt_gen->possible_success, &alt_gen->quick_check_details, 4115 fall_through_on_failure)) { 4116 // Quick check was generated for this choice. 4117 preload->preload_is_current_ = true; 4118 preload->preload_has_checked_bounds_ = true; 4119 // If we generated the quick check to fall through on possible success, 4120 // we now need to generate the full check inline. 4121 if (!fall_through_on_failure) { 4122 macro_assembler->Bind(&alt_gen->possible_success); 4123 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4124 new_trace.set_characters_preloaded(preload->preload_characters_); 4125 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4126 generate_full_check_inline = true; 4127 } 4128 } else if (alt_gen->quick_check_details.cannot_match()) { 4129 if (!fall_through_on_failure) { 4130 macro_assembler->GoTo(trace->backtrack()); 4131 } 4132 continue; 4133 } else { 4134 // No quick check was generated. Put the full code here. 4135 // If this is not the first choice then there could be slow checks from 4136 // previous cases that go here when they fail. There's no reason to 4137 // insist that they preload characters since the slow check we are about 4138 // to generate probably can't use it. 4139 if (i != first_choice) { 4140 alt_gen->expects_preload = false; 4141 new_trace.InvalidateCurrentCharacter(); 4142 } 4143 generate_full_check_inline = true; 4144 } 4145 if (generate_full_check_inline) { 4146 if (new_trace.actions() != nullptr) { 4147 new_trace.set_flush_budget(new_flush_budget); 4148 } 4149 for (int j = 0; j < guard_count; j++) { 4150 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4151 } 4152 alternative.node()->Emit(compiler, &new_trace); 4153 preload->preload_is_current_ = false; 4154 } 4155 macro_assembler->Bind(&alt_gen->after); 4156 } 4157 } 4158 4159 4160 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4161 Trace* trace, 4162 GuardedAlternative alternative, 4163 AlternativeGeneration* alt_gen, 4164 int preload_characters, 4165 bool next_expects_preload) { 4166 if (!alt_gen->possible_success.is_linked()) return; 4167 4168 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4169 macro_assembler->Bind(&alt_gen->possible_success); 4170 Trace out_of_line_trace(*trace); 4171 out_of_line_trace.set_characters_preloaded(preload_characters); 4172 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4173 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4174 ZoneList<Guard*>* guards = alternative.guards(); 4175 int guard_count = (guards == nullptr) ? 0 : guards->length(); 4176 if (next_expects_preload) { 4177 Label reload_current_char; 4178 out_of_line_trace.set_backtrack(&reload_current_char); 4179 for (int j = 0; j < guard_count; j++) { 4180 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4181 } 4182 alternative.node()->Emit(compiler, &out_of_line_trace); 4183 macro_assembler->Bind(&reload_current_char); 4184 // Reload the current character, since the next quick check expects that. 4185 // We don't need to check bounds here because we only get into this 4186 // code through a quick check which already did the checked load. 4187 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false, 4188 preload_characters); 4189 macro_assembler->GoTo(&(alt_gen->after)); 4190 } else { 4191 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4192 for (int j = 0; j < guard_count; j++) { 4193 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4194 } 4195 alternative.node()->Emit(compiler, &out_of_line_trace); 4196 } 4197 } 4198 4199 4200 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4201 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4202 LimitResult limit_result = LimitVersions(compiler, trace); 4203 if (limit_result == DONE) return; 4204 DCHECK(limit_result == CONTINUE); 4205 4206 RecursionCheck rc(compiler); 4207 4208 switch (action_type_) { 4209 case STORE_POSITION: { 4210 Trace::DeferredCapture 4211 new_capture(data_.u_position_register.reg, 4212 data_.u_position_register.is_capture, 4213 trace); 4214 Trace new_trace = *trace; 4215 new_trace.add_action(&new_capture); 4216 on_success()->Emit(compiler, &new_trace); 4217 break; 4218 } 4219 case INCREMENT_REGISTER: { 4220 Trace::DeferredIncrementRegister 4221 new_increment(data_.u_increment_register.reg); 4222 Trace new_trace = *trace; 4223 new_trace.add_action(&new_increment); 4224 on_success()->Emit(compiler, &new_trace); 4225 break; 4226 } 4227 case SET_REGISTER: { 4228 Trace::DeferredSetRegister 4229 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4230 Trace new_trace = *trace; 4231 new_trace.add_action(&new_set); 4232 on_success()->Emit(compiler, &new_trace); 4233 break; 4234 } 4235 case CLEAR_CAPTURES: { 4236 Trace::DeferredClearCaptures 4237 new_capture(Interval(data_.u_clear_captures.range_from, 4238 data_.u_clear_captures.range_to)); 4239 Trace new_trace = *trace; 4240 new_trace.add_action(&new_capture); 4241 on_success()->Emit(compiler, &new_trace); 4242 break; 4243 } 4244 case BEGIN_SUBMATCH: 4245 if (!trace->is_trivial()) { 4246 trace->Flush(compiler, this); 4247 } else { 4248 assembler->WriteCurrentPositionToRegister( 4249 data_.u_submatch.current_position_register, 0); 4250 assembler->WriteStackPointerToRegister( 4251 data_.u_submatch.stack_pointer_register); 4252 on_success()->Emit(compiler, trace); 4253 } 4254 break; 4255 case EMPTY_MATCH_CHECK: { 4256 int start_pos_reg = data_.u_empty_match_check.start_register; 4257 int stored_pos = 0; 4258 int rep_reg = data_.u_empty_match_check.repetition_register; 4259 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4260 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4261 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4262 // If we know we haven't advanced and there is no minimum we 4263 // can just backtrack immediately. 4264 assembler->GoTo(trace->backtrack()); 4265 } else if (know_dist && stored_pos < trace->cp_offset()) { 4266 // If we know we've advanced we can generate the continuation 4267 // immediately. 4268 on_success()->Emit(compiler, trace); 4269 } else if (!trace->is_trivial()) { 4270 trace->Flush(compiler, this); 4271 } else { 4272 Label skip_empty_check; 4273 // If we have a minimum number of repetitions we check the current 4274 // number first and skip the empty check if it's not enough. 4275 if (has_minimum) { 4276 int limit = data_.u_empty_match_check.repetition_limit; 4277 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4278 } 4279 // If the match is empty we bail out, otherwise we fall through 4280 // to the on-success continuation. 4281 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4282 trace->backtrack()); 4283 assembler->Bind(&skip_empty_check); 4284 on_success()->Emit(compiler, trace); 4285 } 4286 break; 4287 } 4288 case POSITIVE_SUBMATCH_SUCCESS: { 4289 if (!trace->is_trivial()) { 4290 trace->Flush(compiler, this); 4291 return; 4292 } 4293 assembler->ReadCurrentPositionFromRegister( 4294 data_.u_submatch.current_position_register); 4295 assembler->ReadStackPointerFromRegister( 4296 data_.u_submatch.stack_pointer_register); 4297 int clear_register_count = data_.u_submatch.clear_register_count; 4298 if (clear_register_count == 0) { 4299 on_success()->Emit(compiler, trace); 4300 return; 4301 } 4302 int clear_registers_from = data_.u_submatch.clear_register_from; 4303 Label clear_registers_backtrack; 4304 Trace new_trace = *trace; 4305 new_trace.set_backtrack(&clear_registers_backtrack); 4306 on_success()->Emit(compiler, &new_trace); 4307 4308 assembler->Bind(&clear_registers_backtrack); 4309 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4310 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4311 4312 DCHECK(trace->backtrack() == nullptr); 4313 assembler->Backtrack(); 4314 return; 4315 } 4316 default: 4317 UNREACHABLE(); 4318 } 4319 } 4320 4321 4322 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4323 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4324 if (!trace->is_trivial()) { 4325 trace->Flush(compiler, this); 4326 return; 4327 } 4328 4329 LimitResult limit_result = LimitVersions(compiler, trace); 4330 if (limit_result == DONE) return; 4331 DCHECK(limit_result == CONTINUE); 4332 4333 RecursionCheck rc(compiler); 4334 4335 DCHECK_EQ(start_reg_ + 1, end_reg_); 4336 if (IgnoreCase(flags_)) { 4337 assembler->CheckNotBackReferenceIgnoreCase( 4338 start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack()); 4339 } else { 4340 assembler->CheckNotBackReference(start_reg_, read_backward(), 4341 trace->backtrack()); 4342 } 4343 // We are going to advance backward, so we may end up at the start. 4344 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 4345 4346 // Check that the back reference does not end inside a surrogate pair. 4347 if (IsUnicode(flags_) && !compiler->one_byte()) { 4348 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); 4349 } 4350 on_success()->Emit(compiler, trace); 4351 } 4352 4353 4354 // ------------------------------------------------------------------- 4355 // Dot/dotty output 4356 4357 4358 #ifdef DEBUG 4359 4360 4361 class DotPrinter: public NodeVisitor { 4362 public: 4363 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT 4364 : os_(os), 4365 ignore_case_(ignore_case) {} 4366 void PrintNode(const char* label, RegExpNode* node); 4367 void Visit(RegExpNode* node); 4368 void PrintAttributes(RegExpNode* from); 4369 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4370 #define DECLARE_VISIT(Type) \ 4371 virtual void Visit##Type(Type##Node* that); 4372 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4373 #undef DECLARE_VISIT 4374 private: 4375 std::ostream& os_; 4376 bool ignore_case_; 4377 }; 4378 4379 4380 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4381 os_ << "digraph G {\n graph [label=\""; 4382 for (int i = 0; label[i]; i++) { 4383 switch (label[i]) { 4384 case '\\': 4385 os_ << "\\\\"; 4386 break; 4387 case '"': 4388 os_ << "\""; 4389 break; 4390 default: 4391 os_ << label[i]; 4392 break; 4393 } 4394 } 4395 os_ << "\"];\n"; 4396 Visit(node); 4397 os_ << "}" << std::endl; 4398 } 4399 4400 4401 void DotPrinter::Visit(RegExpNode* node) { 4402 if (node->info()->visited) return; 4403 node->info()->visited = true; 4404 node->Accept(this); 4405 } 4406 4407 4408 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4409 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; 4410 Visit(on_failure); 4411 } 4412 4413 4414 class TableEntryBodyPrinter { 4415 public: 4416 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT 4417 : os_(os), 4418 choice_(choice) {} 4419 void Call(uc16 from, DispatchTable::Entry entry) { 4420 OutSet* out_set = entry.out_set(); 4421 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4422 if (out_set->Get(i)) { 4423 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n" 4424 << choice()->alternatives()->at(i).node() << ";\n"; 4425 } 4426 } 4427 } 4428 private: 4429 ChoiceNode* choice() { return choice_; } 4430 std::ostream& os_; 4431 ChoiceNode* choice_; 4432 }; 4433 4434 4435 class TableEntryHeaderPrinter { 4436 public: 4437 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT 4438 : first_(true), 4439 os_(os) {} 4440 void Call(uc16 from, DispatchTable::Entry entry) { 4441 if (first_) { 4442 first_ = false; 4443 } else { 4444 os_ << "|"; 4445 } 4446 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{"; 4447 OutSet* out_set = entry.out_set(); 4448 int priority = 0; 4449 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4450 if (out_set->Get(i)) { 4451 if (priority > 0) os_ << "|"; 4452 os_ << "<s" << from << "o" << i << "> " << priority; 4453 priority++; 4454 } 4455 } 4456 os_ << "}}"; 4457 } 4458 4459 private: 4460 bool first_; 4461 std::ostream& os_; 4462 }; 4463 4464 4465 class AttributePrinter { 4466 public: 4467 explicit AttributePrinter(std::ostream& os) // NOLINT 4468 : os_(os), 4469 first_(true) {} 4470 void PrintSeparator() { 4471 if (first_) { 4472 first_ = false; 4473 } else { 4474 os_ << "|"; 4475 } 4476 } 4477 void PrintBit(const char* name, bool value) { 4478 if (!value) return; 4479 PrintSeparator(); 4480 os_ << "{" << name << "}"; 4481 } 4482 void PrintPositive(const char* name, int value) { 4483 if (value < 0) return; 4484 PrintSeparator(); 4485 os_ << "{" << name << "|" << value << "}"; 4486 } 4487 4488 private: 4489 std::ostream& os_; 4490 bool first_; 4491 }; 4492 4493 4494 void DotPrinter::PrintAttributes(RegExpNode* that) { 4495 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " 4496 << "margin=0.1, fontsize=10, label=\"{"; 4497 AttributePrinter printer(os_); 4498 NodeInfo* info = that->info(); 4499 printer.PrintBit("NI", info->follows_newline_interest); 4500 printer.PrintBit("WI", info->follows_word_interest); 4501 printer.PrintBit("SI", info->follows_start_interest); 4502 Label* label = that->label(); 4503 if (label->is_bound()) 4504 printer.PrintPositive("@", label->pos()); 4505 os_ << "}\"];\n" 4506 << " a" << that << " -> n" << that 4507 << " [style=dashed, color=grey, arrowhead=none];\n"; 4508 } 4509 4510 4511 static const bool kPrintDispatchTable = false; 4512 void DotPrinter::VisitChoice(ChoiceNode* that) { 4513 if (kPrintDispatchTable) { 4514 os_ << " n" << that << " [shape=Mrecord, label=\""; 4515 TableEntryHeaderPrinter header_printer(os_); 4516 that->GetTable(ignore_case_)->ForEach(&header_printer); 4517 os_ << "\"]\n"; 4518 PrintAttributes(that); 4519 TableEntryBodyPrinter body_printer(os_, that); 4520 that->GetTable(ignore_case_)->ForEach(&body_printer); 4521 } else { 4522 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; 4523 for (int i = 0; i < that->alternatives()->length(); i++) { 4524 GuardedAlternative alt = that->alternatives()->at(i); 4525 os_ << " n" << that << " -> n" << alt.node(); 4526 } 4527 } 4528 for (int i = 0; i < that->alternatives()->length(); i++) { 4529 GuardedAlternative alt = that->alternatives()->at(i); 4530 alt.node()->Accept(this); 4531 } 4532 } 4533 4534 4535 void DotPrinter::VisitText(TextNode* that) { 4536 Zone* zone = that->zone(); 4537 os_ << " n" << that << " [label=\""; 4538 for (int i = 0; i < that->elements()->length(); i++) { 4539 if (i > 0) os_ << " "; 4540 TextElement elm = that->elements()->at(i); 4541 switch (elm.text_type()) { 4542 case TextElement::ATOM: { 4543 Vector<const uc16> data = elm.atom()->data(); 4544 for (int i = 0; i < data.length(); i++) { 4545 os_ << static_cast<char>(data[i]); 4546 } 4547 break; 4548 } 4549 case TextElement::CHAR_CLASS: { 4550 RegExpCharacterClass* node = elm.char_class(); 4551 os_ << "["; 4552 if (node->is_negated()) os_ << "^"; 4553 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4554 CharacterRange range = node->ranges(zone)->at(j); 4555 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); 4556 } 4557 os_ << "]"; 4558 break; 4559 } 4560 default: 4561 UNREACHABLE(); 4562 } 4563 } 4564 os_ << "\", shape=box, peripheries=2];\n"; 4565 PrintAttributes(that); 4566 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4567 Visit(that->on_success()); 4568 } 4569 4570 4571 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4572 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" 4573 << that->end_register() << "\", shape=doubleoctagon];\n"; 4574 PrintAttributes(that); 4575 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4576 Visit(that->on_success()); 4577 } 4578 4579 4580 void DotPrinter::VisitEnd(EndNode* that) { 4581 os_ << " n" << that << " [style=bold, shape=point];\n"; 4582 PrintAttributes(that); 4583 } 4584 4585 4586 void DotPrinter::VisitAssertion(AssertionNode* that) { 4587 os_ << " n" << that << " ["; 4588 switch (that->assertion_type()) { 4589 case AssertionNode::AT_END: 4590 os_ << "label=\"$\", shape=septagon"; 4591 break; 4592 case AssertionNode::AT_START: 4593 os_ << "label=\"^\", shape=septagon"; 4594 break; 4595 case AssertionNode::AT_BOUNDARY: 4596 os_ << "label=\"\\b\", shape=septagon"; 4597 break; 4598 case AssertionNode::AT_NON_BOUNDARY: 4599 os_ << "label=\"\\B\", shape=septagon"; 4600 break; 4601 case AssertionNode::AFTER_NEWLINE: 4602 os_ << "label=\"(?<=\\n)\", shape=septagon"; 4603 break; 4604 } 4605 os_ << "];\n"; 4606 PrintAttributes(that); 4607 RegExpNode* successor = that->on_success(); 4608 os_ << " n" << that << " -> n" << successor << ";\n"; 4609 Visit(successor); 4610 } 4611 4612 4613 void DotPrinter::VisitAction(ActionNode* that) { 4614 os_ << " n" << that << " ["; 4615 switch (that->action_type_) { 4616 case ActionNode::SET_REGISTER: 4617 os_ << "label=\"$" << that->data_.u_store_register.reg 4618 << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; 4619 break; 4620 case ActionNode::INCREMENT_REGISTER: 4621 os_ << "label=\"$" << that->data_.u_increment_register.reg 4622 << "++\", shape=octagon"; 4623 break; 4624 case ActionNode::STORE_POSITION: 4625 os_ << "label=\"$" << that->data_.u_position_register.reg 4626 << ":=$pos\", shape=octagon"; 4627 break; 4628 case ActionNode::BEGIN_SUBMATCH: 4629 os_ << "label=\"$" << that->data_.u_submatch.current_position_register 4630 << ":=$pos,begin\", shape=septagon"; 4631 break; 4632 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4633 os_ << "label=\"escape\", shape=septagon"; 4634 break; 4635 case ActionNode::EMPTY_MATCH_CHECK: 4636 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register 4637 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register 4638 << "<" << that->data_.u_empty_match_check.repetition_limit 4639 << "?\", shape=septagon"; 4640 break; 4641 case ActionNode::CLEAR_CAPTURES: { 4642 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from 4643 << " to $" << that->data_.u_clear_captures.range_to 4644 << "\", shape=septagon"; 4645 break; 4646 } 4647 } 4648 os_ << "];\n"; 4649 PrintAttributes(that); 4650 RegExpNode* successor = that->on_success(); 4651 os_ << " n" << that << " -> n" << successor << ";\n"; 4652 Visit(successor); 4653 } 4654 4655 4656 class DispatchTableDumper { 4657 public: 4658 explicit DispatchTableDumper(std::ostream& os) : os_(os) {} 4659 void Call(uc16 key, DispatchTable::Entry entry); 4660 private: 4661 std::ostream& os_; 4662 }; 4663 4664 4665 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4666 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {"; 4667 OutSet* set = entry.out_set(); 4668 bool first = true; 4669 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4670 if (set->Get(i)) { 4671 if (first) { 4672 first = false; 4673 } else { 4674 os_ << ", "; 4675 } 4676 os_ << i; 4677 } 4678 } 4679 os_ << "}\n"; 4680 } 4681 4682 4683 void DispatchTable::Dump() { 4684 OFStream os(stderr); 4685 DispatchTableDumper dumper(os); 4686 tree()->ForEach(&dumper); 4687 } 4688 4689 4690 void RegExpEngine::DotPrint(const char* label, 4691 RegExpNode* node, 4692 bool ignore_case) { 4693 StdoutStream os; 4694 DotPrinter printer(os, ignore_case); 4695 printer.PrintNode(label, node); 4696 } 4697 4698 4699 #endif // DEBUG 4700 4701 4702 // ------------------------------------------------------------------- 4703 // Tree to graph conversion 4704 4705 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4706 RegExpNode* on_success) { 4707 ZoneList<TextElement>* elms = 4708 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4709 elms->Add(TextElement::Atom(this), compiler->zone()); 4710 return new (compiler->zone()) 4711 TextNode(elms, compiler->read_backward(), on_success); 4712 } 4713 4714 4715 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4716 RegExpNode* on_success) { 4717 return new (compiler->zone()) 4718 TextNode(elements(), compiler->read_backward(), on_success); 4719 } 4720 4721 4722 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4723 const int* special_class, 4724 int length) { 4725 length--; // Remove final marker. 4726 DCHECK_EQ(kRangeEndMarker, special_class[length]); 4727 DCHECK_NE(0, ranges->length()); 4728 DCHECK_NE(0, length); 4729 DCHECK_NE(0, special_class[0]); 4730 if (ranges->length() != (length >> 1) + 1) { 4731 return false; 4732 } 4733 CharacterRange range = ranges->at(0); 4734 if (range.from() != 0) { 4735 return false; 4736 } 4737 for (int i = 0; i < length; i += 2) { 4738 if (special_class[i] != (range.to() + 1)) { 4739 return false; 4740 } 4741 range = ranges->at((i >> 1) + 1); 4742 if (special_class[i+1] != range.from()) { 4743 return false; 4744 } 4745 } 4746 if (range.to() != String::kMaxCodePoint) { 4747 return false; 4748 } 4749 return true; 4750 } 4751 4752 4753 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4754 const int* special_class, 4755 int length) { 4756 length--; // Remove final marker. 4757 DCHECK_EQ(kRangeEndMarker, special_class[length]); 4758 if (ranges->length() * 2 != length) { 4759 return false; 4760 } 4761 for (int i = 0; i < length; i += 2) { 4762 CharacterRange range = ranges->at(i >> 1); 4763 if (range.from() != special_class[i] || 4764 range.to() != special_class[i + 1] - 1) { 4765 return false; 4766 } 4767 } 4768 return true; 4769 } 4770 4771 4772 bool RegExpCharacterClass::is_standard(Zone* zone) { 4773 // TODO(lrn): Remove need for this function, by not throwing away information 4774 // along the way. 4775 if (is_negated()) { 4776 return false; 4777 } 4778 if (set_.is_standard()) { 4779 return true; 4780 } 4781 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4782 set_.set_standard_set_type('s'); 4783 return true; 4784 } 4785 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4786 set_.set_standard_set_type('S'); 4787 return true; 4788 } 4789 if (CompareInverseRanges(set_.ranges(zone), 4790 kLineTerminatorRanges, 4791 kLineTerminatorRangeCount)) { 4792 set_.set_standard_set_type('.'); 4793 return true; 4794 } 4795 if (CompareRanges(set_.ranges(zone), 4796 kLineTerminatorRanges, 4797 kLineTerminatorRangeCount)) { 4798 set_.set_standard_set_type('n'); 4799 return true; 4800 } 4801 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4802 set_.set_standard_set_type('w'); 4803 return true; 4804 } 4805 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4806 set_.set_standard_set_type('W'); 4807 return true; 4808 } 4809 return false; 4810 } 4811 4812 4813 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, 4814 ZoneList<CharacterRange>* base) 4815 : zone_(zone), 4816 table_(zone), 4817 bmp_(nullptr), 4818 lead_surrogates_(nullptr), 4819 trail_surrogates_(nullptr), 4820 non_bmp_(nullptr) { 4821 // The unicode range splitter categorizes given character ranges into: 4822 // - Code points from the BMP representable by one code unit. 4823 // - Code points outside the BMP that need to be split into surrogate pairs. 4824 // - Lone lead surrogates. 4825 // - Lone trail surrogates. 4826 // Lone surrogates are valid code points, even though no actual characters. 4827 // They require special matching to make sure we do not split surrogate pairs. 4828 // We use the dispatch table to accomplish this. The base range is split up 4829 // by the table by the overlay ranges, and the Call callback is used to 4830 // filter and collect ranges for each category. 4831 for (int i = 0; i < base->length(); i++) { 4832 table_.AddRange(base->at(i), kBase, zone_); 4833 } 4834 // Add overlay ranges. 4835 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1), 4836 kBmpCodePoints, zone_); 4837 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), 4838 kLeadSurrogates, zone_); 4839 table_.AddRange( 4840 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 4841 kTrailSurrogates, zone_); 4842 table_.AddRange( 4843 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1), 4844 kBmpCodePoints, zone_); 4845 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd), 4846 kNonBmpCodePoints, zone_); 4847 table_.ForEach(this); 4848 } 4849 4850 4851 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { 4852 OutSet* outset = entry.out_set(); 4853 if (!outset->Get(kBase)) return; 4854 ZoneList<CharacterRange>** target = nullptr; 4855 if (outset->Get(kBmpCodePoints)) { 4856 target = &bmp_; 4857 } else if (outset->Get(kLeadSurrogates)) { 4858 target = &lead_surrogates_; 4859 } else if (outset->Get(kTrailSurrogates)) { 4860 target = &trail_surrogates_; 4861 } else { 4862 DCHECK(outset->Get(kNonBmpCodePoints)); 4863 target = &non_bmp_; 4864 } 4865 if (*target == nullptr) 4866 *target = new (zone_) ZoneList<CharacterRange>(2, zone_); 4867 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); 4868 } 4869 4870 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 4871 RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 4872 ZoneList<CharacterRange>* bmp = splitter->bmp(); 4873 if (bmp == nullptr) return; 4874 JSRegExp::Flags default_flags = JSRegExp::Flags(); 4875 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 4876 compiler->zone(), bmp, compiler->read_backward(), on_success, 4877 default_flags))); 4878 } 4879 4880 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 4881 RegExpNode* on_success, 4882 UnicodeRangeSplitter* splitter) { 4883 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp(); 4884 if (non_bmp == nullptr) return; 4885 DCHECK(!compiler->one_byte()); 4886 Zone* zone = compiler->zone(); 4887 JSRegExp::Flags default_flags = JSRegExp::Flags(); 4888 CharacterRange::Canonicalize(non_bmp); 4889 for (int i = 0; i < non_bmp->length(); i++) { 4890 // Match surrogate pair. 4891 // E.g. [\u10005-\u11005] becomes 4892 // \ud800[\udc05-\udfff]| 4893 // [\ud801-\ud803][\udc00-\udfff]| 4894 // \ud804[\udc00-\udc05] 4895 uc32 from = non_bmp->at(i).from(); 4896 uc32 to = non_bmp->at(i).to(); 4897 uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 4898 uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 4899 uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 4900 uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 4901 if (from_l == to_l) { 4902 // The lead surrogate is the same. 4903 result->AddAlternative( 4904 GuardedAlternative(TextNode::CreateForSurrogatePair( 4905 zone, CharacterRange::Singleton(from_l), 4906 CharacterRange::Range(from_t, to_t), compiler->read_backward(), 4907 on_success, default_flags))); 4908 } else { 4909 if (from_t != kTrailSurrogateStart) { 4910 // Add [from_l][from_t-\udfff] 4911 result->AddAlternative( 4912 GuardedAlternative(TextNode::CreateForSurrogatePair( 4913 zone, CharacterRange::Singleton(from_l), 4914 CharacterRange::Range(from_t, kTrailSurrogateEnd), 4915 compiler->read_backward(), on_success, default_flags))); 4916 from_l++; 4917 } 4918 if (to_t != kTrailSurrogateEnd) { 4919 // Add [to_l][\udc00-to_t] 4920 result->AddAlternative( 4921 GuardedAlternative(TextNode::CreateForSurrogatePair( 4922 zone, CharacterRange::Singleton(to_l), 4923 CharacterRange::Range(kTrailSurrogateStart, to_t), 4924 compiler->read_backward(), on_success, default_flags))); 4925 to_l--; 4926 } 4927 if (from_l <= to_l) { 4928 // Add [from_l-to_l][\udc00-\udfff] 4929 result->AddAlternative( 4930 GuardedAlternative(TextNode::CreateForSurrogatePair( 4931 zone, CharacterRange::Range(from_l, to_l), 4932 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 4933 compiler->read_backward(), on_success, default_flags))); 4934 } 4935 } 4936 } 4937 } 4938 4939 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 4940 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 4941 ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward, 4942 JSRegExp::Flags flags) { 4943 Zone* zone = compiler->zone(); 4944 RegExpNode* match_node = TextNode::CreateForCharacterRanges( 4945 zone, match, read_backward, on_success, flags); 4946 int stack_register = compiler->UnicodeLookaroundStackRegister(); 4947 int position_register = compiler->UnicodeLookaroundPositionRegister(); 4948 RegExpLookaround::Builder lookaround(false, match_node, stack_register, 4949 position_register); 4950 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 4951 zone, lookbehind, !read_backward, lookaround.on_match_success(), flags); 4952 return lookaround.ForMatch(negative_match); 4953 } 4954 4955 RegExpNode* MatchAndNegativeLookaroundInReadDirection( 4956 RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 4957 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 4958 bool read_backward, JSRegExp::Flags flags) { 4959 Zone* zone = compiler->zone(); 4960 int stack_register = compiler->UnicodeLookaroundStackRegister(); 4961 int position_register = compiler->UnicodeLookaroundPositionRegister(); 4962 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 4963 position_register); 4964 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 4965 zone, lookahead, read_backward, lookaround.on_match_success(), flags); 4966 return TextNode::CreateForCharacterRanges( 4967 zone, match, read_backward, lookaround.ForMatch(negative_match), flags); 4968 } 4969 4970 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 4971 RegExpNode* on_success, 4972 UnicodeRangeSplitter* splitter) { 4973 JSRegExp::Flags default_flags = JSRegExp::Flags(); 4974 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates(); 4975 if (lead_surrogates == nullptr) return; 4976 Zone* zone = compiler->zone(); 4977 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 4978 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 4979 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 4980 4981 RegExpNode* match; 4982 if (compiler->read_backward()) { 4983 // Reading backward. Assert that reading forward, there is no trail 4984 // surrogate, and then backward match the lead surrogate. 4985 match = NegativeLookaroundAgainstReadDirectionAndMatch( 4986 compiler, trail_surrogates, lead_surrogates, on_success, true, 4987 default_flags); 4988 } else { 4989 // Reading forward. Forward match the lead surrogate and assert that 4990 // no trail surrogate follows. 4991 match = MatchAndNegativeLookaroundInReadDirection( 4992 compiler, lead_surrogates, trail_surrogates, on_success, false, 4993 default_flags); 4994 } 4995 result->AddAlternative(GuardedAlternative(match)); 4996 } 4997 4998 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 4999 RegExpNode* on_success, 5000 UnicodeRangeSplitter* splitter) { 5001 JSRegExp::Flags default_flags = JSRegExp::Flags(); 5002 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates(); 5003 if (trail_surrogates == nullptr) return; 5004 Zone* zone = compiler->zone(); 5005 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 5006 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 5007 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 5008 5009 RegExpNode* match; 5010 if (compiler->read_backward()) { 5011 // Reading backward. Backward match the trail surrogate and assert that no 5012 // lead surrogate precedes it. 5013 match = MatchAndNegativeLookaroundInReadDirection( 5014 compiler, trail_surrogates, lead_surrogates, on_success, true, 5015 default_flags); 5016 } else { 5017 // Reading forward. Assert that reading backward, there is no lead 5018 // surrogate, and then forward match the trail surrogate. 5019 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5020 compiler, lead_surrogates, trail_surrogates, on_success, false, 5021 default_flags); 5022 } 5023 result->AddAlternative(GuardedAlternative(match)); 5024 } 5025 5026 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 5027 RegExpNode* on_success) { 5028 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 5029 DCHECK(!compiler->read_backward()); 5030 Zone* zone = compiler->zone(); 5031 // Advance any character. If the character happens to be a lead surrogate and 5032 // we advanced into the middle of a surrogate pair, it will work out, as 5033 // nothing will match from there. We will have to advance again, consuming 5034 // the associated trail surrogate. 5035 ZoneList<CharacterRange>* range = CharacterRange::List( 5036 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); 5037 JSRegExp::Flags default_flags = JSRegExp::Flags(); 5038 return TextNode::CreateForCharacterRanges(zone, range, false, on_success, 5039 default_flags); 5040 } 5041 5042 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { 5043 #ifdef V8_INTL_SUPPORT 5044 DCHECK(CharacterRange::IsCanonical(ranges)); 5045 5046 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. 5047 // See also https://crbug.com/v8/6727. 5048 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range, 5049 // which we use frequently internally. But large ranges can also easily be 5050 // created by the user. We might want to have a more general caching mechanism 5051 // for such ranges. 5052 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return; 5053 5054 // Use ICU to compute the case fold closure over the ranges. 5055 icu::UnicodeSet set; 5056 for (int i = 0; i < ranges->length(); i++) { 5057 set.add(ranges->at(i).from(), ranges->at(i).to()); 5058 } 5059 ranges->Clear(); 5060 set.closeOver(USET_CASE_INSENSITIVE); 5061 // Full case mapping map single characters to multiple characters. 5062 // Those are represented as strings in the set. Remove them so that 5063 // we end up with only simple and common case mappings. 5064 set.removeAllStrings(); 5065 for (int i = 0; i < set.getRangeCount(); i++) { 5066 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), 5067 zone); 5068 } 5069 // No errors and everything we collected have been ranges. 5070 CharacterRange::Canonicalize(ranges); 5071 #endif // V8_INTL_SUPPORT 5072 } 5073 5074 5075 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5076 RegExpNode* on_success) { 5077 set_.Canonicalize(); 5078 Zone* zone = compiler->zone(); 5079 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5080 if (NeedsUnicodeCaseEquivalents(flags_)) { 5081 AddUnicodeCaseEquivalents(ranges, zone); 5082 } 5083 if (IsUnicode(flags_) && !compiler->one_byte() && 5084 !contains_split_surrogate()) { 5085 if (is_negated()) { 5086 ZoneList<CharacterRange>* negated = 5087 new (zone) ZoneList<CharacterRange>(2, zone); 5088 CharacterRange::Negate(ranges, negated, zone); 5089 ranges = negated; 5090 } 5091 if (ranges->length() == 0) { 5092 JSRegExp::Flags default_flags; 5093 RegExpCharacterClass* fail = 5094 new (zone) RegExpCharacterClass(zone, ranges, default_flags); 5095 return new (zone) TextNode(fail, compiler->read_backward(), on_success); 5096 } 5097 if (standard_type() == '*') { 5098 return UnanchoredAdvance(compiler, on_success); 5099 } else { 5100 ChoiceNode* result = new (zone) ChoiceNode(2, zone); 5101 UnicodeRangeSplitter splitter(zone, ranges); 5102 AddBmpCharacters(compiler, result, on_success, &splitter); 5103 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5104 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5105 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5106 return result; 5107 } 5108 } else { 5109 return new (zone) TextNode(this, compiler->read_backward(), on_success); 5110 } 5111 } 5112 5113 5114 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5115 RegExpAtom* atom1 = (*a)->AsAtom(); 5116 RegExpAtom* atom2 = (*b)->AsAtom(); 5117 uc16 character1 = atom1->data().at(0); 5118 uc16 character2 = atom2->data().at(0); 5119 if (character1 < character2) return -1; 5120 if (character1 > character2) return 1; 5121 return 0; 5122 } 5123 5124 5125 static unibrow::uchar Canonical( 5126 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5127 unibrow::uchar c) { 5128 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 5129 int length = canonicalize->get(c, '\0', chars); 5130 DCHECK_LE(length, 1); 5131 unibrow::uchar canonical = c; 5132 if (length == 1) canonical = chars[0]; 5133 return canonical; 5134 } 5135 5136 5137 int CompareFirstCharCaseIndependent( 5138 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5139 RegExpTree* const* a, RegExpTree* const* b) { 5140 RegExpAtom* atom1 = (*a)->AsAtom(); 5141 RegExpAtom* atom2 = (*b)->AsAtom(); 5142 unibrow::uchar character1 = atom1->data().at(0); 5143 unibrow::uchar character2 = atom2->data().at(0); 5144 if (character1 == character2) return 0; 5145 if (character1 >= 'a' || character2 >= 'a') { 5146 character1 = Canonical(canonicalize, character1); 5147 character2 = Canonical(canonicalize, character2); 5148 } 5149 return static_cast<int>(character1) - static_cast<int>(character2); 5150 } 5151 5152 5153 // We can stable sort runs of atoms, since the order does not matter if they 5154 // start with different characters. 5155 // Returns true if any consecutive atoms were found. 5156 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 5157 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5158 int length = alternatives->length(); 5159 bool found_consecutive_atoms = false; 5160 for (int i = 0; i < length; i++) { 5161 while (i < length) { 5162 RegExpTree* alternative = alternatives->at(i); 5163 if (alternative->IsAtom()) break; 5164 i++; 5165 } 5166 // i is length or it is the index of an atom. 5167 if (i == length) break; 5168 int first_atom = i; 5169 JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags(); 5170 i++; 5171 while (i < length) { 5172 RegExpTree* alternative = alternatives->at(i); 5173 if (!alternative->IsAtom()) break; 5174 if (alternative->AsAtom()->flags() != flags) break; 5175 i++; 5176 } 5177 // Sort atoms to get ones with common prefixes together. 5178 // This step is more tricky if we are in a case-independent regexp, 5179 // because it would change /is|I/ to /I|is/, and order matters when 5180 // the regexp parts don't match only disjoint starting points. To fix 5181 // this we have a version of CompareFirstChar that uses case- 5182 // independent character classes for comparison. 5183 DCHECK_LT(first_atom, alternatives->length()); 5184 DCHECK_LE(i, alternatives->length()); 5185 DCHECK_LE(first_atom, i); 5186 if (IgnoreCase(flags)) { 5187 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5188 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5189 auto compare_closure = 5190 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { 5191 return CompareFirstCharCaseIndependent(canonicalize, a, b); 5192 }; 5193 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 5194 } else { 5195 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 5196 } 5197 if (i - first_atom > 1) found_consecutive_atoms = true; 5198 } 5199 return found_consecutive_atoms; 5200 } 5201 5202 5203 // Optimizes ab|ac|az to a(?:b|c|d). 5204 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 5205 Zone* zone = compiler->zone(); 5206 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5207 int length = alternatives->length(); 5208 5209 int write_posn = 0; 5210 int i = 0; 5211 while (i < length) { 5212 RegExpTree* alternative = alternatives->at(i); 5213 if (!alternative->IsAtom()) { 5214 alternatives->at(write_posn++) = alternatives->at(i); 5215 i++; 5216 continue; 5217 } 5218 RegExpAtom* const atom = alternative->AsAtom(); 5219 JSRegExp::Flags flags = atom->flags(); 5220 unibrow::uchar common_prefix = atom->data().at(0); 5221 int first_with_prefix = i; 5222 int prefix_length = atom->length(); 5223 i++; 5224 while (i < length) { 5225 alternative = alternatives->at(i); 5226 if (!alternative->IsAtom()) break; 5227 RegExpAtom* const atom = alternative->AsAtom(); 5228 if (atom->flags() != flags) break; 5229 unibrow::uchar new_prefix = atom->data().at(0); 5230 if (new_prefix != common_prefix) { 5231 if (!IgnoreCase(flags)) break; 5232 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5233 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5234 new_prefix = Canonical(canonicalize, new_prefix); 5235 common_prefix = Canonical(canonicalize, common_prefix); 5236 if (new_prefix != common_prefix) break; 5237 } 5238 prefix_length = Min(prefix_length, atom->length()); 5239 i++; 5240 } 5241 if (i > first_with_prefix + 2) { 5242 // Found worthwhile run of alternatives with common prefix of at least one 5243 // character. The sorting function above did not sort on more than one 5244 // character for reasons of correctness, but there may still be a longer 5245 // common prefix if the terms were similar or presorted in the input. 5246 // Find out how long the common prefix is. 5247 int run_length = i - first_with_prefix; 5248 RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom(); 5249 for (int j = 1; j < run_length && prefix_length > 1; j++) { 5250 RegExpAtom* old_atom = 5251 alternatives->at(j + first_with_prefix)->AsAtom(); 5252 for (int k = 1; k < prefix_length; k++) { 5253 if (atom->data().at(k) != old_atom->data().at(k)) { 5254 prefix_length = k; 5255 break; 5256 } 5257 } 5258 } 5259 RegExpAtom* prefix = new (zone) 5260 RegExpAtom(atom->data().SubVector(0, prefix_length), flags); 5261 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 5262 pair->Add(prefix, zone); 5263 ZoneList<RegExpTree*>* suffixes = 5264 new (zone) ZoneList<RegExpTree*>(run_length, zone); 5265 for (int j = 0; j < run_length; j++) { 5266 RegExpAtom* old_atom = 5267 alternatives->at(j + first_with_prefix)->AsAtom(); 5268 int len = old_atom->length(); 5269 if (len == prefix_length) { 5270 suffixes->Add(new (zone) RegExpEmpty(), zone); 5271 } else { 5272 RegExpTree* suffix = new (zone) RegExpAtom( 5273 old_atom->data().SubVector(prefix_length, old_atom->length()), 5274 flags); 5275 suffixes->Add(suffix, zone); 5276 } 5277 } 5278 pair->Add(new (zone) RegExpDisjunction(suffixes), zone); 5279 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); 5280 } else { 5281 // Just copy any non-worthwhile alternatives. 5282 for (int j = first_with_prefix; j < i; j++) { 5283 alternatives->at(write_posn++) = alternatives->at(j); 5284 } 5285 } 5286 } 5287 alternatives->Rewind(write_posn); // Trim end of array. 5288 } 5289 5290 5291 // Optimizes b|c|z to [bcz]. 5292 void RegExpDisjunction::FixSingleCharacterDisjunctions( 5293 RegExpCompiler* compiler) { 5294 Zone* zone = compiler->zone(); 5295 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5296 int length = alternatives->length(); 5297 5298 int write_posn = 0; 5299 int i = 0; 5300 while (i < length) { 5301 RegExpTree* alternative = alternatives->at(i); 5302 if (!alternative->IsAtom()) { 5303 alternatives->at(write_posn++) = alternatives->at(i); 5304 i++; 5305 continue; 5306 } 5307 RegExpAtom* const atom = alternative->AsAtom(); 5308 if (atom->length() != 1) { 5309 alternatives->at(write_posn++) = alternatives->at(i); 5310 i++; 5311 continue; 5312 } 5313 JSRegExp::Flags flags = atom->flags(); 5314 DCHECK_IMPLIES(IsUnicode(flags), 5315 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); 5316 bool contains_trail_surrogate = 5317 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); 5318 int first_in_run = i; 5319 i++; 5320 // Find a run of single-character atom alternatives that have identical 5321 // flags (case independence and unicode-ness). 5322 while (i < length) { 5323 alternative = alternatives->at(i); 5324 if (!alternative->IsAtom()) break; 5325 RegExpAtom* const atom = alternative->AsAtom(); 5326 if (atom->length() != 1) break; 5327 if (atom->flags() != flags) break; 5328 DCHECK_IMPLIES(IsUnicode(flags), 5329 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); 5330 contains_trail_surrogate |= 5331 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); 5332 i++; 5333 } 5334 if (i > first_in_run + 1) { 5335 // Found non-trivial run of single-character alternatives. 5336 int run_length = i - first_in_run; 5337 ZoneList<CharacterRange>* ranges = 5338 new (zone) ZoneList<CharacterRange>(2, zone); 5339 for (int j = 0; j < run_length; j++) { 5340 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 5341 DCHECK_EQ(old_atom->length(), 1); 5342 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 5343 } 5344 RegExpCharacterClass::CharacterClassFlags character_class_flags; 5345 if (IsUnicode(flags) && contains_trail_surrogate) { 5346 character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; 5347 } 5348 alternatives->at(write_posn++) = new (zone) 5349 RegExpCharacterClass(zone, ranges, flags, character_class_flags); 5350 } else { 5351 // Just copy any trivial alternatives. 5352 for (int j = first_in_run; j < i; j++) { 5353 alternatives->at(write_posn++) = alternatives->at(j); 5354 } 5355 } 5356 } 5357 alternatives->Rewind(write_posn); // Trim end of array. 5358 } 5359 5360 5361 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 5362 RegExpNode* on_success) { 5363 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5364 5365 if (alternatives->length() > 2) { 5366 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 5367 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 5368 FixSingleCharacterDisjunctions(compiler); 5369 if (alternatives->length() == 1) { 5370 return alternatives->at(0)->ToNode(compiler, on_success); 5371 } 5372 } 5373 5374 int length = alternatives->length(); 5375 5376 ChoiceNode* result = 5377 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5378 for (int i = 0; i < length; i++) { 5379 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5380 on_success)); 5381 result->AddAlternative(alternative); 5382 } 5383 return result; 5384 } 5385 5386 5387 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 5388 RegExpNode* on_success) { 5389 return ToNode(min(), 5390 max(), 5391 is_greedy(), 5392 body(), 5393 compiler, 5394 on_success); 5395 } 5396 5397 5398 // Scoped object to keep track of how much we unroll quantifier loops in the 5399 // regexp graph generator. 5400 class RegExpExpansionLimiter { 5401 public: 5402 static const int kMaxExpansionFactor = 6; 5403 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 5404 : compiler_(compiler), 5405 saved_expansion_factor_(compiler->current_expansion_factor()), 5406 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 5407 DCHECK_LT(0, factor); 5408 if (ok_to_expand_) { 5409 if (factor > kMaxExpansionFactor) { 5410 // Avoid integer overflow of the current expansion factor. 5411 ok_to_expand_ = false; 5412 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 5413 } else { 5414 int new_factor = saved_expansion_factor_ * factor; 5415 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 5416 compiler->set_current_expansion_factor(new_factor); 5417 } 5418 } 5419 } 5420 5421 ~RegExpExpansionLimiter() { 5422 compiler_->set_current_expansion_factor(saved_expansion_factor_); 5423 } 5424 5425 bool ok_to_expand() { return ok_to_expand_; } 5426 5427 private: 5428 RegExpCompiler* compiler_; 5429 int saved_expansion_factor_; 5430 bool ok_to_expand_; 5431 5432 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 5433 }; 5434 5435 5436 RegExpNode* RegExpQuantifier::ToNode(int min, 5437 int max, 5438 bool is_greedy, 5439 RegExpTree* body, 5440 RegExpCompiler* compiler, 5441 RegExpNode* on_success, 5442 bool not_at_start) { 5443 // x{f, t} becomes this: 5444 // 5445 // (r++)<-. 5446 // | ` 5447 // | (x) 5448 // v ^ 5449 // (r=0)-->(?)---/ [if r < t] 5450 // | 5451 // [if r >= f] \----> ... 5452 // 5453 5454 // 15.10.2.5 RepeatMatcher algorithm. 5455 // The parser has already eliminated the case where max is 0. In the case 5456 // where max_match is zero the parser has removed the quantifier if min was 5457 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 5458 5459 // If we know that we cannot match zero length then things are a little 5460 // simpler since we don't need to make the special zero length match check 5461 // from step 2.1. If the min and max are small we can unroll a little in 5462 // this case. 5463 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 5464 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 5465 if (max == 0) return on_success; // This can happen due to recursion. 5466 bool body_can_be_empty = (body->min_match() == 0); 5467 int body_start_reg = RegExpCompiler::kNoRegister; 5468 Interval capture_registers = body->CaptureRegisters(); 5469 bool needs_capture_clearing = !capture_registers.is_empty(); 5470 Zone* zone = compiler->zone(); 5471 5472 if (body_can_be_empty) { 5473 body_start_reg = compiler->AllocateRegister(); 5474 } else if (compiler->optimize() && !needs_capture_clearing) { 5475 // Only unroll if there are no captures and the body can't be 5476 // empty. 5477 { 5478 RegExpExpansionLimiter limiter( 5479 compiler, min + ((max != min) ? 1 : 0)); 5480 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 5481 int new_max = (max == kInfinity) ? max : max - min; 5482 // Recurse once to get the loop or optional matches after the fixed 5483 // ones. 5484 RegExpNode* answer = ToNode( 5485 0, new_max, is_greedy, body, compiler, on_success, true); 5486 // Unroll the forced matches from 0 to min. This can cause chains of 5487 // TextNodes (which the parser does not generate). These should be 5488 // combined if it turns out they hinder good code generation. 5489 for (int i = 0; i < min; i++) { 5490 answer = body->ToNode(compiler, answer); 5491 } 5492 return answer; 5493 } 5494 } 5495 if (max <= kMaxUnrolledMaxMatches && min == 0) { 5496 DCHECK_LT(0, max); // Due to the 'if' above. 5497 RegExpExpansionLimiter limiter(compiler, max); 5498 if (limiter.ok_to_expand()) { 5499 // Unroll the optional matches up to max. 5500 RegExpNode* answer = on_success; 5501 for (int i = 0; i < max; i++) { 5502 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 5503 if (is_greedy) { 5504 alternation->AddAlternative( 5505 GuardedAlternative(body->ToNode(compiler, answer))); 5506 alternation->AddAlternative(GuardedAlternative(on_success)); 5507 } else { 5508 alternation->AddAlternative(GuardedAlternative(on_success)); 5509 alternation->AddAlternative( 5510 GuardedAlternative(body->ToNode(compiler, answer))); 5511 } 5512 answer = alternation; 5513 if (not_at_start && !compiler->read_backward()) { 5514 alternation->set_not_at_start(); 5515 } 5516 } 5517 return answer; 5518 } 5519 } 5520 } 5521 bool has_min = min > 0; 5522 bool has_max = max < RegExpTree::kInfinity; 5523 bool needs_counter = has_min || has_max; 5524 int reg_ctr = needs_counter 5525 ? compiler->AllocateRegister() 5526 : RegExpCompiler::kNoRegister; 5527 LoopChoiceNode* center = new (zone) 5528 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone); 5529 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 5530 RegExpNode* loop_return = needs_counter 5531 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 5532 : static_cast<RegExpNode*>(center); 5533 if (body_can_be_empty) { 5534 // If the body can be empty we need to check if it was and then 5535 // backtrack. 5536 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 5537 reg_ctr, 5538 min, 5539 loop_return); 5540 } 5541 RegExpNode* body_node = body->ToNode(compiler, loop_return); 5542 if (body_can_be_empty) { 5543 // If the body can be empty we need to store the start position 5544 // so we can bail out if it was empty. 5545 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 5546 } 5547 if (needs_capture_clearing) { 5548 // Before entering the body of this loop we need to clear captures. 5549 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 5550 } 5551 GuardedAlternative body_alt(body_node); 5552 if (has_max) { 5553 Guard* body_guard = 5554 new(zone) Guard(reg_ctr, Guard::LT, max); 5555 body_alt.AddGuard(body_guard, zone); 5556 } 5557 GuardedAlternative rest_alt(on_success); 5558 if (has_min) { 5559 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 5560 rest_alt.AddGuard(rest_guard, zone); 5561 } 5562 if (is_greedy) { 5563 center->AddLoopAlternative(body_alt); 5564 center->AddContinueAlternative(rest_alt); 5565 } else { 5566 center->AddContinueAlternative(rest_alt); 5567 center->AddLoopAlternative(body_alt); 5568 } 5569 if (needs_counter) { 5570 return ActionNode::SetRegister(reg_ctr, 0, center); 5571 } else { 5572 return center; 5573 } 5574 } 5575 5576 namespace { 5577 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and 5578 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) 5579 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, 5580 RegExpNode* on_success, 5581 RegExpAssertion::AssertionType type, 5582 JSRegExp::Flags flags) { 5583 DCHECK(NeedsUnicodeCaseEquivalents(flags)); 5584 Zone* zone = compiler->zone(); 5585 ZoneList<CharacterRange>* word_range = 5586 new (zone) ZoneList<CharacterRange>(2, zone); 5587 CharacterRange::AddClassEscape('w', word_range, true, zone); 5588 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5589 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5590 ChoiceNode* result = new (zone) ChoiceNode(2, zone); 5591 // Add two choices. The (non-)boundary could start with a word or 5592 // a non-word-character. 5593 for (int i = 0; i < 2; i++) { 5594 bool lookbehind_for_word = i == 0; 5595 bool lookahead_for_word = 5596 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; 5597 // Look to the left. 5598 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, 5599 stack_register, position_register); 5600 RegExpNode* backward = TextNode::CreateForCharacterRanges( 5601 zone, word_range, true, lookbehind.on_match_success(), flags); 5602 // Look to the right. 5603 RegExpLookaround::Builder lookahead(lookahead_for_word, 5604 lookbehind.ForMatch(backward), 5605 stack_register, position_register); 5606 RegExpNode* forward = TextNode::CreateForCharacterRanges( 5607 zone, word_range, false, lookahead.on_match_success(), flags); 5608 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); 5609 } 5610 return result; 5611 } 5612 } // anonymous namespace 5613 5614 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5615 RegExpNode* on_success) { 5616 NodeInfo info; 5617 Zone* zone = compiler->zone(); 5618 5619 switch (assertion_type()) { 5620 case START_OF_LINE: 5621 return AssertionNode::AfterNewline(on_success); 5622 case START_OF_INPUT: 5623 return AssertionNode::AtStart(on_success); 5624 case BOUNDARY: 5625 return NeedsUnicodeCaseEquivalents(flags_) 5626 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, 5627 flags_) 5628 : AssertionNode::AtBoundary(on_success); 5629 case NON_BOUNDARY: 5630 return NeedsUnicodeCaseEquivalents(flags_) 5631 ? BoundaryAssertionAsLookaround(compiler, on_success, 5632 NON_BOUNDARY, flags_) 5633 : AssertionNode::AtNonBoundary(on_success); 5634 case END_OF_INPUT: 5635 return AssertionNode::AtEnd(on_success); 5636 case END_OF_LINE: { 5637 // Compile $ in multiline regexps as an alternation with a positive 5638 // lookahead in one side and an end-of-input on the other side. 5639 // We need two registers for the lookahead. 5640 int stack_pointer_register = compiler->AllocateRegister(); 5641 int position_register = compiler->AllocateRegister(); 5642 // The ChoiceNode to distinguish between a newline and end-of-input. 5643 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5644 // Create a newline atom. 5645 ZoneList<CharacterRange>* newline_ranges = 5646 new(zone) ZoneList<CharacterRange>(3, zone); 5647 CharacterRange::AddClassEscape('n', newline_ranges, false, zone); 5648 JSRegExp::Flags default_flags = JSRegExp::Flags(); 5649 RegExpCharacterClass* newline_atom = 5650 new (zone) RegExpCharacterClass('n', default_flags); 5651 TextNode* newline_matcher = new (zone) TextNode( 5652 newline_atom, false, ActionNode::PositiveSubmatchSuccess( 5653 stack_pointer_register, position_register, 5654 0, // No captures inside. 5655 -1, // Ignored if no captures. 5656 on_success)); 5657 // Create an end-of-input matcher. 5658 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5659 stack_pointer_register, 5660 position_register, 5661 newline_matcher); 5662 // Add the two alternatives to the ChoiceNode. 5663 GuardedAlternative eol_alternative(end_of_line); 5664 result->AddAlternative(eol_alternative); 5665 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5666 result->AddAlternative(end_alternative); 5667 return result; 5668 } 5669 default: 5670 UNREACHABLE(); 5671 } 5672 return on_success; 5673 } 5674 5675 5676 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5677 RegExpNode* on_success) { 5678 return new (compiler->zone()) 5679 BackReferenceNode(RegExpCapture::StartRegister(index()), 5680 RegExpCapture::EndRegister(index()), flags_, 5681 compiler->read_backward(), on_success); 5682 } 5683 5684 5685 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5686 RegExpNode* on_success) { 5687 return on_success; 5688 } 5689 5690 5691 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 5692 int stack_pointer_register, 5693 int position_register, 5694 int capture_register_count, 5695 int capture_register_start) 5696 : is_positive_(is_positive), 5697 on_success_(on_success), 5698 stack_pointer_register_(stack_pointer_register), 5699 position_register_(position_register) { 5700 if (is_positive_) { 5701 on_match_success_ = ActionNode::PositiveSubmatchSuccess( 5702 stack_pointer_register, position_register, capture_register_count, 5703 capture_register_start, on_success_); 5704 } else { 5705 Zone* zone = on_success_->zone(); 5706 on_match_success_ = new (zone) NegativeSubmatchSuccess( 5707 stack_pointer_register, position_register, capture_register_count, 5708 capture_register_start, zone); 5709 } 5710 } 5711 5712 5713 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 5714 if (is_positive_) { 5715 return ActionNode::BeginSubmatch(stack_pointer_register_, 5716 position_register_, match); 5717 } else { 5718 Zone* zone = on_success_->zone(); 5719 // We use a ChoiceNode to represent the negative lookaround. The first 5720 // alternative is the negative match. On success, the end node backtracks. 5721 // On failure, the second alternative is tried and leads to success. 5722 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the 5723 // first exit when calculating quick checks. 5724 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( 5725 GuardedAlternative(match), GuardedAlternative(on_success_), zone); 5726 return ActionNode::BeginSubmatch(stack_pointer_register_, 5727 position_register_, choice_node); 5728 } 5729 } 5730 5731 5732 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 5733 RegExpNode* on_success) { 5734 int stack_pointer_register = compiler->AllocateRegister(); 5735 int position_register = compiler->AllocateRegister(); 5736 5737 const int registers_per_capture = 2; 5738 const int register_of_first_capture = 2; 5739 int register_count = capture_count_ * registers_per_capture; 5740 int register_start = 5741 register_of_first_capture + capture_from_ * registers_per_capture; 5742 5743 RegExpNode* result; 5744 bool was_reading_backward = compiler->read_backward(); 5745 compiler->set_read_backward(type() == LOOKBEHIND); 5746 Builder builder(is_positive(), on_success, stack_pointer_register, 5747 position_register, register_count, register_start); 5748 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 5749 result = builder.ForMatch(match); 5750 compiler->set_read_backward(was_reading_backward); 5751 return result; 5752 } 5753 5754 5755 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5756 RegExpNode* on_success) { 5757 return ToNode(body(), index(), compiler, on_success); 5758 } 5759 5760 5761 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5762 int index, 5763 RegExpCompiler* compiler, 5764 RegExpNode* on_success) { 5765 DCHECK_NOT_NULL(body); 5766 int start_reg = RegExpCapture::StartRegister(index); 5767 int end_reg = RegExpCapture::EndRegister(index); 5768 if (compiler->read_backward()) std::swap(start_reg, end_reg); 5769 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5770 RegExpNode* body_node = body->ToNode(compiler, store_end); 5771 return ActionNode::StorePosition(start_reg, true, body_node); 5772 } 5773 5774 5775 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5776 RegExpNode* on_success) { 5777 ZoneList<RegExpTree*>* children = nodes(); 5778 RegExpNode* current = on_success; 5779 if (compiler->read_backward()) { 5780 for (int i = 0; i < children->length(); i++) { 5781 current = children->at(i)->ToNode(compiler, current); 5782 } 5783 } else { 5784 for (int i = children->length() - 1; i >= 0; i--) { 5785 current = children->at(i)->ToNode(compiler, current); 5786 } 5787 } 5788 return current; 5789 } 5790 5791 5792 static void AddClass(const int* elmv, 5793 int elmc, 5794 ZoneList<CharacterRange>* ranges, 5795 Zone* zone) { 5796 elmc--; 5797 DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 5798 for (int i = 0; i < elmc; i += 2) { 5799 DCHECK(elmv[i] < elmv[i + 1]); 5800 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 5801 } 5802 } 5803 5804 5805 static void AddClassNegated(const int *elmv, 5806 int elmc, 5807 ZoneList<CharacterRange>* ranges, 5808 Zone* zone) { 5809 elmc--; 5810 DCHECK_EQ(kRangeEndMarker, elmv[elmc]); 5811 DCHECK_NE(0x0000, elmv[0]); 5812 DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]); 5813 uc16 last = 0x0000; 5814 for (int i = 0; i < elmc; i += 2) { 5815 DCHECK(last <= elmv[i] - 1); 5816 DCHECK(elmv[i] < elmv[i + 1]); 5817 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 5818 last = elmv[i + 1]; 5819 } 5820 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); 5821 } 5822 5823 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges, 5824 bool add_unicode_case_equivalents, 5825 Zone* zone) { 5826 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { 5827 // See #sec-runtime-semantics-wordcharacters-abstract-operation 5828 // In case of unicode and ignore_case, we need to create the closure over 5829 // case equivalent characters before negating. 5830 ZoneList<CharacterRange>* new_ranges = 5831 new (zone) ZoneList<CharacterRange>(2, zone); 5832 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); 5833 AddUnicodeCaseEquivalents(new_ranges, zone); 5834 if (type == 'W') { 5835 ZoneList<CharacterRange>* negated = 5836 new (zone) ZoneList<CharacterRange>(2, zone); 5837 CharacterRange::Negate(new_ranges, negated, zone); 5838 new_ranges = negated; 5839 } 5840 ranges->AddAll(*new_ranges, zone); 5841 return; 5842 } 5843 AddClassEscape(type, ranges, zone); 5844 } 5845 5846 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges, 5847 Zone* zone) { 5848 switch (type) { 5849 case 's': 5850 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5851 break; 5852 case 'S': 5853 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5854 break; 5855 case 'w': 5856 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5857 break; 5858 case 'W': 5859 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5860 break; 5861 case 'd': 5862 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5863 break; 5864 case 'D': 5865 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5866 break; 5867 case '.': 5868 AddClassNegated(kLineTerminatorRanges, 5869 kLineTerminatorRangeCount, 5870 ranges, 5871 zone); 5872 break; 5873 // This is not a character range as defined by the spec but a 5874 // convenient shorthand for a character class that matches any 5875 // character. 5876 case '*': 5877 ranges->Add(CharacterRange::Everything(), zone); 5878 break; 5879 // This is the set of characters matched by the $ and ^ symbols 5880 // in multiline mode. 5881 case 'n': 5882 AddClass(kLineTerminatorRanges, 5883 kLineTerminatorRangeCount, 5884 ranges, 5885 zone); 5886 break; 5887 default: 5888 UNREACHABLE(); 5889 } 5890 } 5891 5892 5893 Vector<const int> CharacterRange::GetWordBounds() { 5894 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5895 } 5896 5897 // static 5898 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 5899 ZoneList<CharacterRange>* ranges, 5900 bool is_one_byte) { 5901 CharacterRange::Canonicalize(ranges); 5902 int range_count = ranges->length(); 5903 for (int i = 0; i < range_count; i++) { 5904 CharacterRange range = ranges->at(i); 5905 uc32 bottom = range.from(); 5906 if (bottom > String::kMaxUtf16CodeUnit) continue; 5907 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); 5908 // Nothing to be done for surrogates. 5909 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; 5910 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 5911 if (bottom > String::kMaxOneByteCharCode) continue; 5912 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5913 } 5914 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5915 if (top == bottom) { 5916 // If this is a singleton we just expand the one character. 5917 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5918 for (int i = 0; i < length; i++) { 5919 uc32 chr = chars[i]; 5920 if (chr != bottom) { 5921 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5922 } 5923 } 5924 } else { 5925 // If this is a range we expand the characters block by block, expanding 5926 // contiguous subranges (blocks) one at a time. The approach is as 5927 // follows. For a given start character we look up the remainder of the 5928 // block that contains it (represented by the end point), for instance we 5929 // find 'z' if the character is 'c'. A block is characterized by the 5930 // property that all characters uncanonicalize in the same way, except 5931 // that each entry in the result is incremented by the distance from the 5932 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 5933 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 5934 // we've found the end point we look up its uncanonicalization and 5935 // produce a range for each element. For instance for [c-f] we look up 5936 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 5937 // it is not already contained in the input, so [c-f] will be skipped but 5938 // [C-F] will be added. If this range is not completely contained in a 5939 // block we do this for all the blocks covered by the range (handling 5940 // characters that is not in a block as a "singleton block"). 5941 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5942 int pos = bottom; 5943 while (pos <= top) { 5944 int length = 5945 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 5946 uc32 block_end; 5947 if (length == 0) { 5948 block_end = pos; 5949 } else { 5950 DCHECK_EQ(1, length); 5951 block_end = equivalents[0]; 5952 } 5953 int end = (block_end > top) ? top : block_end; 5954 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 5955 equivalents); 5956 for (int i = 0; i < length; i++) { 5957 uc32 c = equivalents[i]; 5958 uc32 range_from = c - (block_end - pos); 5959 uc32 range_to = c - (block_end - end); 5960 if (!(bottom <= range_from && range_to <= top)) { 5961 ranges->Add(CharacterRange::Range(range_from, range_to), zone); 5962 } 5963 } 5964 pos = end + 1; 5965 } 5966 } 5967 } 5968 } 5969 5970 5971 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5972 DCHECK_NOT_NULL(ranges); 5973 int n = ranges->length(); 5974 if (n <= 1) return true; 5975 int max = ranges->at(0).to(); 5976 for (int i = 1; i < n; i++) { 5977 CharacterRange next_range = ranges->at(i); 5978 if (next_range.from() <= max + 1) return false; 5979 max = next_range.to(); 5980 } 5981 return true; 5982 } 5983 5984 5985 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5986 if (ranges_ == nullptr) { 5987 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5988 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); 5989 } 5990 return ranges_; 5991 } 5992 5993 5994 // Move a number of elements in a zonelist to another position 5995 // in the same list. Handles overlapping source and target areas. 5996 static void MoveRanges(ZoneList<CharacterRange>* list, 5997 int from, 5998 int to, 5999 int count) { 6000 // Ranges are potentially overlapping. 6001 if (from < to) { 6002 for (int i = count - 1; i >= 0; i--) { 6003 list->at(to + i) = list->at(from + i); 6004 } 6005 } else { 6006 for (int i = 0; i < count; i++) { 6007 list->at(to + i) = list->at(from + i); 6008 } 6009 } 6010 } 6011 6012 6013 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 6014 int count, 6015 CharacterRange insert) { 6016 // Inserts a range into list[0..count[, which must be sorted 6017 // by from value and non-overlapping and non-adjacent, using at most 6018 // list[0..count] for the result. Returns the number of resulting 6019 // canonicalized ranges. Inserting a range may collapse existing ranges into 6020 // fewer ranges, so the return value can be anything in the range 1..count+1. 6021 uc32 from = insert.from(); 6022 uc32 to = insert.to(); 6023 int start_pos = 0; 6024 int end_pos = count; 6025 for (int i = count - 1; i >= 0; i--) { 6026 CharacterRange current = list->at(i); 6027 if (current.from() > to + 1) { 6028 end_pos = i; 6029 } else if (current.to() + 1 < from) { 6030 start_pos = i + 1; 6031 break; 6032 } 6033 } 6034 6035 // Inserted range overlaps, or is adjacent to, ranges at positions 6036 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 6037 // not affected by the insertion. 6038 // If start_pos == end_pos, the range must be inserted before start_pos. 6039 // if start_pos < end_pos, the entire range from start_pos to end_pos 6040 // must be merged with the insert range. 6041 6042 if (start_pos == end_pos) { 6043 // Insert between existing ranges at position start_pos. 6044 if (start_pos < count) { 6045 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 6046 } 6047 list->at(start_pos) = insert; 6048 return count + 1; 6049 } 6050 if (start_pos + 1 == end_pos) { 6051 // Replace single existing range at position start_pos. 6052 CharacterRange to_replace = list->at(start_pos); 6053 int new_from = Min(to_replace.from(), from); 6054 int new_to = Max(to_replace.to(), to); 6055 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6056 return count; 6057 } 6058 // Replace a number of existing ranges from start_pos to end_pos - 1. 6059 // Move the remaining ranges down. 6060 6061 int new_from = Min(list->at(start_pos).from(), from); 6062 int new_to = Max(list->at(end_pos - 1).to(), to); 6063 if (end_pos < count) { 6064 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 6065 } 6066 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6067 return count - (end_pos - start_pos) + 1; 6068 } 6069 6070 6071 void CharacterSet::Canonicalize() { 6072 // Special/default classes are always considered canonical. The result 6073 // of calling ranges() will be sorted. 6074 if (ranges_ == nullptr) return; 6075 CharacterRange::Canonicalize(ranges_); 6076 } 6077 6078 6079 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 6080 if (character_ranges->length() <= 1) return; 6081 // Check whether ranges are already canonical (increasing, non-overlapping, 6082 // non-adjacent). 6083 int n = character_ranges->length(); 6084 int max = character_ranges->at(0).to(); 6085 int i = 1; 6086 while (i < n) { 6087 CharacterRange current = character_ranges->at(i); 6088 if (current.from() <= max + 1) { 6089 break; 6090 } 6091 max = current.to(); 6092 i++; 6093 } 6094 // Canonical until the i'th range. If that's all of them, we are done. 6095 if (i == n) return; 6096 6097 // The ranges at index i and forward are not canonicalized. Make them so by 6098 // doing the equivalent of insertion sort (inserting each into the previous 6099 // list, in order). 6100 // Notice that inserting a range can reduce the number of ranges in the 6101 // result due to combining of adjacent and overlapping ranges. 6102 int read = i; // Range to insert. 6103 int num_canonical = i; // Length of canonicalized part of list. 6104 do { 6105 num_canonical = InsertRangeInCanonicalList(character_ranges, 6106 num_canonical, 6107 character_ranges->at(read)); 6108 read++; 6109 } while (read < n); 6110 character_ranges->Rewind(num_canonical); 6111 6112 DCHECK(CharacterRange::IsCanonical(character_ranges)); 6113 } 6114 6115 6116 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 6117 ZoneList<CharacterRange>* negated_ranges, 6118 Zone* zone) { 6119 DCHECK(CharacterRange::IsCanonical(ranges)); 6120 DCHECK_EQ(0, negated_ranges->length()); 6121 int range_count = ranges->length(); 6122 uc32 from = 0; 6123 int i = 0; 6124 if (range_count > 0 && ranges->at(0).from() == 0) { 6125 from = ranges->at(0).to() + 1; 6126 i = 1; 6127 } 6128 while (i < range_count) { 6129 CharacterRange range = ranges->at(i); 6130 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 6131 from = range.to() + 1; 6132 i++; 6133 } 6134 if (from < String::kMaxCodePoint) { 6135 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), 6136 zone); 6137 } 6138 } 6139 6140 6141 // ------------------------------------------------------------------- 6142 // Splay tree 6143 6144 6145 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 6146 if (Get(value)) 6147 return this; 6148 if (successors(zone) != nullptr) { 6149 for (int i = 0; i < successors(zone)->length(); i++) { 6150 OutSet* successor = successors(zone)->at(i); 6151 if (successor->Get(value)) 6152 return successor; 6153 } 6154 } else { 6155 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 6156 } 6157 OutSet* result = new(zone) OutSet(first_, remaining_); 6158 result->Set(value, zone); 6159 successors(zone)->Add(result, zone); 6160 return result; 6161 } 6162 6163 6164 void OutSet::Set(unsigned value, Zone *zone) { 6165 if (value < kFirstLimit) { 6166 first_ |= (1 << value); 6167 } else { 6168 if (remaining_ == nullptr) 6169 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 6170 if (remaining_->is_empty() || !remaining_->Contains(value)) 6171 remaining_->Add(value, zone); 6172 } 6173 } 6174 6175 6176 bool OutSet::Get(unsigned value) const { 6177 if (value < kFirstLimit) { 6178 return (first_ & (1 << value)) != 0; 6179 } else if (remaining_ == nullptr) { 6180 return false; 6181 } else { 6182 return remaining_->Contains(value); 6183 } 6184 } 6185 6186 6187 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 6188 6189 6190 void DispatchTable::AddRange(CharacterRange full_range, int value, 6191 Zone* zone) { 6192 CharacterRange current = full_range; 6193 if (tree()->is_empty()) { 6194 // If this is the first range we just insert into the table. 6195 ZoneSplayTree<Config>::Locator loc; 6196 bool inserted = tree()->Insert(current.from(), &loc); 6197 DCHECK(inserted); 6198 USE(inserted); 6199 loc.set_value(Entry(current.from(), current.to(), 6200 empty()->Extend(value, zone))); 6201 return; 6202 } 6203 // First see if there is a range to the left of this one that 6204 // overlaps. 6205 ZoneSplayTree<Config>::Locator loc; 6206 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 6207 Entry* entry = &loc.value(); 6208 // If we've found a range that overlaps with this one, and it 6209 // starts strictly to the left of this one, we have to fix it 6210 // because the following code only handles ranges that start on 6211 // or after the start point of the range we're adding. 6212 if (entry->from() < current.from() && entry->to() >= current.from()) { 6213 // Snap the overlapping range in half around the start point of 6214 // the range we're adding. 6215 CharacterRange left = 6216 CharacterRange::Range(entry->from(), current.from() - 1); 6217 CharacterRange right = CharacterRange::Range(current.from(), entry->to()); 6218 // The left part of the overlapping range doesn't overlap. 6219 // Truncate the whole entry to be just the left part. 6220 entry->set_to(left.to()); 6221 // The right part is the one that overlaps. We add this part 6222 // to the map and let the next step deal with merging it with 6223 // the range we're adding. 6224 ZoneSplayTree<Config>::Locator loc; 6225 bool inserted = tree()->Insert(right.from(), &loc); 6226 DCHECK(inserted); 6227 USE(inserted); 6228 loc.set_value(Entry(right.from(), 6229 right.to(), 6230 entry->out_set())); 6231 } 6232 } 6233 while (current.is_valid()) { 6234 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 6235 (loc.value().from() <= current.to()) && 6236 (loc.value().to() >= current.from())) { 6237 Entry* entry = &loc.value(); 6238 // We have overlap. If there is space between the start point of 6239 // the range we're adding and where the overlapping range starts 6240 // then we have to add a range covering just that space. 6241 if (current.from() < entry->from()) { 6242 ZoneSplayTree<Config>::Locator ins; 6243 bool inserted = tree()->Insert(current.from(), &ins); 6244 DCHECK(inserted); 6245 USE(inserted); 6246 ins.set_value(Entry(current.from(), 6247 entry->from() - 1, 6248 empty()->Extend(value, zone))); 6249 current.set_from(entry->from()); 6250 } 6251 DCHECK_EQ(current.from(), entry->from()); 6252 // If the overlapping range extends beyond the one we want to add 6253 // we have to snap the right part off and add it separately. 6254 if (entry->to() > current.to()) { 6255 ZoneSplayTree<Config>::Locator ins; 6256 bool inserted = tree()->Insert(current.to() + 1, &ins); 6257 DCHECK(inserted); 6258 USE(inserted); 6259 ins.set_value(Entry(current.to() + 1, 6260 entry->to(), 6261 entry->out_set())); 6262 entry->set_to(current.to()); 6263 } 6264 DCHECK(entry->to() <= current.to()); 6265 // The overlapping range is now completely contained by the range 6266 // we're adding so we can just update it and move the start point 6267 // of the range we're adding just past it. 6268 entry->AddValue(value, zone); 6269 DCHECK(entry->to() + 1 > current.from()); 6270 current.set_from(entry->to() + 1); 6271 } else { 6272 // There is no overlap so we can just add the range 6273 ZoneSplayTree<Config>::Locator ins; 6274 bool inserted = tree()->Insert(current.from(), &ins); 6275 DCHECK(inserted); 6276 USE(inserted); 6277 ins.set_value(Entry(current.from(), 6278 current.to(), 6279 empty()->Extend(value, zone))); 6280 break; 6281 } 6282 } 6283 } 6284 6285 6286 OutSet* DispatchTable::Get(uc32 value) { 6287 ZoneSplayTree<Config>::Locator loc; 6288 if (!tree()->FindGreatestLessThan(value, &loc)) 6289 return empty(); 6290 Entry* entry = &loc.value(); 6291 if (value <= entry->to()) 6292 return entry->out_set(); 6293 else 6294 return empty(); 6295 } 6296 6297 6298 // ------------------------------------------------------------------- 6299 // Analysis 6300 6301 6302 void Analysis::EnsureAnalyzed(RegExpNode* that) { 6303 StackLimitCheck check(isolate()); 6304 if (check.HasOverflowed()) { 6305 fail("Stack overflow"); 6306 return; 6307 } 6308 if (that->info()->been_analyzed || that->info()->being_analyzed) 6309 return; 6310 that->info()->being_analyzed = true; 6311 that->Accept(this); 6312 that->info()->being_analyzed = false; 6313 that->info()->been_analyzed = true; 6314 } 6315 6316 6317 void Analysis::VisitEnd(EndNode* that) { 6318 // nothing to do 6319 } 6320 6321 6322 void TextNode::CalculateOffsets() { 6323 int element_count = elements()->length(); 6324 // Set up the offsets of the elements relative to the start. This is a fixed 6325 // quantity since a TextNode can only contain fixed-width things. 6326 int cp_offset = 0; 6327 for (int i = 0; i < element_count; i++) { 6328 TextElement& elm = elements()->at(i); 6329 elm.set_cp_offset(cp_offset); 6330 cp_offset += elm.length(); 6331 } 6332 } 6333 6334 6335 void Analysis::VisitText(TextNode* that) { 6336 that->MakeCaseIndependent(isolate(), is_one_byte_); 6337 EnsureAnalyzed(that->on_success()); 6338 if (!has_failed()) { 6339 that->CalculateOffsets(); 6340 } 6341 } 6342 6343 6344 void Analysis::VisitAction(ActionNode* that) { 6345 RegExpNode* target = that->on_success(); 6346 EnsureAnalyzed(target); 6347 if (!has_failed()) { 6348 // If the next node is interested in what it follows then this node 6349 // has to be interested too so it can pass the information on. 6350 that->info()->AddFromFollowing(target->info()); 6351 } 6352 } 6353 6354 6355 void Analysis::VisitChoice(ChoiceNode* that) { 6356 NodeInfo* info = that->info(); 6357 for (int i = 0; i < that->alternatives()->length(); i++) { 6358 RegExpNode* node = that->alternatives()->at(i).node(); 6359 EnsureAnalyzed(node); 6360 if (has_failed()) return; 6361 // Anything the following nodes need to know has to be known by 6362 // this node also, so it can pass it on. 6363 info->AddFromFollowing(node->info()); 6364 } 6365 } 6366 6367 6368 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 6369 NodeInfo* info = that->info(); 6370 for (int i = 0; i < that->alternatives()->length(); i++) { 6371 RegExpNode* node = that->alternatives()->at(i).node(); 6372 if (node != that->loop_node()) { 6373 EnsureAnalyzed(node); 6374 if (has_failed()) return; 6375 info->AddFromFollowing(node->info()); 6376 } 6377 } 6378 // Check the loop last since it may need the value of this node 6379 // to get a correct result. 6380 EnsureAnalyzed(that->loop_node()); 6381 if (!has_failed()) { 6382 info->AddFromFollowing(that->loop_node()->info()); 6383 } 6384 } 6385 6386 6387 void Analysis::VisitBackReference(BackReferenceNode* that) { 6388 EnsureAnalyzed(that->on_success()); 6389 } 6390 6391 6392 void Analysis::VisitAssertion(AssertionNode* that) { 6393 EnsureAnalyzed(that->on_success()); 6394 } 6395 6396 6397 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6398 BoyerMooreLookahead* bm, 6399 bool not_at_start) { 6400 // Working out the set of characters that a backreference can match is too 6401 // hard, so we just say that any character can match. 6402 bm->SetRest(offset); 6403 SaveBMInfo(bm, not_at_start, offset); 6404 } 6405 6406 6407 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6408 RegExpMacroAssembler::kTableSize); 6409 6410 6411 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6412 BoyerMooreLookahead* bm, bool not_at_start) { 6413 ZoneList<GuardedAlternative>* alts = alternatives(); 6414 budget = (budget - 1) / alts->length(); 6415 for (int i = 0; i < alts->length(); i++) { 6416 GuardedAlternative& alt = alts->at(i); 6417 if (alt.guards() != nullptr && alt.guards()->length() != 0) { 6418 bm->SetRest(offset); // Give up trying to fill in info. 6419 SaveBMInfo(bm, not_at_start, offset); 6420 return; 6421 } 6422 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6423 } 6424 SaveBMInfo(bm, not_at_start, offset); 6425 } 6426 6427 6428 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6429 BoyerMooreLookahead* bm, bool not_at_start) { 6430 if (initial_offset >= bm->length()) return; 6431 int offset = initial_offset; 6432 int max_char = bm->max_char(); 6433 for (int i = 0; i < elements()->length(); i++) { 6434 if (offset >= bm->length()) { 6435 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6436 return; 6437 } 6438 TextElement text = elements()->at(i); 6439 if (text.text_type() == TextElement::ATOM) { 6440 RegExpAtom* atom = text.atom(); 6441 for (int j = 0; j < atom->length(); j++, offset++) { 6442 if (offset >= bm->length()) { 6443 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6444 return; 6445 } 6446 uc16 character = atom->data()[j]; 6447 if (IgnoreCase(atom->flags())) { 6448 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6449 int length = GetCaseIndependentLetters( 6450 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 6451 chars); 6452 for (int j = 0; j < length; j++) { 6453 bm->Set(offset, chars[j]); 6454 } 6455 } else { 6456 if (character <= max_char) bm->Set(offset, character); 6457 } 6458 } 6459 } else { 6460 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6461 RegExpCharacterClass* char_class = text.char_class(); 6462 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6463 if (char_class->is_negated()) { 6464 bm->SetAll(offset); 6465 } else { 6466 for (int k = 0; k < ranges->length(); k++) { 6467 CharacterRange& range = ranges->at(k); 6468 if (range.from() > max_char) continue; 6469 int to = Min(max_char, static_cast<int>(range.to())); 6470 bm->SetInterval(offset, Interval(range.from(), to)); 6471 } 6472 } 6473 offset++; 6474 } 6475 } 6476 if (offset >= bm->length()) { 6477 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6478 return; 6479 } 6480 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6481 true); // Not at start after a text node. 6482 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6483 } 6484 6485 6486 // ------------------------------------------------------------------- 6487 // Dispatch table construction 6488 6489 6490 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6491 AddRange(CharacterRange::Everything()); 6492 } 6493 6494 6495 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 6496 node->set_being_calculated(true); 6497 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 6498 for (int i = 0; i < alternatives->length(); i++) { 6499 set_choice_index(i); 6500 alternatives->at(i).node()->Accept(this); 6501 } 6502 node->set_being_calculated(false); 6503 } 6504 6505 6506 class AddDispatchRange { 6507 public: 6508 explicit AddDispatchRange(DispatchTableConstructor* constructor) 6509 : constructor_(constructor) { } 6510 void Call(uc32 from, DispatchTable::Entry entry); 6511 private: 6512 DispatchTableConstructor* constructor_; 6513 }; 6514 6515 6516 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 6517 constructor_->AddRange(CharacterRange::Range(from, entry.to())); 6518 } 6519 6520 6521 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 6522 if (node->being_calculated()) 6523 return; 6524 DispatchTable* table = node->GetTable(ignore_case_); 6525 AddDispatchRange adder(this); 6526 table->ForEach(&adder); 6527 } 6528 6529 6530 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 6531 // TODO(160): Find the node that we refer back to and propagate its start 6532 // set back to here. For now we just accept anything. 6533 AddRange(CharacterRange::Everything()); 6534 } 6535 6536 6537 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 6538 RegExpNode* target = that->on_success(); 6539 target->Accept(this); 6540 } 6541 6542 6543 static int CompareRangeByFrom(const CharacterRange* a, 6544 const CharacterRange* b) { 6545 return Compare<uc16>(a->from(), b->from()); 6546 } 6547 6548 6549 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 6550 ranges->Sort(CompareRangeByFrom); 6551 uc16 last = 0; 6552 for (int i = 0; i < ranges->length(); i++) { 6553 CharacterRange range = ranges->at(i); 6554 if (last < range.from()) 6555 AddRange(CharacterRange::Range(last, range.from() - 1)); 6556 if (range.to() >= last) { 6557 if (range.to() == String::kMaxCodePoint) { 6558 return; 6559 } else { 6560 last = range.to() + 1; 6561 } 6562 } 6563 } 6564 AddRange(CharacterRange::Range(last, String::kMaxCodePoint)); 6565 } 6566 6567 6568 void DispatchTableConstructor::VisitText(TextNode* that) { 6569 TextElement elm = that->elements()->at(0); 6570 switch (elm.text_type()) { 6571 case TextElement::ATOM: { 6572 uc16 c = elm.atom()->data()[0]; 6573 AddRange(CharacterRange::Range(c, c)); 6574 break; 6575 } 6576 case TextElement::CHAR_CLASS: { 6577 RegExpCharacterClass* tree = elm.char_class(); 6578 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 6579 if (tree->is_negated()) { 6580 AddInverse(ranges); 6581 } else { 6582 for (int i = 0; i < ranges->length(); i++) 6583 AddRange(ranges->at(i)); 6584 } 6585 break; 6586 } 6587 default: { 6588 UNIMPLEMENTED(); 6589 } 6590 } 6591 } 6592 6593 6594 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6595 RegExpNode* target = that->on_success(); 6596 target->Accept(this); 6597 } 6598 6599 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, 6600 RegExpNode* on_success, 6601 JSRegExp::Flags flags) { 6602 // If the regexp matching starts within a surrogate pair, step back 6603 // to the lead surrogate and start matching from there. 6604 DCHECK(!compiler->read_backward()); 6605 Zone* zone = compiler->zone(); 6606 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 6607 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 6608 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 6609 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 6610 6611 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); 6612 6613 int stack_register = compiler->UnicodeLookaroundStackRegister(); 6614 int position_register = compiler->UnicodeLookaroundPositionRegister(); 6615 RegExpNode* step_back = TextNode::CreateForCharacterRanges( 6616 zone, lead_surrogates, true, on_success, flags); 6617 RegExpLookaround::Builder builder(true, step_back, stack_register, 6618 position_register); 6619 RegExpNode* match_trail = TextNode::CreateForCharacterRanges( 6620 zone, trail_surrogates, false, builder.on_match_success(), flags); 6621 6622 optional_step_back->AddAlternative( 6623 GuardedAlternative(builder.ForMatch(match_trail))); 6624 optional_step_back->AddAlternative(GuardedAlternative(on_success)); 6625 6626 return optional_step_back; 6627 } 6628 6629 6630 RegExpEngine::CompilationResult RegExpEngine::Compile( 6631 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6632 JSRegExp::Flags flags, Handle<String> pattern, 6633 Handle<String> sample_subject, bool is_one_byte) { 6634 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6635 return IrregexpRegExpTooBig(isolate); 6636 } 6637 bool is_sticky = IsSticky(flags); 6638 bool is_global = IsGlobal(flags); 6639 bool is_unicode = IsUnicode(flags); 6640 RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte); 6641 6642 if (compiler.optimize()) 6643 compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern)); 6644 6645 // Sample some characters from the middle of the string. 6646 static const int kSampleSize = 128; 6647 6648 sample_subject = String::Flatten(isolate, sample_subject); 6649 int chars_sampled = 0; 6650 int half_way = (sample_subject->length() - kSampleSize) / 2; 6651 for (int i = Max(0, half_way); 6652 i < sample_subject->length() && chars_sampled < kSampleSize; 6653 i++, chars_sampled++) { 6654 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6655 } 6656 6657 // Wrap the body of the regexp in capture #0. 6658 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6659 0, 6660 &compiler, 6661 compiler.accept()); 6662 RegExpNode* node = captured_body; 6663 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6664 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6665 int max_length = data->tree->max_match(); 6666 if (!is_start_anchored && !is_sticky) { 6667 // Add a .*? at the beginning, outside the body capture, unless 6668 // this expression is anchored at the beginning or sticky. 6669 JSRegExp::Flags default_flags = JSRegExp::Flags(); 6670 RegExpNode* loop_node = RegExpQuantifier::ToNode( 6671 0, RegExpTree::kInfinity, false, 6672 new (zone) RegExpCharacterClass('*', default_flags), &compiler, 6673 captured_body, data->contains_anchor); 6674 6675 if (data->contains_anchor) { 6676 // Unroll loop once, to take care of the case that might start 6677 // at the start of input. 6678 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6679 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6680 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6681 new (zone) RegExpCharacterClass('*', default_flags), false, 6682 loop_node))); 6683 node = first_step_node; 6684 } else { 6685 node = loop_node; 6686 } 6687 } 6688 if (is_one_byte) { 6689 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); 6690 // Do it again to propagate the new nodes to places where they were not 6691 // put because they had not been calculated yet. 6692 if (node != nullptr) { 6693 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); 6694 } 6695 } else if (is_unicode && (is_global || is_sticky)) { 6696 node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags); 6697 } 6698 6699 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone); 6700 data->node = node; 6701 Analysis analysis(isolate, is_one_byte); 6702 analysis.EnsureAnalyzed(node); 6703 if (analysis.has_failed()) { 6704 const char* error_message = analysis.error_message(); 6705 return CompilationResult(isolate, error_message); 6706 } 6707 6708 // Create the correct assembler for the architecture. 6709 #ifndef V8_INTERPRETED_REGEXP 6710 // Native regexp implementation. 6711 6712 NativeRegExpMacroAssembler::Mode mode = 6713 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 6714 : NativeRegExpMacroAssembler::UC16; 6715 6716 #if V8_TARGET_ARCH_IA32 6717 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode, 6718 (data->capture_count + 1) * 2); 6719 #elif V8_TARGET_ARCH_X64 6720 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode, 6721 (data->capture_count + 1) * 2); 6722 #elif V8_TARGET_ARCH_ARM 6723 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode, 6724 (data->capture_count + 1) * 2); 6725 #elif V8_TARGET_ARCH_ARM64 6726 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode, 6727 (data->capture_count + 1) * 2); 6728 #elif V8_TARGET_ARCH_S390 6729 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode, 6730 (data->capture_count + 1) * 2); 6731 #elif V8_TARGET_ARCH_PPC 6732 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode, 6733 (data->capture_count + 1) * 2); 6734 #elif V8_TARGET_ARCH_MIPS 6735 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6736 (data->capture_count + 1) * 2); 6737 #elif V8_TARGET_ARCH_MIPS64 6738 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6739 (data->capture_count + 1) * 2); 6740 #else 6741 #error "Unsupported architecture" 6742 #endif 6743 6744 #else // V8_INTERPRETED_REGEXP 6745 // Interpreted regexp implementation. 6746 EmbeddedVector<byte, 1024> codes; 6747 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone); 6748 #endif // V8_INTERPRETED_REGEXP 6749 6750 macro_assembler.set_slow_safe(TooMuchRegExpCode(isolate, pattern)); 6751 6752 // Inserted here, instead of in Assembler, because it depends on information 6753 // in the AST that isn't replicated in the Node structure. 6754 static const int kMaxBacksearchLimit = 1024; 6755 if (is_end_anchored && !is_start_anchored && !is_sticky && 6756 max_length < kMaxBacksearchLimit) { 6757 macro_assembler.SetCurrentPositionFromEnd(max_length); 6758 } 6759 6760 if (is_global) { 6761 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; 6762 if (data->tree->min_match() > 0) { 6763 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; 6764 } else if (is_unicode) { 6765 mode = RegExpMacroAssembler::GLOBAL_UNICODE; 6766 } 6767 macro_assembler.set_global_mode(mode); 6768 } 6769 6770 return compiler.Assemble(isolate, ¯o_assembler, node, data->capture_count, 6771 pattern); 6772 } 6773 6774 bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) { 6775 Heap* heap = isolate->heap(); 6776 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6777 if (heap->isolate()->total_regexp_code_generated() > 6778 RegExpImpl::kRegExpCompiledLimit && 6779 heap->CommittedMemoryExecutable() > 6780 RegExpImpl::kRegExpExecutableMemoryLimit) { 6781 too_much = true; 6782 } 6783 return too_much; 6784 } 6785 6786 6787 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string, 6788 Object* key_pattern, 6789 FixedArray** last_match_cache, 6790 ResultsCacheType type) { 6791 FixedArray* cache; 6792 if (!key_string->IsInternalizedString()) return Smi::kZero; 6793 if (type == STRING_SPLIT_SUBSTRINGS) { 6794 DCHECK(key_pattern->IsString()); 6795 if (!key_pattern->IsInternalizedString()) return Smi::kZero; 6796 cache = heap->string_split_cache(); 6797 } else { 6798 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6799 DCHECK(key_pattern->IsFixedArray()); 6800 cache = heap->regexp_multiple_cache(); 6801 } 6802 6803 uint32_t hash = key_string->Hash(); 6804 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6805 ~(kArrayEntriesPerCacheEntry - 1)); 6806 if (cache->get(index + kStringOffset) != key_string || 6807 cache->get(index + kPatternOffset) != key_pattern) { 6808 index = 6809 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6810 if (cache->get(index + kStringOffset) != key_string || 6811 cache->get(index + kPatternOffset) != key_pattern) { 6812 return Smi::kZero; 6813 } 6814 } 6815 6816 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset)); 6817 return cache->get(index + kArrayOffset); 6818 } 6819 6820 6821 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, 6822 Handle<Object> key_pattern, 6823 Handle<FixedArray> value_array, 6824 Handle<FixedArray> last_match_cache, 6825 ResultsCacheType type) { 6826 Factory* factory = isolate->factory(); 6827 Handle<FixedArray> cache; 6828 if (!key_string->IsInternalizedString()) return; 6829 if (type == STRING_SPLIT_SUBSTRINGS) { 6830 DCHECK(key_pattern->IsString()); 6831 if (!key_pattern->IsInternalizedString()) return; 6832 cache = factory->string_split_cache(); 6833 } else { 6834 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6835 DCHECK(key_pattern->IsFixedArray()); 6836 cache = factory->regexp_multiple_cache(); 6837 } 6838 6839 uint32_t hash = key_string->Hash(); 6840 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6841 ~(kArrayEntriesPerCacheEntry - 1)); 6842 if (cache->get(index + kStringOffset) == Smi::kZero) { 6843 cache->set(index + kStringOffset, *key_string); 6844 cache->set(index + kPatternOffset, *key_pattern); 6845 cache->set(index + kArrayOffset, *value_array); 6846 cache->set(index + kLastMatchOffset, *last_match_cache); 6847 } else { 6848 uint32_t index2 = 6849 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6850 if (cache->get(index2 + kStringOffset) == Smi::kZero) { 6851 cache->set(index2 + kStringOffset, *key_string); 6852 cache->set(index2 + kPatternOffset, *key_pattern); 6853 cache->set(index2 + kArrayOffset, *value_array); 6854 cache->set(index2 + kLastMatchOffset, *last_match_cache); 6855 } else { 6856 cache->set(index2 + kStringOffset, Smi::kZero); 6857 cache->set(index2 + kPatternOffset, Smi::kZero); 6858 cache->set(index2 + kArrayOffset, Smi::kZero); 6859 cache->set(index2 + kLastMatchOffset, Smi::kZero); 6860 cache->set(index + kStringOffset, *key_string); 6861 cache->set(index + kPatternOffset, *key_pattern); 6862 cache->set(index + kArrayOffset, *value_array); 6863 cache->set(index + kLastMatchOffset, *last_match_cache); 6864 } 6865 } 6866 // If the array is a reasonably short list of substrings, convert it into a 6867 // list of internalized strings. 6868 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { 6869 for (int i = 0; i < value_array->length(); i++) { 6870 Handle<String> str(String::cast(value_array->get(i)), isolate); 6871 Handle<String> internalized_str = factory->InternalizeString(str); 6872 value_array->set(i, *internalized_str); 6873 } 6874 } 6875 // Convert backing store to a copy-on-write array. 6876 value_array->set_map_no_write_barrier( 6877 ReadOnlyRoots(isolate).fixed_cow_array_map()); 6878 } 6879 6880 6881 void RegExpResultsCache::Clear(FixedArray* cache) { 6882 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6883 cache->set(i, Smi::kZero); 6884 } 6885 } 6886 6887 } // namespace internal 6888 } // namespace v8 6889