1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/jsregexp.h" 6 7 #include "src/ast/ast.h" 8 #include "src/base/platform/platform.h" 9 #include "src/compilation-cache.h" 10 #include "src/compiler.h" 11 #include "src/execution.h" 12 #include "src/factory.h" 13 #include "src/isolate-inl.h" 14 #include "src/messages.h" 15 #include "src/ostreams.h" 16 #include "src/regexp/interpreter-irregexp.h" 17 #include "src/regexp/jsregexp-inl.h" 18 #include "src/regexp/regexp-macro-assembler.h" 19 #include "src/regexp/regexp-macro-assembler-irregexp.h" 20 #include "src/regexp/regexp-macro-assembler-tracer.h" 21 #include "src/regexp/regexp-parser.h" 22 #include "src/regexp/regexp-stack.h" 23 #include "src/runtime/runtime.h" 24 #include "src/splay-tree-inl.h" 25 #include "src/string-search.h" 26 #include "src/unicode-decoder.h" 27 28 #ifndef V8_INTERPRETED_REGEXP 29 #if V8_TARGET_ARCH_IA32 30 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" 31 #elif V8_TARGET_ARCH_X64 32 #include "src/regexp/x64/regexp-macro-assembler-x64.h" 33 #elif V8_TARGET_ARCH_ARM64 34 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" 35 #elif V8_TARGET_ARCH_ARM 36 #include "src/regexp/arm/regexp-macro-assembler-arm.h" 37 #elif V8_TARGET_ARCH_PPC 38 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h" 39 #elif V8_TARGET_ARCH_MIPS 40 #include "src/regexp/mips/regexp-macro-assembler-mips.h" 41 #elif V8_TARGET_ARCH_MIPS64 42 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h" 43 #elif V8_TARGET_ARCH_X87 44 #include "src/regexp/x87/regexp-macro-assembler-x87.h" 45 #else 46 #error Unsupported target architecture. 47 #endif 48 #endif 49 50 51 namespace v8 { 52 namespace internal { 53 54 MUST_USE_RESULT 55 static inline MaybeHandle<Object> ThrowRegExpException( 56 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) { 57 Isolate* isolate = re->GetIsolate(); 58 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, 59 pattern, error_text), 60 Object); 61 } 62 63 64 inline void ThrowRegExpException(Handle<JSRegExp> re, 65 Handle<String> error_text) { 66 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text)); 67 } 68 69 70 ContainedInLattice AddRange(ContainedInLattice containment, 71 const int* ranges, 72 int ranges_length, 73 Interval new_range) { 74 DCHECK((ranges_length & 1) == 1); 75 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); 76 if (containment == kLatticeUnknown) return containment; 77 bool inside = false; 78 int last = 0; 79 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 80 // Consider the range from last to ranges[i]. 81 // We haven't got to the new range yet. 82 if (ranges[i] <= new_range.from()) continue; 83 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 84 // inclusive, but the values in ranges are not. 85 if (last <= new_range.from() && new_range.to() < ranges[i]) { 86 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 87 } 88 return kLatticeUnknown; 89 } 90 return containment; 91 } 92 93 94 // More makes code generation slower, less makes V8 benchmark score lower. 95 const int kMaxLookaheadForBoyerMoore = 8; 96 // In a 3-character pattern you can maximally step forwards 3 characters 97 // at a time, which is not always enough to pay for the extra logic. 98 const int kPatternTooShortForBoyerMoore = 2; 99 100 101 // Identifies the sort of regexps where the regexp engine is faster 102 // than the code used for atom matches. 103 static bool HasFewDifferentCharacters(Handle<String> pattern) { 104 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 105 if (length <= kPatternTooShortForBoyerMoore) return false; 106 const int kMod = 128; 107 bool character_found[kMod]; 108 int different = 0; 109 memset(&character_found[0], 0, sizeof(character_found)); 110 for (int i = 0; i < length; i++) { 111 int ch = (pattern->Get(i) & (kMod - 1)); 112 if (!character_found[ch]) { 113 character_found[ch] = true; 114 different++; 115 // We declare a regexp low-alphabet if it has at least 3 times as many 116 // characters as it has different characters. 117 if (different * 3 > length) return false; 118 } 119 } 120 return true; 121 } 122 123 124 // Generic RegExp methods. Dispatches to implementation specific methods. 125 126 127 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 128 Handle<String> pattern, 129 JSRegExp::Flags flags) { 130 Isolate* isolate = re->GetIsolate(); 131 Zone zone; 132 CompilationCache* compilation_cache = isolate->compilation_cache(); 133 MaybeHandle<FixedArray> maybe_cached = 134 compilation_cache->LookupRegExp(pattern, flags); 135 Handle<FixedArray> cached; 136 bool in_cache = maybe_cached.ToHandle(&cached); 137 LOG(isolate, RegExpCompileEvent(re, in_cache)); 138 139 Handle<Object> result; 140 if (in_cache) { 141 re->set_data(*cached); 142 return re; 143 } 144 pattern = String::Flatten(pattern); 145 PostponeInterruptsScope postpone(isolate); 146 RegExpCompileData parse_result; 147 FlatStringReader reader(isolate, pattern); 148 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, 149 flags & JSRegExp::kMultiline, 150 flags & JSRegExp::kUnicode, &parse_result)) { 151 // Throw an exception if we fail to parse the pattern. 152 return ThrowRegExpException(re, pattern, parse_result.error); 153 } 154 155 bool has_been_compiled = false; 156 157 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && 158 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) { 159 // Parse-tree is a single atom that is equal to the pattern. 160 AtomCompile(re, pattern, flags, pattern); 161 has_been_compiled = true; 162 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) && 163 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) { 164 RegExpAtom* atom = parse_result.tree->AsAtom(); 165 Vector<const uc16> atom_pattern = atom->data(); 166 Handle<String> atom_string; 167 ASSIGN_RETURN_ON_EXCEPTION( 168 isolate, atom_string, 169 isolate->factory()->NewStringFromTwoByte(atom_pattern), 170 Object); 171 if (!HasFewDifferentCharacters(atom_string)) { 172 AtomCompile(re, pattern, flags, atom_string); 173 has_been_compiled = true; 174 } 175 } 176 if (!has_been_compiled) { 177 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 178 } 179 DCHECK(re->data()->IsFixedArray()); 180 // Compilation succeeded so the data is set on the regexp 181 // and we can store it in the cache. 182 Handle<FixedArray> data(FixedArray::cast(re->data())); 183 compilation_cache->PutRegExp(pattern, flags, data); 184 185 return re; 186 } 187 188 189 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 190 Handle<String> subject, 191 int index, 192 Handle<JSArray> last_match_info) { 193 switch (regexp->TypeTag()) { 194 case JSRegExp::ATOM: 195 return AtomExec(regexp, subject, index, last_match_info); 196 case JSRegExp::IRREGEXP: { 197 return IrregexpExec(regexp, subject, index, last_match_info); 198 } 199 default: 200 UNREACHABLE(); 201 return MaybeHandle<Object>(); 202 } 203 } 204 205 206 // RegExp Atom implementation: Simple string search using indexOf. 207 208 209 void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 210 Handle<String> pattern, 211 JSRegExp::Flags flags, 212 Handle<String> match_pattern) { 213 re->GetIsolate()->factory()->SetRegExpAtomData(re, 214 JSRegExp::ATOM, 215 pattern, 216 flags, 217 match_pattern); 218 } 219 220 221 static void SetAtomLastCapture(FixedArray* array, 222 String* subject, 223 int from, 224 int to) { 225 SealHandleScope shs(array->GetIsolate()); 226 RegExpImpl::SetLastCaptureCount(array, 2); 227 RegExpImpl::SetLastSubject(array, subject); 228 RegExpImpl::SetLastInput(array, subject); 229 RegExpImpl::SetCapture(array, 0, from); 230 RegExpImpl::SetCapture(array, 1, to); 231 } 232 233 234 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 235 Handle<String> subject, 236 int index, 237 int32_t* output, 238 int output_size) { 239 Isolate* isolate = regexp->GetIsolate(); 240 241 DCHECK(0 <= index); 242 DCHECK(index <= subject->length()); 243 244 subject = String::Flatten(subject); 245 DisallowHeapAllocation no_gc; // ensure vectors stay valid 246 247 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 248 int needle_len = needle->length(); 249 DCHECK(needle->IsFlat()); 250 DCHECK_LT(0, needle_len); 251 252 if (index + needle_len > subject->length()) { 253 return RegExpImpl::RE_FAILURE; 254 } 255 256 for (int i = 0; i < output_size; i += 2) { 257 String::FlatContent needle_content = needle->GetFlatContent(); 258 String::FlatContent subject_content = subject->GetFlatContent(); 259 DCHECK(needle_content.IsFlat()); 260 DCHECK(subject_content.IsFlat()); 261 // dispatch on type of strings 262 index = 263 (needle_content.IsOneByte() 264 ? (subject_content.IsOneByte() 265 ? SearchString(isolate, subject_content.ToOneByteVector(), 266 needle_content.ToOneByteVector(), index) 267 : SearchString(isolate, subject_content.ToUC16Vector(), 268 needle_content.ToOneByteVector(), index)) 269 : (subject_content.IsOneByte() 270 ? SearchString(isolate, subject_content.ToOneByteVector(), 271 needle_content.ToUC16Vector(), index) 272 : SearchString(isolate, subject_content.ToUC16Vector(), 273 needle_content.ToUC16Vector(), index))); 274 if (index == -1) { 275 return i / 2; // Return number of matches. 276 } else { 277 output[i] = index; 278 output[i+1] = index + needle_len; 279 index += needle_len; 280 } 281 } 282 return output_size / 2; 283 } 284 285 286 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 287 Handle<String> subject, 288 int index, 289 Handle<JSArray> last_match_info) { 290 Isolate* isolate = re->GetIsolate(); 291 292 static const int kNumRegisters = 2; 293 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 294 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 295 296 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 297 298 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 299 300 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS); 301 SealHandleScope shs(isolate); 302 FixedArray* array = FixedArray::cast(last_match_info->elements()); 303 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); 304 return last_match_info; 305 } 306 307 308 // Irregexp implementation. 309 310 // Ensures that the regexp object contains a compiled version of the 311 // source for either one-byte or two-byte subject strings. 312 // If the compiled version doesn't already exist, it is compiled 313 // from the source pattern. 314 // If compilation fails, an exception is thrown and this function 315 // returns false. 316 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, 317 Handle<String> sample_subject, 318 bool is_one_byte) { 319 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); 320 #ifdef V8_INTERPRETED_REGEXP 321 if (compiled_code->IsByteArray()) return true; 322 #else // V8_INTERPRETED_REGEXP (RegExp native code) 323 if (compiled_code->IsCode()) return true; 324 #endif 325 // We could potentially have marked this as flushable, but have kept 326 // a saved version if we did not flush it yet. 327 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 328 if (saved_code->IsCode()) { 329 // Reinstate the code in the original place. 330 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); 331 DCHECK(compiled_code->IsSmi()); 332 return true; 333 } 334 return CompileIrregexp(re, sample_subject, is_one_byte); 335 } 336 337 338 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 339 Handle<String> sample_subject, 340 bool is_one_byte) { 341 // Compile the RegExp. 342 Isolate* isolate = re->GetIsolate(); 343 Zone zone; 344 PostponeInterruptsScope postpone(isolate); 345 // If we had a compilation error the last time this is saved at the 346 // saved code index. 347 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); 348 // When arriving here entry can only be a smi, either representing an 349 // uncompiled regexp, a previous compilation error, or code that has 350 // been flushed. 351 DCHECK(entry->IsSmi()); 352 int entry_value = Smi::cast(entry)->value(); 353 DCHECK(entry_value == JSRegExp::kUninitializedValue || 354 entry_value == JSRegExp::kCompilationErrorValue || 355 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 356 357 if (entry_value == JSRegExp::kCompilationErrorValue) { 358 // A previous compilation failed and threw an error which we store in 359 // the saved code index (we store the error message, not the actual 360 // error). Recreate the error object and throw it. 361 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 362 DCHECK(error_string->IsString()); 363 Handle<String> error_message(String::cast(error_string)); 364 ThrowRegExpException(re, error_message); 365 return false; 366 } 367 368 JSRegExp::Flags flags = re->GetFlags(); 369 370 Handle<String> pattern(re->Pattern()); 371 pattern = String::Flatten(pattern); 372 RegExpCompileData compile_data; 373 FlatStringReader reader(isolate, pattern); 374 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, 375 flags & JSRegExp::kMultiline, 376 flags & JSRegExp::kUnicode, &compile_data)) { 377 // Throw an exception if we fail to parse the pattern. 378 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 379 USE(ThrowRegExpException(re, pattern, compile_data.error)); 380 return false; 381 } 382 RegExpEngine::CompilationResult result = RegExpEngine::Compile( 383 isolate, &zone, &compile_data, flags & JSRegExp::kIgnoreCase, 384 flags & JSRegExp::kGlobal, flags & JSRegExp::kMultiline, 385 flags & JSRegExp::kSticky, pattern, sample_subject, is_one_byte); 386 if (result.error_message != NULL) { 387 // Unable to compile regexp. 388 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 389 CStrVector(result.error_message)).ToHandleChecked(); 390 ThrowRegExpException(re, error_message); 391 return false; 392 } 393 394 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 395 data->set(JSRegExp::code_index(is_one_byte), result.code); 396 int register_max = IrregexpMaxRegisterCount(*data); 397 if (result.num_registers > register_max) { 398 SetIrregexpMaxRegisterCount(*data, result.num_registers); 399 } 400 401 return true; 402 } 403 404 405 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 406 return Smi::cast( 407 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 408 } 409 410 411 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 412 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 413 } 414 415 416 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 417 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 418 } 419 420 421 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 422 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 423 } 424 425 426 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { 427 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); 428 } 429 430 431 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { 432 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); 433 } 434 435 436 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 437 Handle<String> pattern, 438 JSRegExp::Flags flags, 439 int capture_count) { 440 // Initialize compiled code entries to null. 441 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 442 JSRegExp::IRREGEXP, 443 pattern, 444 flags, 445 capture_count); 446 } 447 448 449 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 450 Handle<String> subject) { 451 subject = String::Flatten(subject); 452 453 // Check representation of the underlying storage. 454 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 455 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; 456 457 #ifdef V8_INTERPRETED_REGEXP 458 // Byte-code regexp needs space allocated for all its registers. 459 // The result captures are copied to the start of the registers array 460 // if the match succeeds. This way those registers are not clobbered 461 // when we set the last match info from last successful match. 462 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 463 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 464 #else // V8_INTERPRETED_REGEXP 465 // Native regexp only needs room to output captures. Registers are handled 466 // internally. 467 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 468 #endif // V8_INTERPRETED_REGEXP 469 } 470 471 472 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 473 Handle<String> subject, 474 int index, 475 int32_t* output, 476 int output_size) { 477 Isolate* isolate = regexp->GetIsolate(); 478 479 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 480 481 DCHECK(index >= 0); 482 DCHECK(index <= subject->length()); 483 DCHECK(subject->IsFlat()); 484 485 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 486 487 #ifndef V8_INTERPRETED_REGEXP 488 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 489 do { 490 EnsureCompiledIrregexp(regexp, subject, is_one_byte); 491 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); 492 // The stack is used to allocate registers for the compiled regexp code. 493 // This means that in case of failure, the output registers array is left 494 // untouched and contains the capture results from the previous successful 495 // match. We can use that to set the last match info lazily. 496 NativeRegExpMacroAssembler::Result res = 497 NativeRegExpMacroAssembler::Match(code, 498 subject, 499 output, 500 output_size, 501 index, 502 isolate); 503 if (res != NativeRegExpMacroAssembler::RETRY) { 504 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 505 isolate->has_pending_exception()); 506 STATIC_ASSERT( 507 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 508 STATIC_ASSERT( 509 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 510 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 511 == RE_EXCEPTION); 512 return static_cast<IrregexpResult>(res); 513 } 514 // If result is RETRY, the string has changed representation, and we 515 // must restart from scratch. 516 // In this case, it means we must make sure we are prepared to handle 517 // the, potentially, different subject (the string can switch between 518 // being internal and external, and even between being Latin1 and UC16, 519 // but the characters are always the same). 520 IrregexpPrepare(regexp, subject); 521 is_one_byte = subject->IsOneByteRepresentationUnderneath(); 522 } while (true); 523 UNREACHABLE(); 524 return RE_EXCEPTION; 525 #else // V8_INTERPRETED_REGEXP 526 527 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); 528 // We must have done EnsureCompiledIrregexp, so we can get the number of 529 // registers. 530 int number_of_capture_registers = 531 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 532 int32_t* raw_output = &output[number_of_capture_registers]; 533 // We do not touch the actual capture result registers until we know there 534 // has been a match so that we can use those capture results to set the 535 // last match info. 536 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 537 raw_output[i] = -1; 538 } 539 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), 540 isolate); 541 542 IrregexpResult result = IrregexpInterpreter::Match(isolate, 543 byte_codes, 544 subject, 545 raw_output, 546 index); 547 if (result == RE_SUCCESS) { 548 // Copy capture results to the start of the registers array. 549 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 550 } 551 if (result == RE_EXCEPTION) { 552 DCHECK(!isolate->has_pending_exception()); 553 isolate->StackOverflow(); 554 } 555 return result; 556 #endif // V8_INTERPRETED_REGEXP 557 } 558 559 560 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, 561 Handle<String> subject, 562 int previous_index, 563 Handle<JSArray> last_match_info) { 564 Isolate* isolate = regexp->GetIsolate(); 565 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 566 567 // Prepare space for the return values. 568 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 569 if (FLAG_trace_regexp_bytecodes) { 570 String* pattern = regexp->Pattern(); 571 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 572 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 573 } 574 #endif 575 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 576 if (required_registers < 0) { 577 // Compiling failed with an exception. 578 DCHECK(isolate->has_pending_exception()); 579 return MaybeHandle<Object>(); 580 } 581 582 int32_t* output_registers = NULL; 583 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 584 output_registers = NewArray<int32_t>(required_registers); 585 } 586 base::SmartArrayPointer<int32_t> auto_release(output_registers); 587 if (output_registers == NULL) { 588 output_registers = isolate->jsregexp_static_offsets_vector(); 589 } 590 591 int res = RegExpImpl::IrregexpExecRaw( 592 regexp, subject, previous_index, output_registers, required_registers); 593 if (res == RE_SUCCESS) { 594 int capture_count = 595 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 596 return SetLastMatchInfo( 597 last_match_info, subject, capture_count, output_registers); 598 } 599 if (res == RE_EXCEPTION) { 600 DCHECK(isolate->has_pending_exception()); 601 return MaybeHandle<Object>(); 602 } 603 DCHECK(res == RE_FAILURE); 604 return isolate->factory()->null_value(); 605 } 606 607 608 static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) { 609 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) { 610 JSArray::SetLength(array, minimum_size); 611 } 612 } 613 614 615 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, 616 Handle<String> subject, 617 int capture_count, 618 int32_t* match) { 619 DCHECK(last_match_info->HasFastObjectElements()); 620 int capture_register_count = (capture_count + 1) * 2; 621 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead); 622 DisallowHeapAllocation no_allocation; 623 FixedArray* array = FixedArray::cast(last_match_info->elements()); 624 if (match != NULL) { 625 for (int i = 0; i < capture_register_count; i += 2) { 626 SetCapture(array, i, match[i]); 627 SetCapture(array, i + 1, match[i + 1]); 628 } 629 } 630 SetLastCaptureCount(array, capture_register_count); 631 SetLastSubject(array, *subject); 632 SetLastInput(array, *subject); 633 return last_match_info; 634 } 635 636 637 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 638 Handle<String> subject, 639 bool is_global, 640 Isolate* isolate) 641 : register_array_(NULL), 642 register_array_size_(0), 643 regexp_(regexp), 644 subject_(subject) { 645 #ifdef V8_INTERPRETED_REGEXP 646 bool interpreted = true; 647 #else 648 bool interpreted = false; 649 #endif // V8_INTERPRETED_REGEXP 650 651 if (regexp_->TypeTag() == JSRegExp::ATOM) { 652 static const int kAtomRegistersPerMatch = 2; 653 registers_per_match_ = kAtomRegistersPerMatch; 654 // There is no distinction between interpreted and native for atom regexps. 655 interpreted = false; 656 } else { 657 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 658 if (registers_per_match_ < 0) { 659 num_matches_ = -1; // Signal exception. 660 return; 661 } 662 } 663 664 if (is_global && !interpreted) { 665 register_array_size_ = 666 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 667 max_matches_ = register_array_size_ / registers_per_match_; 668 } else { 669 // Global loop in interpreted regexp is not implemented. We choose 670 // the size of the offsets vector so that it can only store one match. 671 register_array_size_ = registers_per_match_; 672 max_matches_ = 1; 673 } 674 675 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 676 register_array_ = NewArray<int32_t>(register_array_size_); 677 } else { 678 register_array_ = isolate->jsregexp_static_offsets_vector(); 679 } 680 681 // Set state so that fetching the results the first time triggers a call 682 // to the compiled regexp. 683 current_match_index_ = max_matches_ - 1; 684 num_matches_ = max_matches_; 685 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture. 686 DCHECK_GE(register_array_size_, registers_per_match_); 687 int32_t* last_match = 688 ®ister_array_[current_match_index_ * registers_per_match_]; 689 last_match[0] = -1; 690 last_match[1] = 0; 691 } 692 693 694 // ------------------------------------------------------------------- 695 // Implementation of the Irregexp regular expression engine. 696 // 697 // The Irregexp regular expression engine is intended to be a complete 698 // implementation of ECMAScript regular expressions. It generates either 699 // bytecodes or native code. 700 701 // The Irregexp regexp engine is structured in three steps. 702 // 1) The parser generates an abstract syntax tree. See ast.cc. 703 // 2) From the AST a node network is created. The nodes are all 704 // subclasses of RegExpNode. The nodes represent states when 705 // executing a regular expression. Several optimizations are 706 // performed on the node network. 707 // 3) From the nodes we generate either byte codes or native code 708 // that can actually execute the regular expression (perform 709 // the search). The code generation step is described in more 710 // detail below. 711 712 // Code generation. 713 // 714 // The nodes are divided into four main categories. 715 // * Choice nodes 716 // These represent places where the regular expression can 717 // match in more than one way. For example on entry to an 718 // alternation (foo|bar) or a repetition (*, +, ? or {}). 719 // * Action nodes 720 // These represent places where some action should be 721 // performed. Examples include recording the current position 722 // in the input string to a register (in order to implement 723 // captures) or other actions on register for example in order 724 // to implement the counters needed for {} repetitions. 725 // * Matching nodes 726 // These attempt to match some element part of the input string. 727 // Examples of elements include character classes, plain strings 728 // or back references. 729 // * End nodes 730 // These are used to implement the actions required on finding 731 // a successful match or failing to find a match. 732 // 733 // The code generated (whether as byte codes or native code) maintains 734 // some state as it runs. This consists of the following elements: 735 // 736 // * The capture registers. Used for string captures. 737 // * Other registers. Used for counters etc. 738 // * The current position. 739 // * The stack of backtracking information. Used when a matching node 740 // fails to find a match and needs to try an alternative. 741 // 742 // Conceptual regular expression execution model: 743 // 744 // There is a simple conceptual model of regular expression execution 745 // which will be presented first. The actual code generated is a more 746 // efficient simulation of the simple conceptual model: 747 // 748 // * Choice nodes are implemented as follows: 749 // For each choice except the last { 750 // push current position 751 // push backtrack code location 752 // <generate code to test for choice> 753 // backtrack code location: 754 // pop current position 755 // } 756 // <generate code to test for last choice> 757 // 758 // * Actions nodes are generated as follows 759 // <push affected registers on backtrack stack> 760 // <generate code to perform action> 761 // push backtrack code location 762 // <generate code to test for following nodes> 763 // backtrack code location: 764 // <pop affected registers to restore their state> 765 // <pop backtrack location from stack and go to it> 766 // 767 // * Matching nodes are generated as follows: 768 // if input string matches at current position 769 // update current position 770 // <generate code to test for following nodes> 771 // else 772 // <pop backtrack location from stack and go to it> 773 // 774 // Thus it can be seen that the current position is saved and restored 775 // by the choice nodes, whereas the registers are saved and restored by 776 // by the action nodes that manipulate them. 777 // 778 // The other interesting aspect of this model is that nodes are generated 779 // at the point where they are needed by a recursive call to Emit(). If 780 // the node has already been code generated then the Emit() call will 781 // generate a jump to the previously generated code instead. In order to 782 // limit recursion it is possible for the Emit() function to put the node 783 // on a work list for later generation and instead generate a jump. The 784 // destination of the jump is resolved later when the code is generated. 785 // 786 // Actual regular expression code generation. 787 // 788 // Code generation is actually more complicated than the above. In order 789 // to improve the efficiency of the generated code some optimizations are 790 // performed 791 // 792 // * Choice nodes have 1-character lookahead. 793 // A choice node looks at the following character and eliminates some of 794 // the choices immediately based on that character. This is not yet 795 // implemented. 796 // * Simple greedy loops store reduced backtracking information. 797 // A quantifier like /.*foo/m will greedily match the whole input. It will 798 // then need to backtrack to a point where it can match "foo". The naive 799 // implementation of this would push each character position onto the 800 // backtracking stack, then pop them off one by one. This would use space 801 // proportional to the length of the input string. However since the "." 802 // can only match in one way and always has a constant length (in this case 803 // of 1) it suffices to store the current position on the top of the stack 804 // once. Matching now becomes merely incrementing the current position and 805 // backtracking becomes decrementing the current position and checking the 806 // result against the stored current position. This is faster and saves 807 // space. 808 // * The current state is virtualized. 809 // This is used to defer expensive operations until it is clear that they 810 // are needed and to generate code for a node more than once, allowing 811 // specialized an efficient versions of the code to be created. This is 812 // explained in the section below. 813 // 814 // Execution state virtualization. 815 // 816 // Instead of emitting code, nodes that manipulate the state can record their 817 // manipulation in an object called the Trace. The Trace object can record a 818 // current position offset, an optional backtrack code location on the top of 819 // the virtualized backtrack stack and some register changes. When a node is 820 // to be emitted it can flush the Trace or update it. Flushing the Trace 821 // will emit code to bring the actual state into line with the virtual state. 822 // Avoiding flushing the state can postpone some work (e.g. updates of capture 823 // registers). Postponing work can save time when executing the regular 824 // expression since it may be found that the work never has to be done as a 825 // failure to match can occur. In addition it is much faster to jump to a 826 // known backtrack code location than it is to pop an unknown backtrack 827 // location from the stack and jump there. 828 // 829 // The virtual state found in the Trace affects code generation. For example 830 // the virtual state contains the difference between the actual current 831 // position and the virtual current position, and matching code needs to use 832 // this offset to attempt a match in the correct location of the input 833 // string. Therefore code generated for a non-trivial trace is specialized 834 // to that trace. The code generator therefore has the ability to generate 835 // code for each node several times. In order to limit the size of the 836 // generated code there is an arbitrary limit on how many specialized sets of 837 // code may be generated for a given node. If the limit is reached, the 838 // trace is flushed and a generic version of the code for a node is emitted. 839 // This is subsequently used for that node. The code emitted for non-generic 840 // trace is not recorded in the node and so it cannot currently be reused in 841 // the event that code generation is requested for an identical trace. 842 843 844 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 845 UNREACHABLE(); 846 } 847 848 849 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 850 text->AddElement(TextElement::Atom(this), zone); 851 } 852 853 854 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 855 text->AddElement(TextElement::CharClass(this), zone); 856 } 857 858 859 void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 860 for (int i = 0; i < elements()->length(); i++) 861 text->AddElement(elements()->at(i), zone); 862 } 863 864 865 TextElement TextElement::Atom(RegExpAtom* atom) { 866 return TextElement(ATOM, atom); 867 } 868 869 870 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 871 return TextElement(CHAR_CLASS, char_class); 872 } 873 874 875 int TextElement::length() const { 876 switch (text_type()) { 877 case ATOM: 878 return atom()->length(); 879 880 case CHAR_CLASS: 881 return 1; 882 } 883 UNREACHABLE(); 884 return 0; 885 } 886 887 888 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 889 if (table_ == NULL) { 890 table_ = new(zone()) DispatchTable(zone()); 891 DispatchTableConstructor cons(table_, ignore_case, zone()); 892 cons.BuildTable(this); 893 } 894 return table_; 895 } 896 897 898 class FrequencyCollator { 899 public: 900 FrequencyCollator() : total_samples_(0) { 901 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 902 frequencies_[i] = CharacterFrequency(i); 903 } 904 } 905 906 void CountCharacter(int character) { 907 int index = (character & RegExpMacroAssembler::kTableMask); 908 frequencies_[index].Increment(); 909 total_samples_++; 910 } 911 912 // Does not measure in percent, but rather per-128 (the table size from the 913 // regexp macro assembler). 914 int Frequency(int in_character) { 915 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 916 if (total_samples_ < 1) return 1; // Division by zero. 917 int freq_in_per128 = 918 (frequencies_[in_character].counter() * 128) / total_samples_; 919 return freq_in_per128; 920 } 921 922 private: 923 class CharacterFrequency { 924 public: 925 CharacterFrequency() : counter_(0), character_(-1) { } 926 explicit CharacterFrequency(int character) 927 : counter_(0), character_(character) { } 928 929 void Increment() { counter_++; } 930 int counter() { return counter_; } 931 int character() { return character_; } 932 933 private: 934 int counter_; 935 int character_; 936 }; 937 938 939 private: 940 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 941 int total_samples_; 942 }; 943 944 945 class RegExpCompiler { 946 public: 947 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 948 bool ignore_case, bool is_one_byte); 949 950 int AllocateRegister() { 951 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 952 reg_exp_too_big_ = true; 953 return next_register_; 954 } 955 return next_register_++; 956 } 957 958 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 959 RegExpNode* start, 960 int capture_count, 961 Handle<String> pattern); 962 963 inline void AddWork(RegExpNode* node) { 964 if (!node->on_work_list() && !node->label()->is_bound()) { 965 node->set_on_work_list(true); 966 work_list_->Add(node); 967 } 968 } 969 970 static const int kImplementationOffset = 0; 971 static const int kNumberOfRegistersOffset = 0; 972 static const int kCodeOffset = 1; 973 974 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 975 EndNode* accept() { return accept_; } 976 977 static const int kMaxRecursion = 100; 978 inline int recursion_depth() { return recursion_depth_; } 979 inline void IncrementRecursionDepth() { recursion_depth_++; } 980 inline void DecrementRecursionDepth() { recursion_depth_--; } 981 982 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 983 984 inline bool ignore_case() { return ignore_case_; } 985 inline bool one_byte() { return one_byte_; } 986 inline bool optimize() { return optimize_; } 987 inline void set_optimize(bool value) { optimize_ = value; } 988 inline bool limiting_recursion() { return limiting_recursion_; } 989 inline void set_limiting_recursion(bool value) { 990 limiting_recursion_ = value; 991 } 992 bool read_backward() { return read_backward_; } 993 void set_read_backward(bool value) { read_backward_ = value; } 994 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 995 996 int current_expansion_factor() { return current_expansion_factor_; } 997 void set_current_expansion_factor(int value) { 998 current_expansion_factor_ = value; 999 } 1000 1001 Isolate* isolate() const { return isolate_; } 1002 Zone* zone() const { return zone_; } 1003 1004 static const int kNoRegister = -1; 1005 1006 private: 1007 EndNode* accept_; 1008 int next_register_; 1009 List<RegExpNode*>* work_list_; 1010 int recursion_depth_; 1011 RegExpMacroAssembler* macro_assembler_; 1012 bool ignore_case_; 1013 bool one_byte_; 1014 bool reg_exp_too_big_; 1015 bool limiting_recursion_; 1016 bool optimize_; 1017 bool read_backward_; 1018 int current_expansion_factor_; 1019 FrequencyCollator frequency_collator_; 1020 Isolate* isolate_; 1021 Zone* zone_; 1022 }; 1023 1024 1025 class RecursionCheck { 1026 public: 1027 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1028 compiler->IncrementRecursionDepth(); 1029 } 1030 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1031 private: 1032 RegExpCompiler* compiler_; 1033 }; 1034 1035 1036 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1037 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1038 } 1039 1040 1041 // Attempts to compile the regexp using an Irregexp code generator. Returns 1042 // a fixed array or a null handle depending on whether it succeeded. 1043 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 1044 bool ignore_case, bool one_byte) 1045 : next_register_(2 * (capture_count + 1)), 1046 work_list_(NULL), 1047 recursion_depth_(0), 1048 ignore_case_(ignore_case), 1049 one_byte_(one_byte), 1050 reg_exp_too_big_(false), 1051 limiting_recursion_(false), 1052 optimize_(FLAG_regexp_optimization), 1053 read_backward_(false), 1054 current_expansion_factor_(1), 1055 frequency_collator_(), 1056 isolate_(isolate), 1057 zone_(zone) { 1058 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1059 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1060 } 1061 1062 1063 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1064 RegExpMacroAssembler* macro_assembler, 1065 RegExpNode* start, 1066 int capture_count, 1067 Handle<String> pattern) { 1068 Heap* heap = pattern->GetHeap(); 1069 1070 #ifdef DEBUG 1071 if (FLAG_trace_regexp_assembler) 1072 macro_assembler_ = 1073 new RegExpMacroAssemblerTracer(isolate(), macro_assembler); 1074 else 1075 #endif 1076 macro_assembler_ = macro_assembler; 1077 1078 List <RegExpNode*> work_list(0); 1079 work_list_ = &work_list; 1080 Label fail; 1081 macro_assembler_->PushBacktrack(&fail); 1082 Trace new_trace; 1083 start->Emit(this, &new_trace); 1084 macro_assembler_->Bind(&fail); 1085 macro_assembler_->Fail(); 1086 while (!work_list.is_empty()) { 1087 RegExpNode* node = work_list.RemoveLast(); 1088 node->set_on_work_list(false); 1089 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 1090 } 1091 if (reg_exp_too_big_) { 1092 macro_assembler_->AbortedCodeGeneration(); 1093 return IrregexpRegExpTooBig(isolate_); 1094 } 1095 1096 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1097 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1098 work_list_ = NULL; 1099 #ifdef ENABLE_DISASSEMBLER 1100 if (FLAG_print_code) { 1101 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer()); 1102 OFStream os(trace_scope.file()); 1103 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os); 1104 } 1105 #endif 1106 #ifdef DEBUG 1107 if (FLAG_trace_regexp_assembler) { 1108 delete macro_assembler_; 1109 } 1110 #endif 1111 return RegExpEngine::CompilationResult(*code, next_register_); 1112 } 1113 1114 1115 bool Trace::DeferredAction::Mentions(int that) { 1116 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1117 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1118 return range.Contains(that); 1119 } else { 1120 return reg() == that; 1121 } 1122 } 1123 1124 1125 bool Trace::mentions_reg(int reg) { 1126 for (DeferredAction* action = actions_; 1127 action != NULL; 1128 action = action->next()) { 1129 if (action->Mentions(reg)) 1130 return true; 1131 } 1132 return false; 1133 } 1134 1135 1136 bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1137 DCHECK_EQ(0, *cp_offset); 1138 for (DeferredAction* action = actions_; 1139 action != NULL; 1140 action = action->next()) { 1141 if (action->Mentions(reg)) { 1142 if (action->action_type() == ActionNode::STORE_POSITION) { 1143 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1144 return true; 1145 } else { 1146 return false; 1147 } 1148 } 1149 } 1150 return false; 1151 } 1152 1153 1154 int Trace::FindAffectedRegisters(OutSet* affected_registers, 1155 Zone* zone) { 1156 int max_register = RegExpCompiler::kNoRegister; 1157 for (DeferredAction* action = actions_; 1158 action != NULL; 1159 action = action->next()) { 1160 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1161 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1162 for (int i = range.from(); i <= range.to(); i++) 1163 affected_registers->Set(i, zone); 1164 if (range.to() > max_register) max_register = range.to(); 1165 } else { 1166 affected_registers->Set(action->reg(), zone); 1167 if (action->reg() > max_register) max_register = action->reg(); 1168 } 1169 } 1170 return max_register; 1171 } 1172 1173 1174 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1175 int max_register, 1176 const OutSet& registers_to_pop, 1177 const OutSet& registers_to_clear) { 1178 for (int reg = max_register; reg >= 0; reg--) { 1179 if (registers_to_pop.Get(reg)) { 1180 assembler->PopRegister(reg); 1181 } else if (registers_to_clear.Get(reg)) { 1182 int clear_to = reg; 1183 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1184 reg--; 1185 } 1186 assembler->ClearRegisters(reg, clear_to); 1187 } 1188 } 1189 } 1190 1191 1192 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1193 int max_register, 1194 const OutSet& affected_registers, 1195 OutSet* registers_to_pop, 1196 OutSet* registers_to_clear, 1197 Zone* zone) { 1198 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1199 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1200 1201 // Count pushes performed to force a stack limit check occasionally. 1202 int pushes = 0; 1203 1204 for (int reg = 0; reg <= max_register; reg++) { 1205 if (!affected_registers.Get(reg)) { 1206 continue; 1207 } 1208 1209 // The chronologically first deferred action in the trace 1210 // is used to infer the action needed to restore a register 1211 // to its previous state (or not, if it's safe to ignore it). 1212 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1213 DeferredActionUndoType undo_action = IGNORE; 1214 1215 int value = 0; 1216 bool absolute = false; 1217 bool clear = false; 1218 static const int kNoStore = kMinInt; 1219 int store_position = kNoStore; 1220 // This is a little tricky because we are scanning the actions in reverse 1221 // historical order (newest first). 1222 for (DeferredAction* action = actions_; 1223 action != NULL; 1224 action = action->next()) { 1225 if (action->Mentions(reg)) { 1226 switch (action->action_type()) { 1227 case ActionNode::SET_REGISTER: { 1228 Trace::DeferredSetRegister* psr = 1229 static_cast<Trace::DeferredSetRegister*>(action); 1230 if (!absolute) { 1231 value += psr->value(); 1232 absolute = true; 1233 } 1234 // SET_REGISTER is currently only used for newly introduced loop 1235 // counters. They can have a significant previous value if they 1236 // occour in a loop. TODO(lrn): Propagate this information, so 1237 // we can set undo_action to IGNORE if we know there is no value to 1238 // restore. 1239 undo_action = RESTORE; 1240 DCHECK_EQ(store_position, kNoStore); 1241 DCHECK(!clear); 1242 break; 1243 } 1244 case ActionNode::INCREMENT_REGISTER: 1245 if (!absolute) { 1246 value++; 1247 } 1248 DCHECK_EQ(store_position, kNoStore); 1249 DCHECK(!clear); 1250 undo_action = RESTORE; 1251 break; 1252 case ActionNode::STORE_POSITION: { 1253 Trace::DeferredCapture* pc = 1254 static_cast<Trace::DeferredCapture*>(action); 1255 if (!clear && store_position == kNoStore) { 1256 store_position = pc->cp_offset(); 1257 } 1258 1259 // For captures we know that stores and clears alternate. 1260 // Other register, are never cleared, and if the occur 1261 // inside a loop, they might be assigned more than once. 1262 if (reg <= 1) { 1263 // Registers zero and one, aka "capture zero", is 1264 // always set correctly if we succeed. There is no 1265 // need to undo a setting on backtrack, because we 1266 // will set it again or fail. 1267 undo_action = IGNORE; 1268 } else { 1269 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1270 } 1271 DCHECK(!absolute); 1272 DCHECK_EQ(value, 0); 1273 break; 1274 } 1275 case ActionNode::CLEAR_CAPTURES: { 1276 // Since we're scanning in reverse order, if we've already 1277 // set the position we have to ignore historically earlier 1278 // clearing operations. 1279 if (store_position == kNoStore) { 1280 clear = true; 1281 } 1282 undo_action = RESTORE; 1283 DCHECK(!absolute); 1284 DCHECK_EQ(value, 0); 1285 break; 1286 } 1287 default: 1288 UNREACHABLE(); 1289 break; 1290 } 1291 } 1292 } 1293 // Prepare for the undo-action (e.g., push if it's going to be popped). 1294 if (undo_action == RESTORE) { 1295 pushes++; 1296 RegExpMacroAssembler::StackCheckFlag stack_check = 1297 RegExpMacroAssembler::kNoStackLimitCheck; 1298 if (pushes == push_limit) { 1299 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1300 pushes = 0; 1301 } 1302 1303 assembler->PushRegister(reg, stack_check); 1304 registers_to_pop->Set(reg, zone); 1305 } else if (undo_action == CLEAR) { 1306 registers_to_clear->Set(reg, zone); 1307 } 1308 // Perform the chronologically last action (or accumulated increment) 1309 // for the register. 1310 if (store_position != kNoStore) { 1311 assembler->WriteCurrentPositionToRegister(reg, store_position); 1312 } else if (clear) { 1313 assembler->ClearRegisters(reg, reg); 1314 } else if (absolute) { 1315 assembler->SetRegister(reg, value); 1316 } else if (value != 0) { 1317 assembler->AdvanceRegister(reg, value); 1318 } 1319 } 1320 } 1321 1322 1323 // This is called as we come into a loop choice node and some other tricky 1324 // nodes. It normalizes the state of the code generator to ensure we can 1325 // generate generic code. 1326 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1327 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1328 1329 DCHECK(!is_trivial()); 1330 1331 if (actions_ == NULL && backtrack() == NULL) { 1332 // Here we just have some deferred cp advances to fix and we are back to 1333 // a normal situation. We may also have to forget some information gained 1334 // through a quick check that was already performed. 1335 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1336 // Create a new trivial state and generate the node with that. 1337 Trace new_state; 1338 successor->Emit(compiler, &new_state); 1339 return; 1340 } 1341 1342 // Generate deferred actions here along with code to undo them again. 1343 OutSet affected_registers; 1344 1345 if (backtrack() != NULL) { 1346 // Here we have a concrete backtrack location. These are set up by choice 1347 // nodes and so they indicate that we have a deferred save of the current 1348 // position which we may need to emit here. 1349 assembler->PushCurrentPosition(); 1350 } 1351 1352 int max_register = FindAffectedRegisters(&affected_registers, 1353 compiler->zone()); 1354 OutSet registers_to_pop; 1355 OutSet registers_to_clear; 1356 PerformDeferredActions(assembler, 1357 max_register, 1358 affected_registers, 1359 ®isters_to_pop, 1360 ®isters_to_clear, 1361 compiler->zone()); 1362 if (cp_offset_ != 0) { 1363 assembler->AdvanceCurrentPosition(cp_offset_); 1364 } 1365 1366 // Create a new trivial state and generate the node with that. 1367 Label undo; 1368 assembler->PushBacktrack(&undo); 1369 if (successor->KeepRecursing(compiler)) { 1370 Trace new_state; 1371 successor->Emit(compiler, &new_state); 1372 } else { 1373 compiler->AddWork(successor); 1374 assembler->GoTo(successor->label()); 1375 } 1376 1377 // On backtrack we need to restore state. 1378 assembler->Bind(&undo); 1379 RestoreAffectedRegisters(assembler, 1380 max_register, 1381 registers_to_pop, 1382 registers_to_clear); 1383 if (backtrack() == NULL) { 1384 assembler->Backtrack(); 1385 } else { 1386 assembler->PopCurrentPosition(); 1387 assembler->GoTo(backtrack()); 1388 } 1389 } 1390 1391 1392 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1393 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1394 1395 // Omit flushing the trace. We discard the entire stack frame anyway. 1396 1397 if (!label()->is_bound()) { 1398 // We are completely independent of the trace, since we ignore it, 1399 // so this code can be used as the generic version. 1400 assembler->Bind(label()); 1401 } 1402 1403 // Throw away everything on the backtrack stack since the start 1404 // of the negative submatch and restore the character position. 1405 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1406 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1407 if (clear_capture_count_ > 0) { 1408 // Clear any captures that might have been performed during the success 1409 // of the body of the negative look-ahead. 1410 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1411 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1412 } 1413 // Now that we have unwound the stack we find at the top of the stack the 1414 // backtrack that the BeginSubmatch node got. 1415 assembler->Backtrack(); 1416 } 1417 1418 1419 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1420 if (!trace->is_trivial()) { 1421 trace->Flush(compiler, this); 1422 return; 1423 } 1424 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1425 if (!label()->is_bound()) { 1426 assembler->Bind(label()); 1427 } 1428 switch (action_) { 1429 case ACCEPT: 1430 assembler->Succeed(); 1431 return; 1432 case BACKTRACK: 1433 assembler->GoTo(trace->backtrack()); 1434 return; 1435 case NEGATIVE_SUBMATCH_SUCCESS: 1436 // This case is handled in a different virtual method. 1437 UNREACHABLE(); 1438 } 1439 UNIMPLEMENTED(); 1440 } 1441 1442 1443 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1444 if (guards_ == NULL) 1445 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1446 guards_->Add(guard, zone); 1447 } 1448 1449 1450 ActionNode* ActionNode::SetRegister(int reg, 1451 int val, 1452 RegExpNode* on_success) { 1453 ActionNode* result = 1454 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1455 result->data_.u_store_register.reg = reg; 1456 result->data_.u_store_register.value = val; 1457 return result; 1458 } 1459 1460 1461 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1462 ActionNode* result = 1463 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1464 result->data_.u_increment_register.reg = reg; 1465 return result; 1466 } 1467 1468 1469 ActionNode* ActionNode::StorePosition(int reg, 1470 bool is_capture, 1471 RegExpNode* on_success) { 1472 ActionNode* result = 1473 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1474 result->data_.u_position_register.reg = reg; 1475 result->data_.u_position_register.is_capture = is_capture; 1476 return result; 1477 } 1478 1479 1480 ActionNode* ActionNode::ClearCaptures(Interval range, 1481 RegExpNode* on_success) { 1482 ActionNode* result = 1483 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1484 result->data_.u_clear_captures.range_from = range.from(); 1485 result->data_.u_clear_captures.range_to = range.to(); 1486 return result; 1487 } 1488 1489 1490 ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1491 int position_reg, 1492 RegExpNode* on_success) { 1493 ActionNode* result = 1494 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1495 result->data_.u_submatch.stack_pointer_register = stack_reg; 1496 result->data_.u_submatch.current_position_register = position_reg; 1497 return result; 1498 } 1499 1500 1501 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1502 int position_reg, 1503 int clear_register_count, 1504 int clear_register_from, 1505 RegExpNode* on_success) { 1506 ActionNode* result = 1507 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1508 result->data_.u_submatch.stack_pointer_register = stack_reg; 1509 result->data_.u_submatch.current_position_register = position_reg; 1510 result->data_.u_submatch.clear_register_count = clear_register_count; 1511 result->data_.u_submatch.clear_register_from = clear_register_from; 1512 return result; 1513 } 1514 1515 1516 ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1517 int repetition_register, 1518 int repetition_limit, 1519 RegExpNode* on_success) { 1520 ActionNode* result = 1521 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1522 result->data_.u_empty_match_check.start_register = start_register; 1523 result->data_.u_empty_match_check.repetition_register = repetition_register; 1524 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1525 return result; 1526 } 1527 1528 1529 #define DEFINE_ACCEPT(Type) \ 1530 void Type##Node::Accept(NodeVisitor* visitor) { \ 1531 visitor->Visit##Type(this); \ 1532 } 1533 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1534 #undef DEFINE_ACCEPT 1535 1536 1537 void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1538 visitor->VisitLoopChoice(this); 1539 } 1540 1541 1542 // ------------------------------------------------------------------- 1543 // Emit code. 1544 1545 1546 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1547 Guard* guard, 1548 Trace* trace) { 1549 switch (guard->op()) { 1550 case Guard::LT: 1551 DCHECK(!trace->mentions_reg(guard->reg())); 1552 macro_assembler->IfRegisterGE(guard->reg(), 1553 guard->value(), 1554 trace->backtrack()); 1555 break; 1556 case Guard::GEQ: 1557 DCHECK(!trace->mentions_reg(guard->reg())); 1558 macro_assembler->IfRegisterLT(guard->reg(), 1559 guard->value(), 1560 trace->backtrack()); 1561 break; 1562 } 1563 } 1564 1565 1566 // Returns the number of characters in the equivalence class, omitting those 1567 // that cannot occur in the source string because it is Latin1. 1568 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1569 bool one_byte_subject, 1570 unibrow::uchar* letters) { 1571 int length = 1572 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1573 // Unibrow returns 0 or 1 for characters where case independence is 1574 // trivial. 1575 if (length == 0) { 1576 letters[0] = character; 1577 length = 1; 1578 } 1579 1580 if (one_byte_subject) { 1581 int new_length = 0; 1582 for (int i = 0; i < length; i++) { 1583 if (letters[i] <= String::kMaxOneByteCharCode) { 1584 letters[new_length++] = letters[i]; 1585 } 1586 } 1587 length = new_length; 1588 } 1589 1590 return length; 1591 } 1592 1593 1594 static inline bool EmitSimpleCharacter(Isolate* isolate, 1595 RegExpCompiler* compiler, 1596 uc16 c, 1597 Label* on_failure, 1598 int cp_offset, 1599 bool check, 1600 bool preloaded) { 1601 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1602 bool bound_checked = false; 1603 if (!preloaded) { 1604 assembler->LoadCurrentCharacter( 1605 cp_offset, 1606 on_failure, 1607 check); 1608 bound_checked = true; 1609 } 1610 assembler->CheckNotCharacter(c, on_failure); 1611 return bound_checked; 1612 } 1613 1614 1615 // Only emits non-letters (things that don't have case). Only used for case 1616 // independent matches. 1617 static inline bool EmitAtomNonLetter(Isolate* isolate, 1618 RegExpCompiler* compiler, 1619 uc16 c, 1620 Label* on_failure, 1621 int cp_offset, 1622 bool check, 1623 bool preloaded) { 1624 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1625 bool one_byte = compiler->one_byte(); 1626 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1627 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1628 if (length < 1) { 1629 // This can't match. Must be an one-byte subject and a non-one-byte 1630 // character. We do not need to do anything since the one-byte pass 1631 // already handled this. 1632 return false; // Bounds not checked. 1633 } 1634 bool checked = false; 1635 // We handle the length > 1 case in a later pass. 1636 if (length == 1) { 1637 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1638 // Can't match - see above. 1639 return false; // Bounds not checked. 1640 } 1641 if (!preloaded) { 1642 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1643 checked = check; 1644 } 1645 macro_assembler->CheckNotCharacter(c, on_failure); 1646 } 1647 return checked; 1648 } 1649 1650 1651 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1652 bool one_byte, uc16 c1, uc16 c2, 1653 Label* on_failure) { 1654 uc16 char_mask; 1655 if (one_byte) { 1656 char_mask = String::kMaxOneByteCharCode; 1657 } else { 1658 char_mask = String::kMaxUtf16CodeUnit; 1659 } 1660 uc16 exor = c1 ^ c2; 1661 // Check whether exor has only one bit set. 1662 if (((exor - 1) & exor) == 0) { 1663 // If c1 and c2 differ only by one bit. 1664 // Ecma262UnCanonicalize always gives the highest number last. 1665 DCHECK(c2 > c1); 1666 uc16 mask = char_mask ^ exor; 1667 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1668 return true; 1669 } 1670 DCHECK(c2 > c1); 1671 uc16 diff = c2 - c1; 1672 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1673 // If the characters differ by 2^n but don't differ by one bit then 1674 // subtract the difference from the found character, then do the or 1675 // trick. We avoid the theoretical case where negative numbers are 1676 // involved in order to simplify code generation. 1677 uc16 mask = char_mask ^ diff; 1678 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1679 diff, 1680 mask, 1681 on_failure); 1682 return true; 1683 } 1684 return false; 1685 } 1686 1687 1688 typedef bool EmitCharacterFunction(Isolate* isolate, 1689 RegExpCompiler* compiler, 1690 uc16 c, 1691 Label* on_failure, 1692 int cp_offset, 1693 bool check, 1694 bool preloaded); 1695 1696 // Only emits letters (things that have case). Only used for case independent 1697 // matches. 1698 static inline bool EmitAtomLetter(Isolate* isolate, 1699 RegExpCompiler* compiler, 1700 uc16 c, 1701 Label* on_failure, 1702 int cp_offset, 1703 bool check, 1704 bool preloaded) { 1705 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1706 bool one_byte = compiler->one_byte(); 1707 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1708 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1709 if (length <= 1) return false; 1710 // We may not need to check against the end of the input string 1711 // if this character lies before a character that matched. 1712 if (!preloaded) { 1713 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1714 } 1715 Label ok; 1716 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1717 switch (length) { 1718 case 2: { 1719 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1720 chars[1], on_failure)) { 1721 } else { 1722 macro_assembler->CheckCharacter(chars[0], &ok); 1723 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1724 macro_assembler->Bind(&ok); 1725 } 1726 break; 1727 } 1728 case 4: 1729 macro_assembler->CheckCharacter(chars[3], &ok); 1730 // Fall through! 1731 case 3: 1732 macro_assembler->CheckCharacter(chars[0], &ok); 1733 macro_assembler->CheckCharacter(chars[1], &ok); 1734 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1735 macro_assembler->Bind(&ok); 1736 break; 1737 default: 1738 UNREACHABLE(); 1739 break; 1740 } 1741 return true; 1742 } 1743 1744 1745 static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1746 int border, 1747 Label* fall_through, 1748 Label* above_or_equal, 1749 Label* below) { 1750 if (below != fall_through) { 1751 masm->CheckCharacterLT(border, below); 1752 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1753 } else { 1754 masm->CheckCharacterGT(border - 1, above_or_equal); 1755 } 1756 } 1757 1758 1759 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1760 int first, 1761 int last, 1762 Label* fall_through, 1763 Label* in_range, 1764 Label* out_of_range) { 1765 if (in_range == fall_through) { 1766 if (first == last) { 1767 masm->CheckNotCharacter(first, out_of_range); 1768 } else { 1769 masm->CheckCharacterNotInRange(first, last, out_of_range); 1770 } 1771 } else { 1772 if (first == last) { 1773 masm->CheckCharacter(first, in_range); 1774 } else { 1775 masm->CheckCharacterInRange(first, last, in_range); 1776 } 1777 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1778 } 1779 } 1780 1781 1782 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1783 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1784 static void EmitUseLookupTable( 1785 RegExpMacroAssembler* masm, 1786 ZoneList<int>* ranges, 1787 int start_index, 1788 int end_index, 1789 int min_char, 1790 Label* fall_through, 1791 Label* even_label, 1792 Label* odd_label) { 1793 static const int kSize = RegExpMacroAssembler::kTableSize; 1794 static const int kMask = RegExpMacroAssembler::kTableMask; 1795 1796 int base = (min_char & ~kMask); 1797 USE(base); 1798 1799 // Assert that everything is on one kTableSize page. 1800 for (int i = start_index; i <= end_index; i++) { 1801 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1802 } 1803 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1804 1805 char templ[kSize]; 1806 Label* on_bit_set; 1807 Label* on_bit_clear; 1808 int bit; 1809 if (even_label == fall_through) { 1810 on_bit_set = odd_label; 1811 on_bit_clear = even_label; 1812 bit = 1; 1813 } else { 1814 on_bit_set = even_label; 1815 on_bit_clear = odd_label; 1816 bit = 0; 1817 } 1818 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1819 templ[i] = bit; 1820 } 1821 int j = 0; 1822 bit ^= 1; 1823 for (int i = start_index; i < end_index; i++) { 1824 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1825 templ[j] = bit; 1826 } 1827 bit ^= 1; 1828 } 1829 for (int i = j; i < kSize; i++) { 1830 templ[i] = bit; 1831 } 1832 Factory* factory = masm->isolate()->factory(); 1833 // TODO(erikcorry): Cache these. 1834 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1835 for (int i = 0; i < kSize; i++) { 1836 ba->set(i, templ[i]); 1837 } 1838 masm->CheckBitInTable(ba, on_bit_set); 1839 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1840 } 1841 1842 1843 static void CutOutRange(RegExpMacroAssembler* masm, 1844 ZoneList<int>* ranges, 1845 int start_index, 1846 int end_index, 1847 int cut_index, 1848 Label* even_label, 1849 Label* odd_label) { 1850 bool odd = (((cut_index - start_index) & 1) == 1); 1851 Label* in_range_label = odd ? odd_label : even_label; 1852 Label dummy; 1853 EmitDoubleBoundaryTest(masm, 1854 ranges->at(cut_index), 1855 ranges->at(cut_index + 1) - 1, 1856 &dummy, 1857 in_range_label, 1858 &dummy); 1859 DCHECK(!dummy.is_linked()); 1860 // Cut out the single range by rewriting the array. This creates a new 1861 // range that is a merger of the two ranges on either side of the one we 1862 // are cutting out. The oddity of the labels is preserved. 1863 for (int j = cut_index; j > start_index; j--) { 1864 ranges->at(j) = ranges->at(j - 1); 1865 } 1866 for (int j = cut_index + 1; j < end_index; j++) { 1867 ranges->at(j) = ranges->at(j + 1); 1868 } 1869 } 1870 1871 1872 // Unicode case. Split the search space into kSize spaces that are handled 1873 // with recursion. 1874 static void SplitSearchSpace(ZoneList<int>* ranges, 1875 int start_index, 1876 int end_index, 1877 int* new_start_index, 1878 int* new_end_index, 1879 int* border) { 1880 static const int kSize = RegExpMacroAssembler::kTableSize; 1881 static const int kMask = RegExpMacroAssembler::kTableMask; 1882 1883 int first = ranges->at(start_index); 1884 int last = ranges->at(end_index) - 1; 1885 1886 *new_start_index = start_index; 1887 *border = (ranges->at(start_index) & ~kMask) + kSize; 1888 while (*new_start_index < end_index) { 1889 if (ranges->at(*new_start_index) > *border) break; 1890 (*new_start_index)++; 1891 } 1892 // new_start_index is the index of the first edge that is beyond the 1893 // current kSize space. 1894 1895 // For very large search spaces we do a binary chop search of the non-Latin1 1896 // space instead of just going to the end of the current kSize space. The 1897 // heuristics are complicated a little by the fact that any 128-character 1898 // encoding space can be quickly tested with a table lookup, so we don't 1899 // wish to do binary chop search at a smaller granularity than that. A 1900 // 128-character space can take up a lot of space in the ranges array if, 1901 // for example, we only want to match every second character (eg. the lower 1902 // case characters on some Unicode pages). 1903 int binary_chop_index = (end_index + start_index) / 2; 1904 // The first test ensures that we get to the code that handles the Latin1 1905 // range with a single not-taken branch, speeding up this important 1906 // character range (even non-Latin1 charset-based text has spaces and 1907 // punctuation). 1908 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1909 end_index - start_index > (*new_start_index - start_index) * 2 && 1910 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1911 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1912 int scan_forward_for_section_border = binary_chop_index;; 1913 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1914 1915 while (scan_forward_for_section_border < end_index) { 1916 if (ranges->at(scan_forward_for_section_border) > new_border) { 1917 *new_start_index = scan_forward_for_section_border; 1918 *border = new_border; 1919 break; 1920 } 1921 scan_forward_for_section_border++; 1922 } 1923 } 1924 1925 DCHECK(*new_start_index > start_index); 1926 *new_end_index = *new_start_index - 1; 1927 if (ranges->at(*new_end_index) == *border) { 1928 (*new_end_index)--; 1929 } 1930 if (*border >= ranges->at(end_index)) { 1931 *border = ranges->at(end_index); 1932 *new_start_index = end_index; // Won't be used. 1933 *new_end_index = end_index - 1; 1934 } 1935 } 1936 1937 1938 // Gets a series of segment boundaries representing a character class. If the 1939 // character is in the range between an even and an odd boundary (counting from 1940 // start_index) then go to even_label, otherwise go to odd_label. We already 1941 // know that the character is in the range of min_char to max_char inclusive. 1942 // Either label can be NULL indicating backtracking. Either label can also be 1943 // equal to the fall_through label. 1944 static void GenerateBranches(RegExpMacroAssembler* masm, 1945 ZoneList<int>* ranges, 1946 int start_index, 1947 int end_index, 1948 uc16 min_char, 1949 uc16 max_char, 1950 Label* fall_through, 1951 Label* even_label, 1952 Label* odd_label) { 1953 int first = ranges->at(start_index); 1954 int last = ranges->at(end_index) - 1; 1955 1956 DCHECK_LT(min_char, first); 1957 1958 // Just need to test if the character is before or on-or-after 1959 // a particular character. 1960 if (start_index == end_index) { 1961 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 1962 return; 1963 } 1964 1965 // Another almost trivial case: There is one interval in the middle that is 1966 // different from the end intervals. 1967 if (start_index + 1 == end_index) { 1968 EmitDoubleBoundaryTest( 1969 masm, first, last, fall_through, even_label, odd_label); 1970 return; 1971 } 1972 1973 // It's not worth using table lookup if there are very few intervals in the 1974 // character class. 1975 if (end_index - start_index <= 6) { 1976 // It is faster to test for individual characters, so we look for those 1977 // first, then try arbitrary ranges in the second round. 1978 static int kNoCutIndex = -1; 1979 int cut = kNoCutIndex; 1980 for (int i = start_index; i < end_index; i++) { 1981 if (ranges->at(i) == ranges->at(i + 1) - 1) { 1982 cut = i; 1983 break; 1984 } 1985 } 1986 if (cut == kNoCutIndex) cut = start_index; 1987 CutOutRange( 1988 masm, ranges, start_index, end_index, cut, even_label, odd_label); 1989 DCHECK_GE(end_index - start_index, 2); 1990 GenerateBranches(masm, 1991 ranges, 1992 start_index + 1, 1993 end_index - 1, 1994 min_char, 1995 max_char, 1996 fall_through, 1997 even_label, 1998 odd_label); 1999 return; 2000 } 2001 2002 // If there are a lot of intervals in the regexp, then we will use tables to 2003 // determine whether the character is inside or outside the character class. 2004 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2005 2006 if ((max_char >> kBits) == (min_char >> kBits)) { 2007 EmitUseLookupTable(masm, 2008 ranges, 2009 start_index, 2010 end_index, 2011 min_char, 2012 fall_through, 2013 even_label, 2014 odd_label); 2015 return; 2016 } 2017 2018 if ((min_char >> kBits) != (first >> kBits)) { 2019 masm->CheckCharacterLT(first, odd_label); 2020 GenerateBranches(masm, 2021 ranges, 2022 start_index + 1, 2023 end_index, 2024 first, 2025 max_char, 2026 fall_through, 2027 odd_label, 2028 even_label); 2029 return; 2030 } 2031 2032 int new_start_index = 0; 2033 int new_end_index = 0; 2034 int border = 0; 2035 2036 SplitSearchSpace(ranges, 2037 start_index, 2038 end_index, 2039 &new_start_index, 2040 &new_end_index, 2041 &border); 2042 2043 Label handle_rest; 2044 Label* above = &handle_rest; 2045 if (border == last + 1) { 2046 // We didn't find any section that started after the limit, so everything 2047 // above the border is one of the terminal labels. 2048 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2049 DCHECK(new_end_index == end_index - 1); 2050 } 2051 2052 DCHECK_LE(start_index, new_end_index); 2053 DCHECK_LE(new_start_index, end_index); 2054 DCHECK_LT(start_index, new_start_index); 2055 DCHECK_LT(new_end_index, end_index); 2056 DCHECK(new_end_index + 1 == new_start_index || 2057 (new_end_index + 2 == new_start_index && 2058 border == ranges->at(new_end_index + 1))); 2059 DCHECK_LT(min_char, border - 1); 2060 DCHECK_LT(border, max_char); 2061 DCHECK_LT(ranges->at(new_end_index), border); 2062 DCHECK(border < ranges->at(new_start_index) || 2063 (border == ranges->at(new_start_index) && 2064 new_start_index == end_index && 2065 new_end_index == end_index - 1 && 2066 border == last + 1)); 2067 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2068 2069 masm->CheckCharacterGT(border - 1, above); 2070 Label dummy; 2071 GenerateBranches(masm, 2072 ranges, 2073 start_index, 2074 new_end_index, 2075 min_char, 2076 border - 1, 2077 &dummy, 2078 even_label, 2079 odd_label); 2080 if (handle_rest.is_linked()) { 2081 masm->Bind(&handle_rest); 2082 bool flip = (new_start_index & 1) != (start_index & 1); 2083 GenerateBranches(masm, 2084 ranges, 2085 new_start_index, 2086 end_index, 2087 border, 2088 max_char, 2089 &dummy, 2090 flip ? odd_label : even_label, 2091 flip ? even_label : odd_label); 2092 } 2093 } 2094 2095 2096 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2097 RegExpCharacterClass* cc, bool one_byte, 2098 Label* on_failure, int cp_offset, bool check_offset, 2099 bool preloaded, Zone* zone) { 2100 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2101 if (!CharacterRange::IsCanonical(ranges)) { 2102 CharacterRange::Canonicalize(ranges); 2103 } 2104 2105 int max_char; 2106 if (one_byte) { 2107 max_char = String::kMaxOneByteCharCode; 2108 } else { 2109 max_char = String::kMaxUtf16CodeUnit; 2110 } 2111 2112 int range_count = ranges->length(); 2113 2114 int last_valid_range = range_count - 1; 2115 while (last_valid_range >= 0) { 2116 CharacterRange& range = ranges->at(last_valid_range); 2117 if (range.from() <= max_char) { 2118 break; 2119 } 2120 last_valid_range--; 2121 } 2122 2123 if (last_valid_range < 0) { 2124 if (!cc->is_negated()) { 2125 macro_assembler->GoTo(on_failure); 2126 } 2127 if (check_offset) { 2128 macro_assembler->CheckPosition(cp_offset, on_failure); 2129 } 2130 return; 2131 } 2132 2133 if (last_valid_range == 0 && 2134 ranges->at(0).IsEverything(max_char)) { 2135 if (cc->is_negated()) { 2136 macro_assembler->GoTo(on_failure); 2137 } else { 2138 // This is a common case hit by non-anchored expressions. 2139 if (check_offset) { 2140 macro_assembler->CheckPosition(cp_offset, on_failure); 2141 } 2142 } 2143 return; 2144 } 2145 if (last_valid_range == 0 && 2146 !cc->is_negated() && 2147 ranges->at(0).IsEverything(max_char)) { 2148 // This is a common case hit by non-anchored expressions. 2149 if (check_offset) { 2150 macro_assembler->CheckPosition(cp_offset, on_failure); 2151 } 2152 return; 2153 } 2154 2155 if (!preloaded) { 2156 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2157 } 2158 2159 if (cc->is_standard(zone) && 2160 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2161 on_failure)) { 2162 return; 2163 } 2164 2165 2166 // A new list with ascending entries. Each entry is a code unit 2167 // where there is a boundary between code units that are part of 2168 // the class and code units that are not. Normally we insert an 2169 // entry at zero which goes to the failure label, but if there 2170 // was already one there we fall through for success on that entry. 2171 // Subsequent entries have alternating meaning (success/failure). 2172 ZoneList<int>* range_boundaries = 2173 new(zone) ZoneList<int>(last_valid_range, zone); 2174 2175 bool zeroth_entry_is_failure = !cc->is_negated(); 2176 2177 for (int i = 0; i <= last_valid_range; i++) { 2178 CharacterRange& range = ranges->at(i); 2179 if (range.from() == 0) { 2180 DCHECK_EQ(i, 0); 2181 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2182 } else { 2183 range_boundaries->Add(range.from(), zone); 2184 } 2185 range_boundaries->Add(range.to() + 1, zone); 2186 } 2187 int end_index = range_boundaries->length() - 1; 2188 if (range_boundaries->at(end_index) > max_char) { 2189 end_index--; 2190 } 2191 2192 Label fall_through; 2193 GenerateBranches(macro_assembler, 2194 range_boundaries, 2195 0, // start_index. 2196 end_index, 2197 0, // min_char. 2198 max_char, 2199 &fall_through, 2200 zeroth_entry_is_failure ? &fall_through : on_failure, 2201 zeroth_entry_is_failure ? on_failure : &fall_through); 2202 macro_assembler->Bind(&fall_through); 2203 } 2204 2205 2206 RegExpNode::~RegExpNode() { 2207 } 2208 2209 2210 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2211 Trace* trace) { 2212 // If we are generating a greedy loop then don't stop and don't reuse code. 2213 if (trace->stop_node() != NULL) { 2214 return CONTINUE; 2215 } 2216 2217 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2218 if (trace->is_trivial()) { 2219 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 2220 // If a generic version is already scheduled to be generated or we have 2221 // recursed too deeply then just generate a jump to that code. 2222 macro_assembler->GoTo(&label_); 2223 // This will queue it up for generation of a generic version if it hasn't 2224 // already been queued. 2225 compiler->AddWork(this); 2226 return DONE; 2227 } 2228 // Generate generic version of the node and bind the label for later use. 2229 macro_assembler->Bind(&label_); 2230 return CONTINUE; 2231 } 2232 2233 // We are being asked to make a non-generic version. Keep track of how many 2234 // non-generic versions we generate so as not to overdo it. 2235 trace_count_++; 2236 if (KeepRecursing(compiler) && compiler->optimize() && 2237 trace_count_ < kMaxCopiesCodeGenerated) { 2238 return CONTINUE; 2239 } 2240 2241 // If we get here code has been generated for this node too many times or 2242 // recursion is too deep. Time to switch to a generic version. The code for 2243 // generic versions above can handle deep recursion properly. 2244 bool was_limiting = compiler->limiting_recursion(); 2245 compiler->set_limiting_recursion(true); 2246 trace->Flush(compiler, this); 2247 compiler->set_limiting_recursion(was_limiting); 2248 return DONE; 2249 } 2250 2251 2252 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 2253 return !compiler->limiting_recursion() && 2254 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 2255 } 2256 2257 2258 int ActionNode::EatsAtLeast(int still_to_find, 2259 int budget, 2260 bool not_at_start) { 2261 if (budget <= 0) return 0; 2262 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2263 return on_success()->EatsAtLeast(still_to_find, 2264 budget - 1, 2265 not_at_start); 2266 } 2267 2268 2269 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2270 BoyerMooreLookahead* bm, bool not_at_start) { 2271 if (action_type_ == BEGIN_SUBMATCH) { 2272 bm->SetRest(offset); 2273 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2274 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2275 } 2276 SaveBMInfo(bm, not_at_start, offset); 2277 } 2278 2279 2280 int AssertionNode::EatsAtLeast(int still_to_find, 2281 int budget, 2282 bool not_at_start) { 2283 if (budget <= 0) return 0; 2284 // If we know we are not at the start and we are asked "how many characters 2285 // will you match if you succeed?" then we can answer anything since false 2286 // implies false. So lets just return the max answer (still_to_find) since 2287 // that won't prevent us from preloading a lot of characters for the other 2288 // branches in the node graph. 2289 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2290 return on_success()->EatsAtLeast(still_to_find, 2291 budget - 1, 2292 not_at_start); 2293 } 2294 2295 2296 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2297 BoyerMooreLookahead* bm, bool not_at_start) { 2298 // Match the behaviour of EatsAtLeast on this node. 2299 if (assertion_type() == AT_START && not_at_start) return; 2300 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2301 SaveBMInfo(bm, not_at_start, offset); 2302 } 2303 2304 2305 int BackReferenceNode::EatsAtLeast(int still_to_find, 2306 int budget, 2307 bool not_at_start) { 2308 if (read_backward()) return 0; 2309 if (budget <= 0) return 0; 2310 return on_success()->EatsAtLeast(still_to_find, 2311 budget - 1, 2312 not_at_start); 2313 } 2314 2315 2316 int TextNode::EatsAtLeast(int still_to_find, 2317 int budget, 2318 bool not_at_start) { 2319 if (read_backward()) return 0; 2320 int answer = Length(); 2321 if (answer >= still_to_find) return answer; 2322 if (budget <= 0) return answer; 2323 // We are not at start after this node so we set the last argument to 'true'. 2324 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2325 budget - 1, 2326 true); 2327 } 2328 2329 2330 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget, 2331 bool not_at_start) { 2332 if (budget <= 0) return 0; 2333 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2334 // afterwards. 2335 RegExpNode* node = alternatives_->at(1).node(); 2336 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2337 } 2338 2339 2340 void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 2341 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 2342 bool not_at_start) { 2343 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2344 // afterwards. 2345 RegExpNode* node = alternatives_->at(1).node(); 2346 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2347 } 2348 2349 2350 int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2351 int budget, 2352 RegExpNode* ignore_this_node, 2353 bool not_at_start) { 2354 if (budget <= 0) return 0; 2355 int min = 100; 2356 int choice_count = alternatives_->length(); 2357 budget = (budget - 1) / choice_count; 2358 for (int i = 0; i < choice_count; i++) { 2359 RegExpNode* node = alternatives_->at(i).node(); 2360 if (node == ignore_this_node) continue; 2361 int node_eats_at_least = 2362 node->EatsAtLeast(still_to_find, budget, not_at_start); 2363 if (node_eats_at_least < min) min = node_eats_at_least; 2364 if (min == 0) return 0; 2365 } 2366 return min; 2367 } 2368 2369 2370 int LoopChoiceNode::EatsAtLeast(int still_to_find, 2371 int budget, 2372 bool not_at_start) { 2373 return EatsAtLeastHelper(still_to_find, 2374 budget - 1, 2375 loop_node_, 2376 not_at_start); 2377 } 2378 2379 2380 int ChoiceNode::EatsAtLeast(int still_to_find, 2381 int budget, 2382 bool not_at_start) { 2383 return EatsAtLeastHelper(still_to_find, 2384 budget, 2385 NULL, 2386 not_at_start); 2387 } 2388 2389 2390 // Takes the left-most 1-bit and smears it out, setting all bits to its right. 2391 static inline uint32_t SmearBitsRight(uint32_t v) { 2392 v |= v >> 1; 2393 v |= v >> 2; 2394 v |= v >> 4; 2395 v |= v >> 8; 2396 v |= v >> 16; 2397 return v; 2398 } 2399 2400 2401 bool QuickCheckDetails::Rationalize(bool asc) { 2402 bool found_useful_op = false; 2403 uint32_t char_mask; 2404 if (asc) { 2405 char_mask = String::kMaxOneByteCharCode; 2406 } else { 2407 char_mask = String::kMaxUtf16CodeUnit; 2408 } 2409 mask_ = 0; 2410 value_ = 0; 2411 int char_shift = 0; 2412 for (int i = 0; i < characters_; i++) { 2413 Position* pos = &positions_[i]; 2414 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2415 found_useful_op = true; 2416 } 2417 mask_ |= (pos->mask & char_mask) << char_shift; 2418 value_ |= (pos->value & char_mask) << char_shift; 2419 char_shift += asc ? 8 : 16; 2420 } 2421 return found_useful_op; 2422 } 2423 2424 2425 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2426 Trace* bounds_check_trace, 2427 Trace* trace, 2428 bool preload_has_checked_bounds, 2429 Label* on_possible_success, 2430 QuickCheckDetails* details, 2431 bool fall_through_on_failure) { 2432 if (details->characters() == 0) return false; 2433 GetQuickCheckDetails( 2434 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2435 if (details->cannot_match()) return false; 2436 if (!details->Rationalize(compiler->one_byte())) return false; 2437 DCHECK(details->characters() == 1 || 2438 compiler->macro_assembler()->CanReadUnaligned()); 2439 uint32_t mask = details->mask(); 2440 uint32_t value = details->value(); 2441 2442 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2443 2444 if (trace->characters_preloaded() != details->characters()) { 2445 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 2446 // We are attempting to preload the minimum number of characters 2447 // any choice would eat, so if the bounds check fails, then none of the 2448 // choices can succeed, so we can just immediately backtrack, rather 2449 // than go to the next choice. 2450 assembler->LoadCurrentCharacter(trace->cp_offset(), 2451 bounds_check_trace->backtrack(), 2452 !preload_has_checked_bounds, 2453 details->characters()); 2454 } 2455 2456 2457 bool need_mask = true; 2458 2459 if (details->characters() == 1) { 2460 // If number of characters preloaded is 1 then we used a byte or 16 bit 2461 // load so the value is already masked down. 2462 uint32_t char_mask; 2463 if (compiler->one_byte()) { 2464 char_mask = String::kMaxOneByteCharCode; 2465 } else { 2466 char_mask = String::kMaxUtf16CodeUnit; 2467 } 2468 if ((mask & char_mask) == char_mask) need_mask = false; 2469 mask &= char_mask; 2470 } else { 2471 // For 2-character preloads in one-byte mode or 1-character preloads in 2472 // two-byte mode we also use a 16 bit load with zero extend. 2473 if (details->characters() == 2 && compiler->one_byte()) { 2474 if ((mask & 0xffff) == 0xffff) need_mask = false; 2475 } else if (details->characters() == 1 && !compiler->one_byte()) { 2476 if ((mask & 0xffff) == 0xffff) need_mask = false; 2477 } else { 2478 if (mask == 0xffffffff) need_mask = false; 2479 } 2480 } 2481 2482 if (fall_through_on_failure) { 2483 if (need_mask) { 2484 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2485 } else { 2486 assembler->CheckCharacter(value, on_possible_success); 2487 } 2488 } else { 2489 if (need_mask) { 2490 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2491 } else { 2492 assembler->CheckNotCharacter(value, trace->backtrack()); 2493 } 2494 } 2495 return true; 2496 } 2497 2498 2499 // Here is the meat of GetQuickCheckDetails (see also the comment on the 2500 // super-class in the .h file). 2501 // 2502 // We iterate along the text object, building up for each character a 2503 // mask and value that can be used to test for a quick failure to match. 2504 // The masks and values for the positions will be combined into a single 2505 // machine word for the current character width in order to be used in 2506 // generating a quick check. 2507 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2508 RegExpCompiler* compiler, 2509 int characters_filled_in, 2510 bool not_at_start) { 2511 // Do not collect any quick check details if the text node reads backward, 2512 // since it reads in the opposite direction than we use for quick checks. 2513 if (read_backward()) return; 2514 Isolate* isolate = compiler->macro_assembler()->isolate(); 2515 DCHECK(characters_filled_in < details->characters()); 2516 int characters = details->characters(); 2517 int char_mask; 2518 if (compiler->one_byte()) { 2519 char_mask = String::kMaxOneByteCharCode; 2520 } else { 2521 char_mask = String::kMaxUtf16CodeUnit; 2522 } 2523 for (int k = 0; k < elements()->length(); k++) { 2524 TextElement elm = elements()->at(k); 2525 if (elm.text_type() == TextElement::ATOM) { 2526 Vector<const uc16> quarks = elm.atom()->data(); 2527 for (int i = 0; i < characters && i < quarks.length(); i++) { 2528 QuickCheckDetails::Position* pos = 2529 details->positions(characters_filled_in); 2530 uc16 c = quarks[i]; 2531 if (compiler->ignore_case()) { 2532 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2533 int length = GetCaseIndependentLetters(isolate, c, 2534 compiler->one_byte(), chars); 2535 if (length == 0) { 2536 // This can happen because all case variants are non-Latin1, but we 2537 // know the input is Latin1. 2538 details->set_cannot_match(); 2539 pos->determines_perfectly = false; 2540 return; 2541 } 2542 if (length == 1) { 2543 // This letter has no case equivalents, so it's nice and simple 2544 // and the mask-compare will determine definitely whether we have 2545 // a match at this character position. 2546 pos->mask = char_mask; 2547 pos->value = c; 2548 pos->determines_perfectly = true; 2549 } else { 2550 uint32_t common_bits = char_mask; 2551 uint32_t bits = chars[0]; 2552 for (int j = 1; j < length; j++) { 2553 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2554 common_bits ^= differing_bits; 2555 bits &= common_bits; 2556 } 2557 // If length is 2 and common bits has only one zero in it then 2558 // our mask and compare instruction will determine definitely 2559 // whether we have a match at this character position. Otherwise 2560 // it can only be an approximate check. 2561 uint32_t one_zero = (common_bits | ~char_mask); 2562 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2563 pos->determines_perfectly = true; 2564 } 2565 pos->mask = common_bits; 2566 pos->value = bits; 2567 } 2568 } else { 2569 // Don't ignore case. Nice simple case where the mask-compare will 2570 // determine definitely whether we have a match at this character 2571 // position. 2572 if (c > char_mask) { 2573 details->set_cannot_match(); 2574 pos->determines_perfectly = false; 2575 return; 2576 } 2577 pos->mask = char_mask; 2578 pos->value = c; 2579 pos->determines_perfectly = true; 2580 } 2581 characters_filled_in++; 2582 DCHECK(characters_filled_in <= details->characters()); 2583 if (characters_filled_in == details->characters()) { 2584 return; 2585 } 2586 } 2587 } else { 2588 QuickCheckDetails::Position* pos = 2589 details->positions(characters_filled_in); 2590 RegExpCharacterClass* tree = elm.char_class(); 2591 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2592 if (tree->is_negated()) { 2593 // A quick check uses multi-character mask and compare. There is no 2594 // useful way to incorporate a negative char class into this scheme 2595 // so we just conservatively create a mask and value that will always 2596 // succeed. 2597 pos->mask = 0; 2598 pos->value = 0; 2599 } else { 2600 int first_range = 0; 2601 while (ranges->at(first_range).from() > char_mask) { 2602 first_range++; 2603 if (first_range == ranges->length()) { 2604 details->set_cannot_match(); 2605 pos->determines_perfectly = false; 2606 return; 2607 } 2608 } 2609 CharacterRange range = ranges->at(first_range); 2610 uc16 from = range.from(); 2611 uc16 to = range.to(); 2612 if (to > char_mask) { 2613 to = char_mask; 2614 } 2615 uint32_t differing_bits = (from ^ to); 2616 // A mask and compare is only perfect if the differing bits form a 2617 // number like 00011111 with one single block of trailing 1s. 2618 if ((differing_bits & (differing_bits + 1)) == 0 && 2619 from + differing_bits == to) { 2620 pos->determines_perfectly = true; 2621 } 2622 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2623 uint32_t bits = (from & common_bits); 2624 for (int i = first_range + 1; i < ranges->length(); i++) { 2625 CharacterRange range = ranges->at(i); 2626 uc16 from = range.from(); 2627 uc16 to = range.to(); 2628 if (from > char_mask) continue; 2629 if (to > char_mask) to = char_mask; 2630 // Here we are combining more ranges into the mask and compare 2631 // value. With each new range the mask becomes more sparse and 2632 // so the chances of a false positive rise. A character class 2633 // with multiple ranges is assumed never to be equivalent to a 2634 // mask and compare operation. 2635 pos->determines_perfectly = false; 2636 uint32_t new_common_bits = (from ^ to); 2637 new_common_bits = ~SmearBitsRight(new_common_bits); 2638 common_bits &= new_common_bits; 2639 bits &= new_common_bits; 2640 uint32_t differing_bits = (from & common_bits) ^ bits; 2641 common_bits ^= differing_bits; 2642 bits &= common_bits; 2643 } 2644 pos->mask = common_bits; 2645 pos->value = bits; 2646 } 2647 characters_filled_in++; 2648 DCHECK(characters_filled_in <= details->characters()); 2649 if (characters_filled_in == details->characters()) { 2650 return; 2651 } 2652 } 2653 } 2654 DCHECK(characters_filled_in != details->characters()); 2655 if (!details->cannot_match()) { 2656 on_success()-> GetQuickCheckDetails(details, 2657 compiler, 2658 characters_filled_in, 2659 true); 2660 } 2661 } 2662 2663 2664 void QuickCheckDetails::Clear() { 2665 for (int i = 0; i < characters_; i++) { 2666 positions_[i].mask = 0; 2667 positions_[i].value = 0; 2668 positions_[i].determines_perfectly = false; 2669 } 2670 characters_ = 0; 2671 } 2672 2673 2674 void QuickCheckDetails::Advance(int by, bool one_byte) { 2675 if (by >= characters_ || by < 0) { 2676 DCHECK_IMPLIES(by < 0, characters_ == 0); 2677 Clear(); 2678 return; 2679 } 2680 DCHECK_LE(characters_ - by, 4); 2681 DCHECK_LE(characters_, 4); 2682 for (int i = 0; i < characters_ - by; i++) { 2683 positions_[i] = positions_[by + i]; 2684 } 2685 for (int i = characters_ - by; i < characters_; i++) { 2686 positions_[i].mask = 0; 2687 positions_[i].value = 0; 2688 positions_[i].determines_perfectly = false; 2689 } 2690 characters_ -= by; 2691 // We could change mask_ and value_ here but we would never advance unless 2692 // they had already been used in a check and they won't be used again because 2693 // it would gain us nothing. So there's no point. 2694 } 2695 2696 2697 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2698 DCHECK(characters_ == other->characters_); 2699 if (other->cannot_match_) { 2700 return; 2701 } 2702 if (cannot_match_) { 2703 *this = *other; 2704 return; 2705 } 2706 for (int i = from_index; i < characters_; i++) { 2707 QuickCheckDetails::Position* pos = positions(i); 2708 QuickCheckDetails::Position* other_pos = other->positions(i); 2709 if (pos->mask != other_pos->mask || 2710 pos->value != other_pos->value || 2711 !other_pos->determines_perfectly) { 2712 // Our mask-compare operation will be approximate unless we have the 2713 // exact same operation on both sides of the alternation. 2714 pos->determines_perfectly = false; 2715 } 2716 pos->mask &= other_pos->mask; 2717 pos->value &= pos->mask; 2718 other_pos->value &= pos->mask; 2719 uc16 differing_bits = (pos->value ^ other_pos->value); 2720 pos->mask &= ~differing_bits; 2721 pos->value &= pos->mask; 2722 } 2723 } 2724 2725 2726 class VisitMarker { 2727 public: 2728 explicit VisitMarker(NodeInfo* info) : info_(info) { 2729 DCHECK(!info->visited); 2730 info->visited = true; 2731 } 2732 ~VisitMarker() { 2733 info_->visited = false; 2734 } 2735 private: 2736 NodeInfo* info_; 2737 }; 2738 2739 2740 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { 2741 if (info()->replacement_calculated) return replacement(); 2742 if (depth < 0) return this; 2743 DCHECK(!info()->visited); 2744 VisitMarker marker(info()); 2745 return FilterSuccessor(depth - 1, ignore_case); 2746 } 2747 2748 2749 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2750 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2751 if (next == NULL) return set_replacement(NULL); 2752 on_success_ = next; 2753 return set_replacement(this); 2754 } 2755 2756 2757 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2758 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2759 // TODO(dcarney): this could be a lot more efficient. 2760 return range.Contains(0x39c) || 2761 range.Contains(0x3bc) || range.Contains(0x178); 2762 } 2763 2764 2765 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2766 for (int i = 0; i < ranges->length(); i++) { 2767 // TODO(dcarney): this could be a lot more efficient. 2768 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2769 } 2770 return false; 2771 } 2772 2773 2774 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2775 if (info()->replacement_calculated) return replacement(); 2776 if (depth < 0) return this; 2777 DCHECK(!info()->visited); 2778 VisitMarker marker(info()); 2779 int element_count = elements()->length(); 2780 for (int i = 0; i < element_count; i++) { 2781 TextElement elm = elements()->at(i); 2782 if (elm.text_type() == TextElement::ATOM) { 2783 Vector<const uc16> quarks = elm.atom()->data(); 2784 for (int j = 0; j < quarks.length(); j++) { 2785 uint16_t c = quarks[j]; 2786 if (c <= String::kMaxOneByteCharCode) continue; 2787 if (!ignore_case) return set_replacement(NULL); 2788 // Here, we need to check for characters whose upper and lower cases 2789 // are outside the Latin-1 range. 2790 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2791 // Character is outside Latin-1 completely 2792 if (converted == 0) return set_replacement(NULL); 2793 // Convert quark to Latin-1 in place. 2794 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2795 copy[j] = converted; 2796 } 2797 } else { 2798 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2799 RegExpCharacterClass* cc = elm.char_class(); 2800 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2801 if (!CharacterRange::IsCanonical(ranges)) { 2802 CharacterRange::Canonicalize(ranges); 2803 } 2804 // Now they are in order so we only need to look at the first. 2805 int range_count = ranges->length(); 2806 if (cc->is_negated()) { 2807 if (range_count != 0 && 2808 ranges->at(0).from() == 0 && 2809 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2810 // This will be handled in a later filter. 2811 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2812 return set_replacement(NULL); 2813 } 2814 } else { 2815 if (range_count == 0 || 2816 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2817 // This will be handled in a later filter. 2818 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2819 return set_replacement(NULL); 2820 } 2821 } 2822 } 2823 } 2824 return FilterSuccessor(depth - 1, ignore_case); 2825 } 2826 2827 2828 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2829 if (info()->replacement_calculated) return replacement(); 2830 if (depth < 0) return this; 2831 if (info()->visited) return this; 2832 { 2833 VisitMarker marker(info()); 2834 2835 RegExpNode* continue_replacement = 2836 continue_node_->FilterOneByte(depth - 1, ignore_case); 2837 // If we can't continue after the loop then there is no sense in doing the 2838 // loop. 2839 if (continue_replacement == NULL) return set_replacement(NULL); 2840 } 2841 2842 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2843 } 2844 2845 2846 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2847 if (info()->replacement_calculated) return replacement(); 2848 if (depth < 0) return this; 2849 if (info()->visited) return this; 2850 VisitMarker marker(info()); 2851 int choice_count = alternatives_->length(); 2852 2853 for (int i = 0; i < choice_count; i++) { 2854 GuardedAlternative alternative = alternatives_->at(i); 2855 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2856 set_replacement(this); 2857 return this; 2858 } 2859 } 2860 2861 int surviving = 0; 2862 RegExpNode* survivor = NULL; 2863 for (int i = 0; i < choice_count; i++) { 2864 GuardedAlternative alternative = alternatives_->at(i); 2865 RegExpNode* replacement = 2866 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2867 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2868 if (replacement != NULL) { 2869 alternatives_->at(i).set_node(replacement); 2870 surviving++; 2871 survivor = replacement; 2872 } 2873 } 2874 if (surviving < 2) return set_replacement(survivor); 2875 2876 set_replacement(this); 2877 if (surviving == choice_count) { 2878 return this; 2879 } 2880 // Only some of the nodes survived the filtering. We need to rebuild the 2881 // alternatives list. 2882 ZoneList<GuardedAlternative>* new_alternatives = 2883 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2884 for (int i = 0; i < choice_count; i++) { 2885 RegExpNode* replacement = 2886 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2887 if (replacement != NULL) { 2888 alternatives_->at(i).set_node(replacement); 2889 new_alternatives->Add(alternatives_->at(i), zone()); 2890 } 2891 } 2892 alternatives_ = new_alternatives; 2893 return this; 2894 } 2895 2896 2897 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 2898 bool ignore_case) { 2899 if (info()->replacement_calculated) return replacement(); 2900 if (depth < 0) return this; 2901 if (info()->visited) return this; 2902 VisitMarker marker(info()); 2903 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2904 // afterwards. 2905 RegExpNode* node = alternatives_->at(1).node(); 2906 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2907 if (replacement == NULL) return set_replacement(NULL); 2908 alternatives_->at(1).set_node(replacement); 2909 2910 RegExpNode* neg_node = alternatives_->at(0).node(); 2911 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2912 // If the negative lookahead is always going to fail then 2913 // we don't need to check it. 2914 if (neg_replacement == NULL) return set_replacement(replacement); 2915 alternatives_->at(0).set_node(neg_replacement); 2916 return set_replacement(this); 2917 } 2918 2919 2920 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2921 RegExpCompiler* compiler, 2922 int characters_filled_in, 2923 bool not_at_start) { 2924 if (body_can_be_zero_length_ || info()->visited) return; 2925 VisitMarker marker(info()); 2926 return ChoiceNode::GetQuickCheckDetails(details, 2927 compiler, 2928 characters_filled_in, 2929 not_at_start); 2930 } 2931 2932 2933 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2934 BoyerMooreLookahead* bm, bool not_at_start) { 2935 if (body_can_be_zero_length_ || budget <= 0) { 2936 bm->SetRest(offset); 2937 SaveBMInfo(bm, not_at_start, offset); 2938 return; 2939 } 2940 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2941 SaveBMInfo(bm, not_at_start, offset); 2942 } 2943 2944 2945 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2946 RegExpCompiler* compiler, 2947 int characters_filled_in, 2948 bool not_at_start) { 2949 not_at_start = (not_at_start || not_at_start_); 2950 int choice_count = alternatives_->length(); 2951 DCHECK(choice_count > 0); 2952 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2953 compiler, 2954 characters_filled_in, 2955 not_at_start); 2956 for (int i = 1; i < choice_count; i++) { 2957 QuickCheckDetails new_details(details->characters()); 2958 RegExpNode* node = alternatives_->at(i).node(); 2959 node->GetQuickCheckDetails(&new_details, compiler, 2960 characters_filled_in, 2961 not_at_start); 2962 // Here we merge the quick match details of the two branches. 2963 details->Merge(&new_details, characters_filled_in); 2964 } 2965 } 2966 2967 2968 // Check for [0-9A-Z_a-z]. 2969 static void EmitWordCheck(RegExpMacroAssembler* assembler, 2970 Label* word, 2971 Label* non_word, 2972 bool fall_through_on_word) { 2973 if (assembler->CheckSpecialCharacterClass( 2974 fall_through_on_word ? 'w' : 'W', 2975 fall_through_on_word ? non_word : word)) { 2976 // Optimized implementation available. 2977 return; 2978 } 2979 assembler->CheckCharacterGT('z', non_word); 2980 assembler->CheckCharacterLT('0', non_word); 2981 assembler->CheckCharacterGT('a' - 1, word); 2982 assembler->CheckCharacterLT('9' + 1, word); 2983 assembler->CheckCharacterLT('A', non_word); 2984 assembler->CheckCharacterLT('Z' + 1, word); 2985 if (fall_through_on_word) { 2986 assembler->CheckNotCharacter('_', non_word); 2987 } else { 2988 assembler->CheckCharacter('_', word); 2989 } 2990 } 2991 2992 2993 // Emit the code to check for a ^ in multiline mode (1-character lookbehind 2994 // that matches newline or the start of input). 2995 static void EmitHat(RegExpCompiler* compiler, 2996 RegExpNode* on_success, 2997 Trace* trace) { 2998 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2999 // We will be loading the previous character into the current character 3000 // register. 3001 Trace new_trace(*trace); 3002 new_trace.InvalidateCurrentCharacter(); 3003 3004 Label ok; 3005 if (new_trace.cp_offset() == 0) { 3006 // The start of input counts as a newline in this context, so skip to 3007 // ok if we are at the start. 3008 assembler->CheckAtStart(&ok); 3009 } 3010 // We already checked that we are not at the start of input so it must be 3011 // OK to load the previous character. 3012 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3013 new_trace.backtrack(), 3014 false); 3015 if (!assembler->CheckSpecialCharacterClass('n', 3016 new_trace.backtrack())) { 3017 // Newline means \n, \r, 0x2028 or 0x2029. 3018 if (!compiler->one_byte()) { 3019 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3020 } 3021 assembler->CheckCharacter('\n', &ok); 3022 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3023 } 3024 assembler->Bind(&ok); 3025 on_success->Emit(compiler, &new_trace); 3026 } 3027 3028 3029 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3030 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3031 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3032 Isolate* isolate = assembler->isolate(); 3033 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3034 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3035 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3036 if (lookahead == NULL) { 3037 int eats_at_least = 3038 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3039 kRecursionBudget, 3040 not_at_start)); 3041 if (eats_at_least >= 1) { 3042 BoyerMooreLookahead* bm = 3043 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3044 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 3045 if (bm->at(0)->is_non_word()) 3046 next_is_word_character = Trace::FALSE_VALUE; 3047 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3048 } 3049 } else { 3050 if (lookahead->at(0)->is_non_word()) 3051 next_is_word_character = Trace::FALSE_VALUE; 3052 if (lookahead->at(0)->is_word()) 3053 next_is_word_character = Trace::TRUE_VALUE; 3054 } 3055 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3056 if (next_is_word_character == Trace::UNKNOWN) { 3057 Label before_non_word; 3058 Label before_word; 3059 if (trace->characters_preloaded() != 1) { 3060 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3061 } 3062 // Fall through on non-word. 3063 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3064 // Next character is not a word character. 3065 assembler->Bind(&before_non_word); 3066 Label ok; 3067 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3068 assembler->GoTo(&ok); 3069 3070 assembler->Bind(&before_word); 3071 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3072 assembler->Bind(&ok); 3073 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3074 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3075 } else { 3076 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 3077 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3078 } 3079 } 3080 3081 3082 void AssertionNode::BacktrackIfPrevious( 3083 RegExpCompiler* compiler, 3084 Trace* trace, 3085 AssertionNode::IfPrevious backtrack_if_previous) { 3086 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3087 Trace new_trace(*trace); 3088 new_trace.InvalidateCurrentCharacter(); 3089 3090 Label fall_through, dummy; 3091 3092 Label* non_word = backtrack_if_previous == kIsNonWord ? 3093 new_trace.backtrack() : 3094 &fall_through; 3095 Label* word = backtrack_if_previous == kIsNonWord ? 3096 &fall_through : 3097 new_trace.backtrack(); 3098 3099 if (new_trace.cp_offset() == 0) { 3100 // The start of input counts as a non-word character, so the question is 3101 // decided if we are at the start. 3102 assembler->CheckAtStart(non_word); 3103 } 3104 // We already checked that we are not at the start of input so it must be 3105 // OK to load the previous character. 3106 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3107 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3108 3109 assembler->Bind(&fall_through); 3110 on_success()->Emit(compiler, &new_trace); 3111 } 3112 3113 3114 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3115 RegExpCompiler* compiler, 3116 int filled_in, 3117 bool not_at_start) { 3118 if (assertion_type_ == AT_START && not_at_start) { 3119 details->set_cannot_match(); 3120 return; 3121 } 3122 return on_success()->GetQuickCheckDetails(details, 3123 compiler, 3124 filled_in, 3125 not_at_start); 3126 } 3127 3128 3129 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3130 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3131 switch (assertion_type_) { 3132 case AT_END: { 3133 Label ok; 3134 assembler->CheckPosition(trace->cp_offset(), &ok); 3135 assembler->GoTo(trace->backtrack()); 3136 assembler->Bind(&ok); 3137 break; 3138 } 3139 case AT_START: { 3140 if (trace->at_start() == Trace::FALSE_VALUE) { 3141 assembler->GoTo(trace->backtrack()); 3142 return; 3143 } 3144 if (trace->at_start() == Trace::UNKNOWN) { 3145 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 3146 Trace at_start_trace = *trace; 3147 at_start_trace.set_at_start(Trace::TRUE_VALUE); 3148 on_success()->Emit(compiler, &at_start_trace); 3149 return; 3150 } 3151 } 3152 break; 3153 case AFTER_NEWLINE: 3154 EmitHat(compiler, on_success(), trace); 3155 return; 3156 case AT_BOUNDARY: 3157 case AT_NON_BOUNDARY: { 3158 EmitBoundaryCheck(compiler, trace); 3159 return; 3160 } 3161 } 3162 on_success()->Emit(compiler, trace); 3163 } 3164 3165 3166 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3167 if (quick_check == NULL) return false; 3168 if (offset >= quick_check->characters()) return false; 3169 return quick_check->positions(offset)->determines_perfectly; 3170 } 3171 3172 3173 static void UpdateBoundsCheck(int index, int* checked_up_to) { 3174 if (index > *checked_up_to) { 3175 *checked_up_to = index; 3176 } 3177 } 3178 3179 3180 // We call this repeatedly to generate code for each pass over the text node. 3181 // The passes are in increasing order of difficulty because we hope one 3182 // of the first passes will fail in which case we are saved the work of the 3183 // later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3184 // we will check the '%' in the first pass, the case independent 'a' in the 3185 // second pass and the character class in the last pass. 3186 // 3187 // The passes are done from right to left, so for example to test for /bar/ 3188 // we will first test for an 'r' with offset 2, then an 'a' with offset 1 3189 // and then a 'b' with offset 0. This means we can avoid the end-of-input 3190 // bounds check most of the time. In the example we only need to check for 3191 // end-of-input when loading the putative 'r'. 3192 // 3193 // A slight complication involves the fact that the first character may already 3194 // be fetched into a register by the previous node. In this case we want to 3195 // do the test for that character first. We do this in separate passes. The 3196 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3197 // pass has been performed then subsequent passes will have true in 3198 // first_element_checked to indicate that that character does not need to be 3199 // checked again. 3200 // 3201 // In addition to all this we are passed a Trace, which can 3202 // contain an AlternativeGeneration object. In this AlternativeGeneration 3203 // object we can see details of any quick check that was already passed in 3204 // order to get to the code we are now generating. The quick check can involve 3205 // loading characters, which means we do not need to recheck the bounds 3206 // up to the limit the quick check already checked. In addition the quick 3207 // check can have involved a mask and compare operation which may simplify 3208 // or obviate the need for further checks at some character positions. 3209 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3210 TextEmitPassType pass, 3211 bool preloaded, 3212 Trace* trace, 3213 bool first_element_checked, 3214 int* checked_up_to) { 3215 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3216 Isolate* isolate = assembler->isolate(); 3217 bool one_byte = compiler->one_byte(); 3218 Label* backtrack = trace->backtrack(); 3219 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3220 int element_count = elements()->length(); 3221 int backward_offset = read_backward() ? -Length() : 0; 3222 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3223 TextElement elm = elements()->at(i); 3224 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3225 if (elm.text_type() == TextElement::ATOM) { 3226 Vector<const uc16> quarks = elm.atom()->data(); 3227 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3228 if (first_element_checked && i == 0 && j == 0) continue; 3229 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3230 EmitCharacterFunction* emit_function = NULL; 3231 switch (pass) { 3232 case NON_LATIN1_MATCH: 3233 DCHECK(one_byte); 3234 if (quarks[j] > String::kMaxOneByteCharCode) { 3235 assembler->GoTo(backtrack); 3236 return; 3237 } 3238 break; 3239 case NON_LETTER_CHARACTER_MATCH: 3240 emit_function = &EmitAtomNonLetter; 3241 break; 3242 case SIMPLE_CHARACTER_MATCH: 3243 emit_function = &EmitSimpleCharacter; 3244 break; 3245 case CASE_CHARACTER_MATCH: 3246 emit_function = &EmitAtomLetter; 3247 break; 3248 default: 3249 break; 3250 } 3251 if (emit_function != NULL) { 3252 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3253 bool bound_checked = 3254 emit_function(isolate, compiler, quarks[j], backtrack, 3255 cp_offset + j, bounds_check, preloaded); 3256 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3257 } 3258 } 3259 } else { 3260 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3261 if (pass == CHARACTER_CLASS_MATCH) { 3262 if (first_element_checked && i == 0) continue; 3263 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3264 RegExpCharacterClass* cc = elm.char_class(); 3265 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3266 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 3267 bounds_check, preloaded, zone()); 3268 UpdateBoundsCheck(cp_offset, checked_up_to); 3269 } 3270 } 3271 } 3272 } 3273 3274 3275 int TextNode::Length() { 3276 TextElement elm = elements()->last(); 3277 DCHECK(elm.cp_offset() >= 0); 3278 return elm.cp_offset() + elm.length(); 3279 } 3280 3281 3282 bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3283 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3284 if (ignore_case) { 3285 return pass == SIMPLE_CHARACTER_MATCH; 3286 } else { 3287 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3288 } 3289 } 3290 3291 3292 // This generates the code to match a text node. A text node can contain 3293 // straight character sequences (possibly to be matched in a case-independent 3294 // way) and character classes. For efficiency we do not do this in a single 3295 // pass from left to right. Instead we pass over the text node several times, 3296 // emitting code for some character positions every time. See the comment on 3297 // TextEmitPass for details. 3298 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3299 LimitResult limit_result = LimitVersions(compiler, trace); 3300 if (limit_result == DONE) return; 3301 DCHECK(limit_result == CONTINUE); 3302 3303 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3304 compiler->SetRegExpTooBig(); 3305 return; 3306 } 3307 3308 if (compiler->one_byte()) { 3309 int dummy = 0; 3310 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3311 } 3312 3313 bool first_elt_done = false; 3314 int bound_checked_to = trace->cp_offset() - 1; 3315 bound_checked_to += trace->bound_checked_up_to(); 3316 3317 // If a character is preloaded into the current character register then 3318 // check that now. 3319 if (trace->characters_preloaded() == 1) { 3320 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3321 if (!SkipPass(pass, compiler->ignore_case())) { 3322 TextEmitPass(compiler, 3323 static_cast<TextEmitPassType>(pass), 3324 true, 3325 trace, 3326 false, 3327 &bound_checked_to); 3328 } 3329 } 3330 first_elt_done = true; 3331 } 3332 3333 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3334 if (!SkipPass(pass, compiler->ignore_case())) { 3335 TextEmitPass(compiler, 3336 static_cast<TextEmitPassType>(pass), 3337 false, 3338 trace, 3339 first_elt_done, 3340 &bound_checked_to); 3341 } 3342 } 3343 3344 Trace successor_trace(*trace); 3345 // If we advance backward, we may end up at the start. 3346 successor_trace.AdvanceCurrentPositionInTrace( 3347 read_backward() ? -Length() : Length(), compiler); 3348 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 3349 : Trace::FALSE_VALUE); 3350 RecursionCheck rc(compiler); 3351 on_success()->Emit(compiler, &successor_trace); 3352 } 3353 3354 3355 void Trace::InvalidateCurrentCharacter() { 3356 characters_preloaded_ = 0; 3357 } 3358 3359 3360 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3361 // We don't have an instruction for shifting the current character register 3362 // down or for using a shifted value for anything so lets just forget that 3363 // we preloaded any characters into it. 3364 characters_preloaded_ = 0; 3365 // Adjust the offsets of the quick check performed information. This 3366 // information is used to find out what we already determined about the 3367 // characters by means of mask and compare. 3368 quick_check_performed_.Advance(by, compiler->one_byte()); 3369 cp_offset_ += by; 3370 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3371 compiler->SetRegExpTooBig(); 3372 cp_offset_ = 0; 3373 } 3374 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3375 } 3376 3377 3378 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { 3379 int element_count = elements()->length(); 3380 for (int i = 0; i < element_count; i++) { 3381 TextElement elm = elements()->at(i); 3382 if (elm.text_type() == TextElement::CHAR_CLASS) { 3383 RegExpCharacterClass* cc = elm.char_class(); 3384 // None of the standard character classes is different in the case 3385 // independent case and it slows us down if we don't know that. 3386 if (cc->is_standard(zone())) continue; 3387 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3388 int range_count = ranges->length(); 3389 for (int j = 0; j < range_count; j++) { 3390 ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); 3391 } 3392 } 3393 } 3394 } 3395 3396 3397 int TextNode::GreedyLoopTextLength() { return Length(); } 3398 3399 3400 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3401 RegExpCompiler* compiler) { 3402 if (read_backward()) return NULL; 3403 if (elements()->length() != 1) return NULL; 3404 TextElement elm = elements()->at(0); 3405 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3406 RegExpCharacterClass* node = elm.char_class(); 3407 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3408 if (!CharacterRange::IsCanonical(ranges)) { 3409 CharacterRange::Canonicalize(ranges); 3410 } 3411 if (node->is_negated()) { 3412 return ranges->length() == 0 ? on_success() : NULL; 3413 } 3414 if (ranges->length() != 1) return NULL; 3415 uint32_t max_char; 3416 if (compiler->one_byte()) { 3417 max_char = String::kMaxOneByteCharCode; 3418 } else { 3419 max_char = String::kMaxUtf16CodeUnit; 3420 } 3421 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3422 } 3423 3424 3425 // Finds the fixed match length of a sequence of nodes that goes from 3426 // this alternative and back to this choice node. If there are variable 3427 // length nodes or other complications in the way then return a sentinel 3428 // value indicating that a greedy loop cannot be constructed. 3429 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3430 GuardedAlternative* alternative) { 3431 int length = 0; 3432 RegExpNode* node = alternative->node(); 3433 // Later we will generate code for all these text nodes using recursion 3434 // so we have to limit the max number. 3435 int recursion_depth = 0; 3436 while (node != this) { 3437 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3438 return kNodeIsTooComplexForGreedyLoops; 3439 } 3440 int node_length = node->GreedyLoopTextLength(); 3441 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3442 return kNodeIsTooComplexForGreedyLoops; 3443 } 3444 length += node_length; 3445 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3446 node = seq_node->on_success(); 3447 } 3448 return read_backward() ? -length : length; 3449 } 3450 3451 3452 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3453 DCHECK_NULL(loop_node_); 3454 AddAlternative(alt); 3455 loop_node_ = alt.node(); 3456 } 3457 3458 3459 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3460 DCHECK_NULL(continue_node_); 3461 AddAlternative(alt); 3462 continue_node_ = alt.node(); 3463 } 3464 3465 3466 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3467 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3468 if (trace->stop_node() == this) { 3469 // Back edge of greedy optimized loop node graph. 3470 int text_length = 3471 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3472 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops); 3473 // Update the counter-based backtracking info on the stack. This is an 3474 // optimization for greedy loops (see below). 3475 DCHECK(trace->cp_offset() == text_length); 3476 macro_assembler->AdvanceCurrentPosition(text_length); 3477 macro_assembler->GoTo(trace->loop_label()); 3478 return; 3479 } 3480 DCHECK_NULL(trace->stop_node()); 3481 if (!trace->is_trivial()) { 3482 trace->Flush(compiler, this); 3483 return; 3484 } 3485 ChoiceNode::Emit(compiler, trace); 3486 } 3487 3488 3489 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3490 int eats_at_least) { 3491 int preload_characters = Min(4, eats_at_least); 3492 if (compiler->macro_assembler()->CanReadUnaligned()) { 3493 bool one_byte = compiler->one_byte(); 3494 if (one_byte) { 3495 if (preload_characters > 4) preload_characters = 4; 3496 // We can't preload 3 characters because there is no machine instruction 3497 // to do that. We can't just load 4 because we could be reading 3498 // beyond the end of the string, which could cause a memory fault. 3499 if (preload_characters == 3) preload_characters = 2; 3500 } else { 3501 if (preload_characters > 2) preload_characters = 2; 3502 } 3503 } else { 3504 if (preload_characters > 1) preload_characters = 1; 3505 } 3506 return preload_characters; 3507 } 3508 3509 3510 // This class is used when generating the alternatives in a choice node. It 3511 // records the way the alternative is being code generated. 3512 class AlternativeGeneration: public Malloced { 3513 public: 3514 AlternativeGeneration() 3515 : possible_success(), 3516 expects_preload(false), 3517 after(), 3518 quick_check_details() { } 3519 Label possible_success; 3520 bool expects_preload; 3521 Label after; 3522 QuickCheckDetails quick_check_details; 3523 }; 3524 3525 3526 // Creates a list of AlternativeGenerations. If the list has a reasonable 3527 // size then it is on the stack, otherwise the excess is on the heap. 3528 class AlternativeGenerationList { 3529 public: 3530 AlternativeGenerationList(int count, Zone* zone) 3531 : alt_gens_(count, zone) { 3532 for (int i = 0; i < count && i < kAFew; i++) { 3533 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3534 } 3535 for (int i = kAFew; i < count; i++) { 3536 alt_gens_.Add(new AlternativeGeneration(), zone); 3537 } 3538 } 3539 ~AlternativeGenerationList() { 3540 for (int i = kAFew; i < alt_gens_.length(); i++) { 3541 delete alt_gens_[i]; 3542 alt_gens_[i] = NULL; 3543 } 3544 } 3545 3546 AlternativeGeneration* at(int i) { 3547 return alt_gens_[i]; 3548 } 3549 3550 private: 3551 static const int kAFew = 10; 3552 ZoneList<AlternativeGeneration*> alt_gens_; 3553 AlternativeGeneration a_few_alt_gens_[kAFew]; 3554 }; 3555 3556 3557 // The '2' variant is has inclusive from and exclusive to. 3558 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3559 // which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3560 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 3561 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 3562 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 3563 0xFEFF, 0xFF00, 0x10000 }; 3564 static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3565 3566 static const int kWordRanges[] = { 3567 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; 3568 static const int kWordRangeCount = arraysize(kWordRanges); 3569 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; 3570 static const int kDigitRangeCount = arraysize(kDigitRanges); 3571 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; 3572 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3573 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 3574 0x2028, 0x202A, 0x10000 }; 3575 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3576 3577 3578 void BoyerMoorePositionInfo::Set(int character) { 3579 SetInterval(Interval(character, character)); 3580 } 3581 3582 3583 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3584 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3585 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3586 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3587 surrogate_ = 3588 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3589 if (interval.to() - interval.from() >= kMapSize - 1) { 3590 if (map_count_ != kMapSize) { 3591 map_count_ = kMapSize; 3592 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3593 } 3594 return; 3595 } 3596 for (int i = interval.from(); i <= interval.to(); i++) { 3597 int mod_character = (i & kMask); 3598 if (!map_->at(mod_character)) { 3599 map_count_++; 3600 map_->at(mod_character) = true; 3601 } 3602 if (map_count_ == kMapSize) return; 3603 } 3604 } 3605 3606 3607 void BoyerMoorePositionInfo::SetAll() { 3608 s_ = w_ = d_ = kLatticeUnknown; 3609 if (map_count_ != kMapSize) { 3610 map_count_ = kMapSize; 3611 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3612 } 3613 } 3614 3615 3616 BoyerMooreLookahead::BoyerMooreLookahead( 3617 int length, RegExpCompiler* compiler, Zone* zone) 3618 : length_(length), 3619 compiler_(compiler) { 3620 if (compiler->one_byte()) { 3621 max_char_ = String::kMaxOneByteCharCode; 3622 } else { 3623 max_char_ = String::kMaxUtf16CodeUnit; 3624 } 3625 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3626 for (int i = 0; i < length; i++) { 3627 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3628 } 3629 } 3630 3631 3632 // Find the longest range of lookahead that has the fewest number of different 3633 // characters that can occur at a given position. Since we are optimizing two 3634 // different parameters at once this is a tradeoff. 3635 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3636 int biggest_points = 0; 3637 // If more than 32 characters out of 128 can occur it is unlikely that we can 3638 // be lucky enough to step forwards much of the time. 3639 const int kMaxMax = 32; 3640 for (int max_number_of_chars = 4; 3641 max_number_of_chars < kMaxMax; 3642 max_number_of_chars *= 2) { 3643 biggest_points = 3644 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3645 } 3646 if (biggest_points == 0) return false; 3647 return true; 3648 } 3649 3650 3651 // Find the highest-points range between 0 and length_ where the character 3652 // information is not too vague. 'Too vague' means that there are more than 3653 // max_number_of_chars that can occur at this position. Calculates the number 3654 // of points as the product of width-of-the-range and 3655 // probability-of-finding-one-of-the-characters, where the probability is 3656 // calculated using the frequency distribution of the sample subject string. 3657 int BoyerMooreLookahead::FindBestInterval( 3658 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3659 int biggest_points = old_biggest_points; 3660 static const int kSize = RegExpMacroAssembler::kTableSize; 3661 for (int i = 0; i < length_; ) { 3662 while (i < length_ && Count(i) > max_number_of_chars) i++; 3663 if (i == length_) break; 3664 int remembered_from = i; 3665 bool union_map[kSize]; 3666 for (int j = 0; j < kSize; j++) union_map[j] = false; 3667 while (i < length_ && Count(i) <= max_number_of_chars) { 3668 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3669 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3670 i++; 3671 } 3672 int frequency = 0; 3673 for (int j = 0; j < kSize; j++) { 3674 if (union_map[j]) { 3675 // Add 1 to the frequency to give a small per-character boost for 3676 // the cases where our sampling is not good enough and many 3677 // characters have a frequency of zero. This means the frequency 3678 // can theoretically be up to 2*kSize though we treat it mostly as 3679 // a fraction of kSize. 3680 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3681 } 3682 } 3683 // We use the probability of skipping times the distance we are skipping to 3684 // judge the effectiveness of this. Actually we have a cut-off: By 3685 // dividing by 2 we switch off the skipping if the probability of skipping 3686 // is less than 50%. This is because the multibyte mask-and-compare 3687 // skipping in quickcheck is more likely to do well on this case. 3688 bool in_quickcheck_range = 3689 ((i - remembered_from < 4) || 3690 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 3691 // Called 'probability' but it is only a rough estimate and can actually 3692 // be outside the 0-kSize range. 3693 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3694 int points = (i - remembered_from) * probability; 3695 if (points > biggest_points) { 3696 *from = remembered_from; 3697 *to = i - 1; 3698 biggest_points = points; 3699 } 3700 } 3701 return biggest_points; 3702 } 3703 3704 3705 // Take all the characters that will not prevent a successful match if they 3706 // occur in the subject string in the range between min_lookahead and 3707 // max_lookahead (inclusive) measured from the current position. If the 3708 // character at max_lookahead offset is not one of these characters, then we 3709 // can safely skip forwards by the number of characters in the range. 3710 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3711 int max_lookahead, 3712 Handle<ByteArray> boolean_skip_table) { 3713 const int kSize = RegExpMacroAssembler::kTableSize; 3714 3715 const int kSkipArrayEntry = 0; 3716 const int kDontSkipArrayEntry = 1; 3717 3718 for (int i = 0; i < kSize; i++) { 3719 boolean_skip_table->set(i, kSkipArrayEntry); 3720 } 3721 int skip = max_lookahead + 1 - min_lookahead; 3722 3723 for (int i = max_lookahead; i >= min_lookahead; i--) { 3724 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3725 for (int j = 0; j < kSize; j++) { 3726 if (map->at(j)) { 3727 boolean_skip_table->set(j, kDontSkipArrayEntry); 3728 } 3729 } 3730 } 3731 3732 return skip; 3733 } 3734 3735 3736 // See comment above on the implementation of GetSkipTable. 3737 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3738 const int kSize = RegExpMacroAssembler::kTableSize; 3739 3740 int min_lookahead = 0; 3741 int max_lookahead = 0; 3742 3743 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3744 3745 bool found_single_character = false; 3746 int single_character = 0; 3747 for (int i = max_lookahead; i >= min_lookahead; i--) { 3748 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3749 if (map->map_count() > 1 || 3750 (found_single_character && map->map_count() != 0)) { 3751 found_single_character = false; 3752 break; 3753 } 3754 for (int j = 0; j < kSize; j++) { 3755 if (map->at(j)) { 3756 found_single_character = true; 3757 single_character = j; 3758 break; 3759 } 3760 } 3761 } 3762 3763 int lookahead_width = max_lookahead + 1 - min_lookahead; 3764 3765 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3766 // The mask-compare can probably handle this better. 3767 return; 3768 } 3769 3770 if (found_single_character) { 3771 Label cont, again; 3772 masm->Bind(&again); 3773 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3774 if (max_char_ > kSize) { 3775 masm->CheckCharacterAfterAnd(single_character, 3776 RegExpMacroAssembler::kTableMask, 3777 &cont); 3778 } else { 3779 masm->CheckCharacter(single_character, &cont); 3780 } 3781 masm->AdvanceCurrentPosition(lookahead_width); 3782 masm->GoTo(&again); 3783 masm->Bind(&cont); 3784 return; 3785 } 3786 3787 Factory* factory = masm->isolate()->factory(); 3788 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3789 int skip_distance = GetSkipTable( 3790 min_lookahead, max_lookahead, boolean_skip_table); 3791 DCHECK(skip_distance != 0); 3792 3793 Label cont, again; 3794 masm->Bind(&again); 3795 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3796 masm->CheckBitInTable(boolean_skip_table, &cont); 3797 masm->AdvanceCurrentPosition(skip_distance); 3798 masm->GoTo(&again); 3799 masm->Bind(&cont); 3800 } 3801 3802 3803 /* Code generation for choice nodes. 3804 * 3805 * We generate quick checks that do a mask and compare to eliminate a 3806 * choice. If the quick check succeeds then it jumps to the continuation to 3807 * do slow checks and check subsequent nodes. If it fails (the common case) 3808 * it falls through to the next choice. 3809 * 3810 * Here is the desired flow graph. Nodes directly below each other imply 3811 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3812 * 3 doesn't have a quick check so we have to call the slow check. 3813 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3814 * regexp continuation is generated directly after the Sn node, up to the 3815 * next GoTo if we decide to reuse some already generated code. Some 3816 * nodes expect preload_characters to be preloaded into the current 3817 * character register. R nodes do this preloading. Vertices are marked 3818 * F for failures and S for success (possible success in the case of quick 3819 * nodes). L, V, < and > are used as arrow heads. 3820 * 3821 * ----------> R 3822 * | 3823 * V 3824 * Q1 -----> S1 3825 * | S / 3826 * F| / 3827 * | F/ 3828 * | / 3829 * | R 3830 * | / 3831 * V L 3832 * Q2 -----> S2 3833 * | S / 3834 * F| / 3835 * | F/ 3836 * | / 3837 * | R 3838 * | / 3839 * V L 3840 * S3 3841 * | 3842 * F| 3843 * | 3844 * R 3845 * | 3846 * backtrack V 3847 * <----------Q4 3848 * \ F | 3849 * \ |S 3850 * \ F V 3851 * \-----S4 3852 * 3853 * For greedy loops we push the current position, then generate the code that 3854 * eats the input specially in EmitGreedyLoop. The other choice (the 3855 * continuation) is generated by the normal code in EmitChoices, and steps back 3856 * in the input to the starting position when it fails to match. The loop code 3857 * looks like this (U is the unwind code that steps back in the greedy loop). 3858 * 3859 * _____ 3860 * / \ 3861 * V | 3862 * ----------> S1 | 3863 * /| | 3864 * / |S | 3865 * F/ \_____/ 3866 * / 3867 * |<----- 3868 * | \ 3869 * V |S 3870 * Q2 ---> U----->backtrack 3871 * | F / 3872 * S| / 3873 * V F / 3874 * S2--/ 3875 */ 3876 3877 GreedyLoopState::GreedyLoopState(bool not_at_start) { 3878 counter_backtrack_trace_.set_backtrack(&label_); 3879 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3880 } 3881 3882 3883 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3884 #ifdef DEBUG 3885 int choice_count = alternatives_->length(); 3886 for (int i = 0; i < choice_count - 1; i++) { 3887 GuardedAlternative alternative = alternatives_->at(i); 3888 ZoneList<Guard*>* guards = alternative.guards(); 3889 int guard_count = (guards == NULL) ? 0 : guards->length(); 3890 for (int j = 0; j < guard_count; j++) { 3891 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3892 } 3893 } 3894 #endif 3895 } 3896 3897 3898 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, 3899 Trace* current_trace, 3900 PreloadState* state) { 3901 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3902 // Save some time by looking at most one machine word ahead. 3903 state->eats_at_least_ = 3904 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, 3905 current_trace->at_start() == Trace::FALSE_VALUE); 3906 } 3907 state->preload_characters_ = 3908 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3909 3910 state->preload_is_current_ = 3911 (current_trace->characters_preloaded() == state->preload_characters_); 3912 state->preload_has_checked_bounds_ = state->preload_is_current_; 3913 } 3914 3915 3916 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3917 int choice_count = alternatives_->length(); 3918 3919 AssertGuardsMentionRegisters(trace); 3920 3921 LimitResult limit_result = LimitVersions(compiler, trace); 3922 if (limit_result == DONE) return; 3923 DCHECK(limit_result == CONTINUE); 3924 3925 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3926 // other choice nodes we only flush if we are out of code size budget. 3927 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3928 trace->Flush(compiler, this); 3929 return; 3930 } 3931 3932 RecursionCheck rc(compiler); 3933 3934 PreloadState preload; 3935 preload.init(); 3936 GreedyLoopState greedy_loop_state(not_at_start()); 3937 3938 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 3939 AlternativeGenerationList alt_gens(choice_count, zone()); 3940 3941 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 3942 trace = EmitGreedyLoop(compiler, 3943 trace, 3944 &alt_gens, 3945 &preload, 3946 &greedy_loop_state, 3947 text_length); 3948 } else { 3949 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 3950 // match the traces produced pre-cleanup. 3951 Label second_choice; 3952 compiler->macro_assembler()->Bind(&second_choice); 3953 3954 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 3955 3956 EmitChoices(compiler, 3957 &alt_gens, 3958 0, 3959 trace, 3960 &preload); 3961 } 3962 3963 // At this point we need to generate slow checks for the alternatives where 3964 // the quick check was inlined. We can recognize these because the associated 3965 // label was bound. 3966 int new_flush_budget = trace->flush_budget() / choice_count; 3967 for (int i = 0; i < choice_count; i++) { 3968 AlternativeGeneration* alt_gen = alt_gens.at(i); 3969 Trace new_trace(*trace); 3970 // If there are actions to be flushed we have to limit how many times 3971 // they are flushed. Take the budget of the parent trace and distribute 3972 // it fairly amongst the children. 3973 if (new_trace.actions() != NULL) { 3974 new_trace.set_flush_budget(new_flush_budget); 3975 } 3976 bool next_expects_preload = 3977 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 3978 EmitOutOfLineContinuation(compiler, 3979 &new_trace, 3980 alternatives_->at(i), 3981 alt_gen, 3982 preload.preload_characters_, 3983 next_expects_preload); 3984 } 3985 } 3986 3987 3988 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, 3989 Trace* trace, 3990 AlternativeGenerationList* alt_gens, 3991 PreloadState* preload, 3992 GreedyLoopState* greedy_loop_state, 3993 int text_length) { 3994 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3995 // Here we have special handling for greedy loops containing only text nodes 3996 // and other simple nodes. These are handled by pushing the current 3997 // position on the stack and then incrementing the current position each 3998 // time around the switch. On backtrack we decrement the current position 3999 // and check it against the pushed value. This avoids pushing backtrack 4000 // information for each iteration of the loop, which could take up a lot of 4001 // space. 4002 DCHECK(trace->stop_node() == NULL); 4003 macro_assembler->PushCurrentPosition(); 4004 Label greedy_match_failed; 4005 Trace greedy_match_trace; 4006 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); 4007 greedy_match_trace.set_backtrack(&greedy_match_failed); 4008 Label loop_label; 4009 macro_assembler->Bind(&loop_label); 4010 greedy_match_trace.set_stop_node(this); 4011 greedy_match_trace.set_loop_label(&loop_label); 4012 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 4013 macro_assembler->Bind(&greedy_match_failed); 4014 4015 Label second_choice; // For use in greedy matches. 4016 macro_assembler->Bind(&second_choice); 4017 4018 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 4019 4020 EmitChoices(compiler, 4021 alt_gens, 4022 1, 4023 new_trace, 4024 preload); 4025 4026 macro_assembler->Bind(greedy_loop_state->label()); 4027 // If we have unwound to the bottom then backtrack. 4028 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4029 // Otherwise try the second priority at an earlier position. 4030 macro_assembler->AdvanceCurrentPosition(-text_length); 4031 macro_assembler->GoTo(&second_choice); 4032 return new_trace; 4033 } 4034 4035 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 4036 Trace* trace) { 4037 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 4038 if (alternatives_->length() != 2) return eats_at_least; 4039 4040 GuardedAlternative alt1 = alternatives_->at(1); 4041 if (alt1.guards() != NULL && alt1.guards()->length() != 0) { 4042 return eats_at_least; 4043 } 4044 RegExpNode* eats_anything_node = alt1.node(); 4045 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 4046 return eats_at_least; 4047 } 4048 4049 // Really we should be creating a new trace when we execute this function, 4050 // but there is no need, because the code it generates cannot backtrack, and 4051 // we always arrive here with a trivial trace (since it's the entry to a 4052 // loop. That also implies that there are no preloaded characters, which is 4053 // good, because it means we won't be violating any assumptions by 4054 // overwriting those characters with new load instructions. 4055 DCHECK(trace->is_trivial()); 4056 4057 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4058 Isolate* isolate = macro_assembler->isolate(); 4059 // At this point we know that we are at a non-greedy loop that will eat 4060 // any character one at a time. Any non-anchored regexp has such a 4061 // loop prepended to it in order to find where it starts. We look for 4062 // a pattern of the form ...abc... where we can look 6 characters ahead 4063 // and step forwards 3 if the character is not one of abc. Abc need 4064 // not be atoms, they can be any reasonably limited character class or 4065 // small alternation. 4066 BoyerMooreLookahead* bm = bm_info(false); 4067 if (bm == NULL) { 4068 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4069 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4070 kRecursionBudget, 4071 false)); 4072 if (eats_at_least >= 1) { 4073 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4074 compiler, 4075 zone()); 4076 GuardedAlternative alt0 = alternatives_->at(0); 4077 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4078 } 4079 } 4080 if (bm != NULL) { 4081 bm->EmitSkipInstructions(macro_assembler); 4082 } 4083 return eats_at_least; 4084 } 4085 4086 4087 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4088 AlternativeGenerationList* alt_gens, 4089 int first_choice, 4090 Trace* trace, 4091 PreloadState* preload) { 4092 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4093 SetUpPreLoad(compiler, trace, preload); 4094 4095 // For now we just call all choices one after the other. The idea ultimately 4096 // is to use the Dispatch table to try only the relevant ones. 4097 int choice_count = alternatives_->length(); 4098 4099 int new_flush_budget = trace->flush_budget() / choice_count; 4100 4101 for (int i = first_choice; i < choice_count; i++) { 4102 bool is_last = i == choice_count - 1; 4103 bool fall_through_on_failure = !is_last; 4104 GuardedAlternative alternative = alternatives_->at(i); 4105 AlternativeGeneration* alt_gen = alt_gens->at(i); 4106 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 4107 ZoneList<Guard*>* guards = alternative.guards(); 4108 int guard_count = (guards == NULL) ? 0 : guards->length(); 4109 Trace new_trace(*trace); 4110 new_trace.set_characters_preloaded(preload->preload_is_current_ ? 4111 preload->preload_characters_ : 4112 0); 4113 if (preload->preload_has_checked_bounds_) { 4114 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4115 } 4116 new_trace.quick_check_performed()->Clear(); 4117 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4118 if (!is_last) { 4119 new_trace.set_backtrack(&alt_gen->after); 4120 } 4121 alt_gen->expects_preload = preload->preload_is_current_; 4122 bool generate_full_check_inline = false; 4123 if (compiler->optimize() && 4124 try_to_emit_quick_check_for_alternative(i == 0) && 4125 alternative.node()->EmitQuickCheck( 4126 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 4127 &alt_gen->possible_success, &alt_gen->quick_check_details, 4128 fall_through_on_failure)) { 4129 // Quick check was generated for this choice. 4130 preload->preload_is_current_ = true; 4131 preload->preload_has_checked_bounds_ = true; 4132 // If we generated the quick check to fall through on possible success, 4133 // we now need to generate the full check inline. 4134 if (!fall_through_on_failure) { 4135 macro_assembler->Bind(&alt_gen->possible_success); 4136 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4137 new_trace.set_characters_preloaded(preload->preload_characters_); 4138 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4139 generate_full_check_inline = true; 4140 } 4141 } else if (alt_gen->quick_check_details.cannot_match()) { 4142 if (!fall_through_on_failure) { 4143 macro_assembler->GoTo(trace->backtrack()); 4144 } 4145 continue; 4146 } else { 4147 // No quick check was generated. Put the full code here. 4148 // If this is not the first choice then there could be slow checks from 4149 // previous cases that go here when they fail. There's no reason to 4150 // insist that they preload characters since the slow check we are about 4151 // to generate probably can't use it. 4152 if (i != first_choice) { 4153 alt_gen->expects_preload = false; 4154 new_trace.InvalidateCurrentCharacter(); 4155 } 4156 generate_full_check_inline = true; 4157 } 4158 if (generate_full_check_inline) { 4159 if (new_trace.actions() != NULL) { 4160 new_trace.set_flush_budget(new_flush_budget); 4161 } 4162 for (int j = 0; j < guard_count; j++) { 4163 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4164 } 4165 alternative.node()->Emit(compiler, &new_trace); 4166 preload->preload_is_current_ = false; 4167 } 4168 macro_assembler->Bind(&alt_gen->after); 4169 } 4170 } 4171 4172 4173 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4174 Trace* trace, 4175 GuardedAlternative alternative, 4176 AlternativeGeneration* alt_gen, 4177 int preload_characters, 4178 bool next_expects_preload) { 4179 if (!alt_gen->possible_success.is_linked()) return; 4180 4181 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4182 macro_assembler->Bind(&alt_gen->possible_success); 4183 Trace out_of_line_trace(*trace); 4184 out_of_line_trace.set_characters_preloaded(preload_characters); 4185 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4186 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4187 ZoneList<Guard*>* guards = alternative.guards(); 4188 int guard_count = (guards == NULL) ? 0 : guards->length(); 4189 if (next_expects_preload) { 4190 Label reload_current_char; 4191 out_of_line_trace.set_backtrack(&reload_current_char); 4192 for (int j = 0; j < guard_count; j++) { 4193 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4194 } 4195 alternative.node()->Emit(compiler, &out_of_line_trace); 4196 macro_assembler->Bind(&reload_current_char); 4197 // Reload the current character, since the next quick check expects that. 4198 // We don't need to check bounds here because we only get into this 4199 // code through a quick check which already did the checked load. 4200 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4201 NULL, 4202 false, 4203 preload_characters); 4204 macro_assembler->GoTo(&(alt_gen->after)); 4205 } else { 4206 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4207 for (int j = 0; j < guard_count; j++) { 4208 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4209 } 4210 alternative.node()->Emit(compiler, &out_of_line_trace); 4211 } 4212 } 4213 4214 4215 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4216 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4217 LimitResult limit_result = LimitVersions(compiler, trace); 4218 if (limit_result == DONE) return; 4219 DCHECK(limit_result == CONTINUE); 4220 4221 RecursionCheck rc(compiler); 4222 4223 switch (action_type_) { 4224 case STORE_POSITION: { 4225 Trace::DeferredCapture 4226 new_capture(data_.u_position_register.reg, 4227 data_.u_position_register.is_capture, 4228 trace); 4229 Trace new_trace = *trace; 4230 new_trace.add_action(&new_capture); 4231 on_success()->Emit(compiler, &new_trace); 4232 break; 4233 } 4234 case INCREMENT_REGISTER: { 4235 Trace::DeferredIncrementRegister 4236 new_increment(data_.u_increment_register.reg); 4237 Trace new_trace = *trace; 4238 new_trace.add_action(&new_increment); 4239 on_success()->Emit(compiler, &new_trace); 4240 break; 4241 } 4242 case SET_REGISTER: { 4243 Trace::DeferredSetRegister 4244 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4245 Trace new_trace = *trace; 4246 new_trace.add_action(&new_set); 4247 on_success()->Emit(compiler, &new_trace); 4248 break; 4249 } 4250 case CLEAR_CAPTURES: { 4251 Trace::DeferredClearCaptures 4252 new_capture(Interval(data_.u_clear_captures.range_from, 4253 data_.u_clear_captures.range_to)); 4254 Trace new_trace = *trace; 4255 new_trace.add_action(&new_capture); 4256 on_success()->Emit(compiler, &new_trace); 4257 break; 4258 } 4259 case BEGIN_SUBMATCH: 4260 if (!trace->is_trivial()) { 4261 trace->Flush(compiler, this); 4262 } else { 4263 assembler->WriteCurrentPositionToRegister( 4264 data_.u_submatch.current_position_register, 0); 4265 assembler->WriteStackPointerToRegister( 4266 data_.u_submatch.stack_pointer_register); 4267 on_success()->Emit(compiler, trace); 4268 } 4269 break; 4270 case EMPTY_MATCH_CHECK: { 4271 int start_pos_reg = data_.u_empty_match_check.start_register; 4272 int stored_pos = 0; 4273 int rep_reg = data_.u_empty_match_check.repetition_register; 4274 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4275 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4276 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4277 // If we know we haven't advanced and there is no minimum we 4278 // can just backtrack immediately. 4279 assembler->GoTo(trace->backtrack()); 4280 } else if (know_dist && stored_pos < trace->cp_offset()) { 4281 // If we know we've advanced we can generate the continuation 4282 // immediately. 4283 on_success()->Emit(compiler, trace); 4284 } else if (!trace->is_trivial()) { 4285 trace->Flush(compiler, this); 4286 } else { 4287 Label skip_empty_check; 4288 // If we have a minimum number of repetitions we check the current 4289 // number first and skip the empty check if it's not enough. 4290 if (has_minimum) { 4291 int limit = data_.u_empty_match_check.repetition_limit; 4292 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4293 } 4294 // If the match is empty we bail out, otherwise we fall through 4295 // to the on-success continuation. 4296 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4297 trace->backtrack()); 4298 assembler->Bind(&skip_empty_check); 4299 on_success()->Emit(compiler, trace); 4300 } 4301 break; 4302 } 4303 case POSITIVE_SUBMATCH_SUCCESS: { 4304 if (!trace->is_trivial()) { 4305 trace->Flush(compiler, this); 4306 return; 4307 } 4308 assembler->ReadCurrentPositionFromRegister( 4309 data_.u_submatch.current_position_register); 4310 assembler->ReadStackPointerFromRegister( 4311 data_.u_submatch.stack_pointer_register); 4312 int clear_register_count = data_.u_submatch.clear_register_count; 4313 if (clear_register_count == 0) { 4314 on_success()->Emit(compiler, trace); 4315 return; 4316 } 4317 int clear_registers_from = data_.u_submatch.clear_register_from; 4318 Label clear_registers_backtrack; 4319 Trace new_trace = *trace; 4320 new_trace.set_backtrack(&clear_registers_backtrack); 4321 on_success()->Emit(compiler, &new_trace); 4322 4323 assembler->Bind(&clear_registers_backtrack); 4324 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4325 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4326 4327 DCHECK(trace->backtrack() == NULL); 4328 assembler->Backtrack(); 4329 return; 4330 } 4331 default: 4332 UNREACHABLE(); 4333 } 4334 } 4335 4336 4337 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4338 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4339 if (!trace->is_trivial()) { 4340 trace->Flush(compiler, this); 4341 return; 4342 } 4343 4344 LimitResult limit_result = LimitVersions(compiler, trace); 4345 if (limit_result == DONE) return; 4346 DCHECK(limit_result == CONTINUE); 4347 4348 RecursionCheck rc(compiler); 4349 4350 DCHECK_EQ(start_reg_ + 1, end_reg_); 4351 if (compiler->ignore_case()) { 4352 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), 4353 trace->backtrack()); 4354 } else { 4355 assembler->CheckNotBackReference(start_reg_, read_backward(), 4356 trace->backtrack()); 4357 } 4358 // We are going to advance backward, so we may end up at the start. 4359 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 4360 on_success()->Emit(compiler, trace); 4361 } 4362 4363 4364 // ------------------------------------------------------------------- 4365 // Dot/dotty output 4366 4367 4368 #ifdef DEBUG 4369 4370 4371 class DotPrinter: public NodeVisitor { 4372 public: 4373 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT 4374 : os_(os), 4375 ignore_case_(ignore_case) {} 4376 void PrintNode(const char* label, RegExpNode* node); 4377 void Visit(RegExpNode* node); 4378 void PrintAttributes(RegExpNode* from); 4379 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4380 #define DECLARE_VISIT(Type) \ 4381 virtual void Visit##Type(Type##Node* that); 4382 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4383 #undef DECLARE_VISIT 4384 private: 4385 std::ostream& os_; 4386 bool ignore_case_; 4387 }; 4388 4389 4390 void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4391 os_ << "digraph G {\n graph [label=\""; 4392 for (int i = 0; label[i]; i++) { 4393 switch (label[i]) { 4394 case '\\': 4395 os_ << "\\\\"; 4396 break; 4397 case '"': 4398 os_ << "\""; 4399 break; 4400 default: 4401 os_ << label[i]; 4402 break; 4403 } 4404 } 4405 os_ << "\"];\n"; 4406 Visit(node); 4407 os_ << "}" << std::endl; 4408 } 4409 4410 4411 void DotPrinter::Visit(RegExpNode* node) { 4412 if (node->info()->visited) return; 4413 node->info()->visited = true; 4414 node->Accept(this); 4415 } 4416 4417 4418 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4419 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; 4420 Visit(on_failure); 4421 } 4422 4423 4424 class TableEntryBodyPrinter { 4425 public: 4426 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT 4427 : os_(os), 4428 choice_(choice) {} 4429 void Call(uc16 from, DispatchTable::Entry entry) { 4430 OutSet* out_set = entry.out_set(); 4431 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4432 if (out_set->Get(i)) { 4433 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n" 4434 << choice()->alternatives()->at(i).node() << ";\n"; 4435 } 4436 } 4437 } 4438 private: 4439 ChoiceNode* choice() { return choice_; } 4440 std::ostream& os_; 4441 ChoiceNode* choice_; 4442 }; 4443 4444 4445 class TableEntryHeaderPrinter { 4446 public: 4447 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT 4448 : first_(true), 4449 os_(os) {} 4450 void Call(uc16 from, DispatchTable::Entry entry) { 4451 if (first_) { 4452 first_ = false; 4453 } else { 4454 os_ << "|"; 4455 } 4456 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{"; 4457 OutSet* out_set = entry.out_set(); 4458 int priority = 0; 4459 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4460 if (out_set->Get(i)) { 4461 if (priority > 0) os_ << "|"; 4462 os_ << "<s" << from << "o" << i << "> " << priority; 4463 priority++; 4464 } 4465 } 4466 os_ << "}}"; 4467 } 4468 4469 private: 4470 bool first_; 4471 std::ostream& os_; 4472 }; 4473 4474 4475 class AttributePrinter { 4476 public: 4477 explicit AttributePrinter(std::ostream& os) // NOLINT 4478 : os_(os), 4479 first_(true) {} 4480 void PrintSeparator() { 4481 if (first_) { 4482 first_ = false; 4483 } else { 4484 os_ << "|"; 4485 } 4486 } 4487 void PrintBit(const char* name, bool value) { 4488 if (!value) return; 4489 PrintSeparator(); 4490 os_ << "{" << name << "}"; 4491 } 4492 void PrintPositive(const char* name, int value) { 4493 if (value < 0) return; 4494 PrintSeparator(); 4495 os_ << "{" << name << "|" << value << "}"; 4496 } 4497 4498 private: 4499 std::ostream& os_; 4500 bool first_; 4501 }; 4502 4503 4504 void DotPrinter::PrintAttributes(RegExpNode* that) { 4505 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " 4506 << "margin=0.1, fontsize=10, label=\"{"; 4507 AttributePrinter printer(os_); 4508 NodeInfo* info = that->info(); 4509 printer.PrintBit("NI", info->follows_newline_interest); 4510 printer.PrintBit("WI", info->follows_word_interest); 4511 printer.PrintBit("SI", info->follows_start_interest); 4512 Label* label = that->label(); 4513 if (label->is_bound()) 4514 printer.PrintPositive("@", label->pos()); 4515 os_ << "}\"];\n" 4516 << " a" << that << " -> n" << that 4517 << " [style=dashed, color=grey, arrowhead=none];\n"; 4518 } 4519 4520 4521 static const bool kPrintDispatchTable = false; 4522 void DotPrinter::VisitChoice(ChoiceNode* that) { 4523 if (kPrintDispatchTable) { 4524 os_ << " n" << that << " [shape=Mrecord, label=\""; 4525 TableEntryHeaderPrinter header_printer(os_); 4526 that->GetTable(ignore_case_)->ForEach(&header_printer); 4527 os_ << "\"]\n"; 4528 PrintAttributes(that); 4529 TableEntryBodyPrinter body_printer(os_, that); 4530 that->GetTable(ignore_case_)->ForEach(&body_printer); 4531 } else { 4532 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; 4533 for (int i = 0; i < that->alternatives()->length(); i++) { 4534 GuardedAlternative alt = that->alternatives()->at(i); 4535 os_ << " n" << that << " -> n" << alt.node(); 4536 } 4537 } 4538 for (int i = 0; i < that->alternatives()->length(); i++) { 4539 GuardedAlternative alt = that->alternatives()->at(i); 4540 alt.node()->Accept(this); 4541 } 4542 } 4543 4544 4545 void DotPrinter::VisitText(TextNode* that) { 4546 Zone* zone = that->zone(); 4547 os_ << " n" << that << " [label=\""; 4548 for (int i = 0; i < that->elements()->length(); i++) { 4549 if (i > 0) os_ << " "; 4550 TextElement elm = that->elements()->at(i); 4551 switch (elm.text_type()) { 4552 case TextElement::ATOM: { 4553 Vector<const uc16> data = elm.atom()->data(); 4554 for (int i = 0; i < data.length(); i++) { 4555 os_ << static_cast<char>(data[i]); 4556 } 4557 break; 4558 } 4559 case TextElement::CHAR_CLASS: { 4560 RegExpCharacterClass* node = elm.char_class(); 4561 os_ << "["; 4562 if (node->is_negated()) os_ << "^"; 4563 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4564 CharacterRange range = node->ranges(zone)->at(j); 4565 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); 4566 } 4567 os_ << "]"; 4568 break; 4569 } 4570 default: 4571 UNREACHABLE(); 4572 } 4573 } 4574 os_ << "\", shape=box, peripheries=2];\n"; 4575 PrintAttributes(that); 4576 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4577 Visit(that->on_success()); 4578 } 4579 4580 4581 void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4582 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" 4583 << that->end_register() << "\", shape=doubleoctagon];\n"; 4584 PrintAttributes(that); 4585 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4586 Visit(that->on_success()); 4587 } 4588 4589 4590 void DotPrinter::VisitEnd(EndNode* that) { 4591 os_ << " n" << that << " [style=bold, shape=point];\n"; 4592 PrintAttributes(that); 4593 } 4594 4595 4596 void DotPrinter::VisitAssertion(AssertionNode* that) { 4597 os_ << " n" << that << " ["; 4598 switch (that->assertion_type()) { 4599 case AssertionNode::AT_END: 4600 os_ << "label=\"$\", shape=septagon"; 4601 break; 4602 case AssertionNode::AT_START: 4603 os_ << "label=\"^\", shape=septagon"; 4604 break; 4605 case AssertionNode::AT_BOUNDARY: 4606 os_ << "label=\"\\b\", shape=septagon"; 4607 break; 4608 case AssertionNode::AT_NON_BOUNDARY: 4609 os_ << "label=\"\\B\", shape=septagon"; 4610 break; 4611 case AssertionNode::AFTER_NEWLINE: 4612 os_ << "label=\"(?<=\\n)\", shape=septagon"; 4613 break; 4614 } 4615 os_ << "];\n"; 4616 PrintAttributes(that); 4617 RegExpNode* successor = that->on_success(); 4618 os_ << " n" << that << " -> n" << successor << ";\n"; 4619 Visit(successor); 4620 } 4621 4622 4623 void DotPrinter::VisitAction(ActionNode* that) { 4624 os_ << " n" << that << " ["; 4625 switch (that->action_type_) { 4626 case ActionNode::SET_REGISTER: 4627 os_ << "label=\"$" << that->data_.u_store_register.reg 4628 << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; 4629 break; 4630 case ActionNode::INCREMENT_REGISTER: 4631 os_ << "label=\"$" << that->data_.u_increment_register.reg 4632 << "++\", shape=octagon"; 4633 break; 4634 case ActionNode::STORE_POSITION: 4635 os_ << "label=\"$" << that->data_.u_position_register.reg 4636 << ":=$pos\", shape=octagon"; 4637 break; 4638 case ActionNode::BEGIN_SUBMATCH: 4639 os_ << "label=\"$" << that->data_.u_submatch.current_position_register 4640 << ":=$pos,begin\", shape=septagon"; 4641 break; 4642 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4643 os_ << "label=\"escape\", shape=septagon"; 4644 break; 4645 case ActionNode::EMPTY_MATCH_CHECK: 4646 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register 4647 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register 4648 << "<" << that->data_.u_empty_match_check.repetition_limit 4649 << "?\", shape=septagon"; 4650 break; 4651 case ActionNode::CLEAR_CAPTURES: { 4652 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from 4653 << " to $" << that->data_.u_clear_captures.range_to 4654 << "\", shape=septagon"; 4655 break; 4656 } 4657 } 4658 os_ << "];\n"; 4659 PrintAttributes(that); 4660 RegExpNode* successor = that->on_success(); 4661 os_ << " n" << that << " -> n" << successor << ";\n"; 4662 Visit(successor); 4663 } 4664 4665 4666 class DispatchTableDumper { 4667 public: 4668 explicit DispatchTableDumper(std::ostream& os) : os_(os) {} 4669 void Call(uc16 key, DispatchTable::Entry entry); 4670 private: 4671 std::ostream& os_; 4672 }; 4673 4674 4675 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4676 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {"; 4677 OutSet* set = entry.out_set(); 4678 bool first = true; 4679 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4680 if (set->Get(i)) { 4681 if (first) { 4682 first = false; 4683 } else { 4684 os_ << ", "; 4685 } 4686 os_ << i; 4687 } 4688 } 4689 os_ << "}\n"; 4690 } 4691 4692 4693 void DispatchTable::Dump() { 4694 OFStream os(stderr); 4695 DispatchTableDumper dumper(os); 4696 tree()->ForEach(&dumper); 4697 } 4698 4699 4700 void RegExpEngine::DotPrint(const char* label, 4701 RegExpNode* node, 4702 bool ignore_case) { 4703 OFStream os(stdout); 4704 DotPrinter printer(os, ignore_case); 4705 printer.PrintNode(label, node); 4706 } 4707 4708 4709 #endif // DEBUG 4710 4711 4712 // ------------------------------------------------------------------- 4713 // Tree to graph conversion 4714 4715 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4716 RegExpNode* on_success) { 4717 ZoneList<TextElement>* elms = 4718 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4719 elms->Add(TextElement::Atom(this), compiler->zone()); 4720 return new (compiler->zone()) 4721 TextNode(elms, compiler->read_backward(), on_success); 4722 } 4723 4724 4725 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4726 RegExpNode* on_success) { 4727 return new (compiler->zone()) 4728 TextNode(elements(), compiler->read_backward(), on_success); 4729 } 4730 4731 4732 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4733 const int* special_class, 4734 int length) { 4735 length--; // Remove final 0x10000. 4736 DCHECK(special_class[length] == 0x10000); 4737 DCHECK(ranges->length() != 0); 4738 DCHECK(length != 0); 4739 DCHECK(special_class[0] != 0); 4740 if (ranges->length() != (length >> 1) + 1) { 4741 return false; 4742 } 4743 CharacterRange range = ranges->at(0); 4744 if (range.from() != 0) { 4745 return false; 4746 } 4747 for (int i = 0; i < length; i += 2) { 4748 if (special_class[i] != (range.to() + 1)) { 4749 return false; 4750 } 4751 range = ranges->at((i >> 1) + 1); 4752 if (special_class[i+1] != range.from()) { 4753 return false; 4754 } 4755 } 4756 if (range.to() != 0xffff) { 4757 return false; 4758 } 4759 return true; 4760 } 4761 4762 4763 static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4764 const int* special_class, 4765 int length) { 4766 length--; // Remove final 0x10000. 4767 DCHECK(special_class[length] == 0x10000); 4768 if (ranges->length() * 2 != length) { 4769 return false; 4770 } 4771 for (int i = 0; i < length; i += 2) { 4772 CharacterRange range = ranges->at(i >> 1); 4773 if (range.from() != special_class[i] || 4774 range.to() != special_class[i + 1] - 1) { 4775 return false; 4776 } 4777 } 4778 return true; 4779 } 4780 4781 4782 bool RegExpCharacterClass::is_standard(Zone* zone) { 4783 // TODO(lrn): Remove need for this function, by not throwing away information 4784 // along the way. 4785 if (is_negated_) { 4786 return false; 4787 } 4788 if (set_.is_standard()) { 4789 return true; 4790 } 4791 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4792 set_.set_standard_set_type('s'); 4793 return true; 4794 } 4795 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4796 set_.set_standard_set_type('S'); 4797 return true; 4798 } 4799 if (CompareInverseRanges(set_.ranges(zone), 4800 kLineTerminatorRanges, 4801 kLineTerminatorRangeCount)) { 4802 set_.set_standard_set_type('.'); 4803 return true; 4804 } 4805 if (CompareRanges(set_.ranges(zone), 4806 kLineTerminatorRanges, 4807 kLineTerminatorRangeCount)) { 4808 set_.set_standard_set_type('n'); 4809 return true; 4810 } 4811 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4812 set_.set_standard_set_type('w'); 4813 return true; 4814 } 4815 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4816 set_.set_standard_set_type('W'); 4817 return true; 4818 } 4819 return false; 4820 } 4821 4822 4823 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4824 RegExpNode* on_success) { 4825 return new (compiler->zone()) 4826 TextNode(this, compiler->read_backward(), on_success); 4827 } 4828 4829 4830 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 4831 RegExpAtom* atom1 = (*a)->AsAtom(); 4832 RegExpAtom* atom2 = (*b)->AsAtom(); 4833 uc16 character1 = atom1->data().at(0); 4834 uc16 character2 = atom2->data().at(0); 4835 if (character1 < character2) return -1; 4836 if (character1 > character2) return 1; 4837 return 0; 4838 } 4839 4840 4841 static unibrow::uchar Canonical( 4842 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 4843 unibrow::uchar c) { 4844 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 4845 int length = canonicalize->get(c, '\0', chars); 4846 DCHECK_LE(length, 1); 4847 unibrow::uchar canonical = c; 4848 if (length == 1) canonical = chars[0]; 4849 return canonical; 4850 } 4851 4852 4853 int CompareFirstCharCaseIndependent( 4854 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 4855 RegExpTree* const* a, RegExpTree* const* b) { 4856 RegExpAtom* atom1 = (*a)->AsAtom(); 4857 RegExpAtom* atom2 = (*b)->AsAtom(); 4858 unibrow::uchar character1 = atom1->data().at(0); 4859 unibrow::uchar character2 = atom2->data().at(0); 4860 if (character1 == character2) return 0; 4861 if (character1 >= 'a' || character2 >= 'a') { 4862 character1 = Canonical(canonicalize, character1); 4863 character2 = Canonical(canonicalize, character2); 4864 } 4865 return static_cast<int>(character1) - static_cast<int>(character2); 4866 } 4867 4868 4869 // We can stable sort runs of atoms, since the order does not matter if they 4870 // start with different characters. 4871 // Returns true if any consecutive atoms were found. 4872 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 4873 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4874 int length = alternatives->length(); 4875 bool found_consecutive_atoms = false; 4876 for (int i = 0; i < length; i++) { 4877 while (i < length) { 4878 RegExpTree* alternative = alternatives->at(i); 4879 if (alternative->IsAtom()) break; 4880 i++; 4881 } 4882 // i is length or it is the index of an atom. 4883 if (i == length) break; 4884 int first_atom = i; 4885 i++; 4886 while (i < length) { 4887 RegExpTree* alternative = alternatives->at(i); 4888 if (!alternative->IsAtom()) break; 4889 i++; 4890 } 4891 // Sort atoms to get ones with common prefixes together. 4892 // This step is more tricky if we are in a case-independent regexp, 4893 // because it would change /is|I/ to /I|is/, and order matters when 4894 // the regexp parts don't match only disjoint starting points. To fix 4895 // this we have a version of CompareFirstChar that uses case- 4896 // independent character classes for comparison. 4897 DCHECK_LT(first_atom, alternatives->length()); 4898 DCHECK_LE(i, alternatives->length()); 4899 DCHECK_LE(first_atom, i); 4900 if (compiler->ignore_case()) { 4901 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 4902 compiler->isolate()->regexp_macro_assembler_canonicalize(); 4903 auto compare_closure = 4904 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { 4905 return CompareFirstCharCaseIndependent(canonicalize, a, b); 4906 }; 4907 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 4908 } else { 4909 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 4910 } 4911 if (i - first_atom > 1) found_consecutive_atoms = true; 4912 } 4913 return found_consecutive_atoms; 4914 } 4915 4916 4917 // Optimizes ab|ac|az to a(?:b|c|d). 4918 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 4919 Zone* zone = compiler->zone(); 4920 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4921 int length = alternatives->length(); 4922 4923 int write_posn = 0; 4924 int i = 0; 4925 while (i < length) { 4926 RegExpTree* alternative = alternatives->at(i); 4927 if (!alternative->IsAtom()) { 4928 alternatives->at(write_posn++) = alternatives->at(i); 4929 i++; 4930 continue; 4931 } 4932 RegExpAtom* atom = alternative->AsAtom(); 4933 unibrow::uchar common_prefix = atom->data().at(0); 4934 int first_with_prefix = i; 4935 int prefix_length = atom->length(); 4936 i++; 4937 while (i < length) { 4938 alternative = alternatives->at(i); 4939 if (!alternative->IsAtom()) break; 4940 atom = alternative->AsAtom(); 4941 unibrow::uchar new_prefix = atom->data().at(0); 4942 if (new_prefix != common_prefix) { 4943 if (!compiler->ignore_case()) break; 4944 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 4945 compiler->isolate()->regexp_macro_assembler_canonicalize(); 4946 new_prefix = Canonical(canonicalize, new_prefix); 4947 common_prefix = Canonical(canonicalize, common_prefix); 4948 if (new_prefix != common_prefix) break; 4949 } 4950 prefix_length = Min(prefix_length, atom->length()); 4951 i++; 4952 } 4953 if (i > first_with_prefix + 2) { 4954 // Found worthwhile run of alternatives with common prefix of at least one 4955 // character. The sorting function above did not sort on more than one 4956 // character for reasons of correctness, but there may still be a longer 4957 // common prefix if the terms were similar or presorted in the input. 4958 // Find out how long the common prefix is. 4959 int run_length = i - first_with_prefix; 4960 atom = alternatives->at(first_with_prefix)->AsAtom(); 4961 for (int j = 1; j < run_length && prefix_length > 1; j++) { 4962 RegExpAtom* old_atom = 4963 alternatives->at(j + first_with_prefix)->AsAtom(); 4964 for (int k = 1; k < prefix_length; k++) { 4965 if (atom->data().at(k) != old_atom->data().at(k)) { 4966 prefix_length = k; 4967 break; 4968 } 4969 } 4970 } 4971 RegExpAtom* prefix = 4972 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); 4973 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 4974 pair->Add(prefix, zone); 4975 ZoneList<RegExpTree*>* suffixes = 4976 new (zone) ZoneList<RegExpTree*>(run_length, zone); 4977 for (int j = 0; j < run_length; j++) { 4978 RegExpAtom* old_atom = 4979 alternatives->at(j + first_with_prefix)->AsAtom(); 4980 int len = old_atom->length(); 4981 if (len == prefix_length) { 4982 suffixes->Add(new (zone) RegExpEmpty(), zone); 4983 } else { 4984 RegExpTree* suffix = new (zone) RegExpAtom( 4985 old_atom->data().SubVector(prefix_length, old_atom->length())); 4986 suffixes->Add(suffix, zone); 4987 } 4988 } 4989 pair->Add(new (zone) RegExpDisjunction(suffixes), zone); 4990 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); 4991 } else { 4992 // Just copy any non-worthwhile alternatives. 4993 for (int j = first_with_prefix; j < i; j++) { 4994 alternatives->at(write_posn++) = alternatives->at(j); 4995 } 4996 } 4997 } 4998 alternatives->Rewind(write_posn); // Trim end of array. 4999 } 5000 5001 5002 // Optimizes b|c|z to [bcz]. 5003 void RegExpDisjunction::FixSingleCharacterDisjunctions( 5004 RegExpCompiler* compiler) { 5005 Zone* zone = compiler->zone(); 5006 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5007 int length = alternatives->length(); 5008 5009 int write_posn = 0; 5010 int i = 0; 5011 while (i < length) { 5012 RegExpTree* alternative = alternatives->at(i); 5013 if (!alternative->IsAtom()) { 5014 alternatives->at(write_posn++) = alternatives->at(i); 5015 i++; 5016 continue; 5017 } 5018 RegExpAtom* atom = alternative->AsAtom(); 5019 if (atom->length() != 1) { 5020 alternatives->at(write_posn++) = alternatives->at(i); 5021 i++; 5022 continue; 5023 } 5024 int first_in_run = i; 5025 i++; 5026 while (i < length) { 5027 alternative = alternatives->at(i); 5028 if (!alternative->IsAtom()) break; 5029 atom = alternative->AsAtom(); 5030 if (atom->length() != 1) break; 5031 i++; 5032 } 5033 if (i > first_in_run + 1) { 5034 // Found non-trivial run of single-character alternatives. 5035 int run_length = i - first_in_run; 5036 ZoneList<CharacterRange>* ranges = 5037 new (zone) ZoneList<CharacterRange>(2, zone); 5038 for (int j = 0; j < run_length; j++) { 5039 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 5040 DCHECK_EQ(old_atom->length(), 1); 5041 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 5042 } 5043 alternatives->at(write_posn++) = 5044 new (zone) RegExpCharacterClass(ranges, false); 5045 } else { 5046 // Just copy any trivial alternatives. 5047 for (int j = first_in_run; j < i; j++) { 5048 alternatives->at(write_posn++) = alternatives->at(j); 5049 } 5050 } 5051 } 5052 alternatives->Rewind(write_posn); // Trim end of array. 5053 } 5054 5055 5056 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 5057 RegExpNode* on_success) { 5058 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5059 5060 if (alternatives->length() > 2) { 5061 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 5062 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 5063 FixSingleCharacterDisjunctions(compiler); 5064 if (alternatives->length() == 1) { 5065 return alternatives->at(0)->ToNode(compiler, on_success); 5066 } 5067 } 5068 5069 int length = alternatives->length(); 5070 5071 ChoiceNode* result = 5072 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5073 for (int i = 0; i < length; i++) { 5074 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5075 on_success)); 5076 result->AddAlternative(alternative); 5077 } 5078 return result; 5079 } 5080 5081 5082 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 5083 RegExpNode* on_success) { 5084 return ToNode(min(), 5085 max(), 5086 is_greedy(), 5087 body(), 5088 compiler, 5089 on_success); 5090 } 5091 5092 5093 // Scoped object to keep track of how much we unroll quantifier loops in the 5094 // regexp graph generator. 5095 class RegExpExpansionLimiter { 5096 public: 5097 static const int kMaxExpansionFactor = 6; 5098 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 5099 : compiler_(compiler), 5100 saved_expansion_factor_(compiler->current_expansion_factor()), 5101 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 5102 DCHECK(factor > 0); 5103 if (ok_to_expand_) { 5104 if (factor > kMaxExpansionFactor) { 5105 // Avoid integer overflow of the current expansion factor. 5106 ok_to_expand_ = false; 5107 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 5108 } else { 5109 int new_factor = saved_expansion_factor_ * factor; 5110 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 5111 compiler->set_current_expansion_factor(new_factor); 5112 } 5113 } 5114 } 5115 5116 ~RegExpExpansionLimiter() { 5117 compiler_->set_current_expansion_factor(saved_expansion_factor_); 5118 } 5119 5120 bool ok_to_expand() { return ok_to_expand_; } 5121 5122 private: 5123 RegExpCompiler* compiler_; 5124 int saved_expansion_factor_; 5125 bool ok_to_expand_; 5126 5127 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 5128 }; 5129 5130 5131 RegExpNode* RegExpQuantifier::ToNode(int min, 5132 int max, 5133 bool is_greedy, 5134 RegExpTree* body, 5135 RegExpCompiler* compiler, 5136 RegExpNode* on_success, 5137 bool not_at_start) { 5138 // x{f, t} becomes this: 5139 // 5140 // (r++)<-. 5141 // | ` 5142 // | (x) 5143 // v ^ 5144 // (r=0)-->(?)---/ [if r < t] 5145 // | 5146 // [if r >= f] \----> ... 5147 // 5148 5149 // 15.10.2.5 RepeatMatcher algorithm. 5150 // The parser has already eliminated the case where max is 0. In the case 5151 // where max_match is zero the parser has removed the quantifier if min was 5152 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 5153 5154 // If we know that we cannot match zero length then things are a little 5155 // simpler since we don't need to make the special zero length match check 5156 // from step 2.1. If the min and max are small we can unroll a little in 5157 // this case. 5158 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 5159 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 5160 if (max == 0) return on_success; // This can happen due to recursion. 5161 bool body_can_be_empty = (body->min_match() == 0); 5162 int body_start_reg = RegExpCompiler::kNoRegister; 5163 Interval capture_registers = body->CaptureRegisters(); 5164 bool needs_capture_clearing = !capture_registers.is_empty(); 5165 Zone* zone = compiler->zone(); 5166 5167 if (body_can_be_empty) { 5168 body_start_reg = compiler->AllocateRegister(); 5169 } else if (compiler->optimize() && !needs_capture_clearing) { 5170 // Only unroll if there are no captures and the body can't be 5171 // empty. 5172 { 5173 RegExpExpansionLimiter limiter( 5174 compiler, min + ((max != min) ? 1 : 0)); 5175 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 5176 int new_max = (max == kInfinity) ? max : max - min; 5177 // Recurse once to get the loop or optional matches after the fixed 5178 // ones. 5179 RegExpNode* answer = ToNode( 5180 0, new_max, is_greedy, body, compiler, on_success, true); 5181 // Unroll the forced matches from 0 to min. This can cause chains of 5182 // TextNodes (which the parser does not generate). These should be 5183 // combined if it turns out they hinder good code generation. 5184 for (int i = 0; i < min; i++) { 5185 answer = body->ToNode(compiler, answer); 5186 } 5187 return answer; 5188 } 5189 } 5190 if (max <= kMaxUnrolledMaxMatches && min == 0) { 5191 DCHECK(max > 0); // Due to the 'if' above. 5192 RegExpExpansionLimiter limiter(compiler, max); 5193 if (limiter.ok_to_expand()) { 5194 // Unroll the optional matches up to max. 5195 RegExpNode* answer = on_success; 5196 for (int i = 0; i < max; i++) { 5197 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 5198 if (is_greedy) { 5199 alternation->AddAlternative( 5200 GuardedAlternative(body->ToNode(compiler, answer))); 5201 alternation->AddAlternative(GuardedAlternative(on_success)); 5202 } else { 5203 alternation->AddAlternative(GuardedAlternative(on_success)); 5204 alternation->AddAlternative( 5205 GuardedAlternative(body->ToNode(compiler, answer))); 5206 } 5207 answer = alternation; 5208 if (not_at_start && !compiler->read_backward()) { 5209 alternation->set_not_at_start(); 5210 } 5211 } 5212 return answer; 5213 } 5214 } 5215 } 5216 bool has_min = min > 0; 5217 bool has_max = max < RegExpTree::kInfinity; 5218 bool needs_counter = has_min || has_max; 5219 int reg_ctr = needs_counter 5220 ? compiler->AllocateRegister() 5221 : RegExpCompiler::kNoRegister; 5222 LoopChoiceNode* center = new (zone) 5223 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone); 5224 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 5225 RegExpNode* loop_return = needs_counter 5226 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 5227 : static_cast<RegExpNode*>(center); 5228 if (body_can_be_empty) { 5229 // If the body can be empty we need to check if it was and then 5230 // backtrack. 5231 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 5232 reg_ctr, 5233 min, 5234 loop_return); 5235 } 5236 RegExpNode* body_node = body->ToNode(compiler, loop_return); 5237 if (body_can_be_empty) { 5238 // If the body can be empty we need to store the start position 5239 // so we can bail out if it was empty. 5240 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 5241 } 5242 if (needs_capture_clearing) { 5243 // Before entering the body of this loop we need to clear captures. 5244 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 5245 } 5246 GuardedAlternative body_alt(body_node); 5247 if (has_max) { 5248 Guard* body_guard = 5249 new(zone) Guard(reg_ctr, Guard::LT, max); 5250 body_alt.AddGuard(body_guard, zone); 5251 } 5252 GuardedAlternative rest_alt(on_success); 5253 if (has_min) { 5254 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 5255 rest_alt.AddGuard(rest_guard, zone); 5256 } 5257 if (is_greedy) { 5258 center->AddLoopAlternative(body_alt); 5259 center->AddContinueAlternative(rest_alt); 5260 } else { 5261 center->AddContinueAlternative(rest_alt); 5262 center->AddLoopAlternative(body_alt); 5263 } 5264 if (needs_counter) { 5265 return ActionNode::SetRegister(reg_ctr, 0, center); 5266 } else { 5267 return center; 5268 } 5269 } 5270 5271 5272 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5273 RegExpNode* on_success) { 5274 NodeInfo info; 5275 Zone* zone = compiler->zone(); 5276 5277 switch (assertion_type()) { 5278 case START_OF_LINE: 5279 return AssertionNode::AfterNewline(on_success); 5280 case START_OF_INPUT: 5281 return AssertionNode::AtStart(on_success); 5282 case BOUNDARY: 5283 return AssertionNode::AtBoundary(on_success); 5284 case NON_BOUNDARY: 5285 return AssertionNode::AtNonBoundary(on_success); 5286 case END_OF_INPUT: 5287 return AssertionNode::AtEnd(on_success); 5288 case END_OF_LINE: { 5289 // Compile $ in multiline regexps as an alternation with a positive 5290 // lookahead in one side and an end-of-input on the other side. 5291 // We need two registers for the lookahead. 5292 int stack_pointer_register = compiler->AllocateRegister(); 5293 int position_register = compiler->AllocateRegister(); 5294 // The ChoiceNode to distinguish between a newline and end-of-input. 5295 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5296 // Create a newline atom. 5297 ZoneList<CharacterRange>* newline_ranges = 5298 new(zone) ZoneList<CharacterRange>(3, zone); 5299 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5300 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); 5301 TextNode* newline_matcher = new (zone) TextNode( 5302 newline_atom, false, ActionNode::PositiveSubmatchSuccess( 5303 stack_pointer_register, position_register, 5304 0, // No captures inside. 5305 -1, // Ignored if no captures. 5306 on_success)); 5307 // Create an end-of-input matcher. 5308 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5309 stack_pointer_register, 5310 position_register, 5311 newline_matcher); 5312 // Add the two alternatives to the ChoiceNode. 5313 GuardedAlternative eol_alternative(end_of_line); 5314 result->AddAlternative(eol_alternative); 5315 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5316 result->AddAlternative(end_alternative); 5317 return result; 5318 } 5319 default: 5320 UNREACHABLE(); 5321 } 5322 return on_success; 5323 } 5324 5325 5326 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5327 RegExpNode* on_success) { 5328 return new (compiler->zone()) 5329 BackReferenceNode(RegExpCapture::StartRegister(index()), 5330 RegExpCapture::EndRegister(index()), 5331 compiler->read_backward(), on_success); 5332 } 5333 5334 5335 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5336 RegExpNode* on_success) { 5337 return on_success; 5338 } 5339 5340 5341 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 5342 RegExpNode* on_success) { 5343 int stack_pointer_register = compiler->AllocateRegister(); 5344 int position_register = compiler->AllocateRegister(); 5345 5346 const int registers_per_capture = 2; 5347 const int register_of_first_capture = 2; 5348 int register_count = capture_count_ * registers_per_capture; 5349 int register_start = 5350 register_of_first_capture + capture_from_ * registers_per_capture; 5351 5352 RegExpNode* result; 5353 bool was_reading_backward = compiler->read_backward(); 5354 compiler->set_read_backward(type() == LOOKBEHIND); 5355 if (is_positive()) { 5356 result = ActionNode::BeginSubmatch( 5357 stack_pointer_register, position_register, 5358 body()->ToNode(compiler, 5359 ActionNode::PositiveSubmatchSuccess( 5360 stack_pointer_register, position_register, 5361 register_count, register_start, on_success))); 5362 } else { 5363 // We use a ChoiceNode for a negative lookahead because it has most of 5364 // the characteristics we need. It has the body of the lookahead as its 5365 // first alternative and the expression after the lookahead of the second 5366 // alternative. If the first alternative succeeds then the 5367 // NegativeSubmatchSuccess will unwind the stack including everything the 5368 // choice node set up and backtrack. If the first alternative fails then 5369 // the second alternative is tried, which is exactly the desired result 5370 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special 5371 // ChoiceNode that knows to ignore the first exit when calculating quick 5372 // checks. 5373 Zone* zone = compiler->zone(); 5374 5375 GuardedAlternative body_alt( 5376 body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess( 5377 stack_pointer_register, position_register, 5378 register_count, register_start, zone))); 5379 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( 5380 body_alt, GuardedAlternative(on_success), zone); 5381 result = ActionNode::BeginSubmatch(stack_pointer_register, 5382 position_register, choice_node); 5383 } 5384 compiler->set_read_backward(was_reading_backward); 5385 return result; 5386 } 5387 5388 5389 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5390 RegExpNode* on_success) { 5391 return ToNode(body(), index(), compiler, on_success); 5392 } 5393 5394 5395 RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5396 int index, 5397 RegExpCompiler* compiler, 5398 RegExpNode* on_success) { 5399 DCHECK_NOT_NULL(body); 5400 int start_reg = RegExpCapture::StartRegister(index); 5401 int end_reg = RegExpCapture::EndRegister(index); 5402 if (compiler->read_backward()) std::swap(start_reg, end_reg); 5403 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5404 RegExpNode* body_node = body->ToNode(compiler, store_end); 5405 return ActionNode::StorePosition(start_reg, true, body_node); 5406 } 5407 5408 5409 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5410 RegExpNode* on_success) { 5411 ZoneList<RegExpTree*>* children = nodes(); 5412 RegExpNode* current = on_success; 5413 if (compiler->read_backward()) { 5414 for (int i = 0; i < children->length(); i++) { 5415 current = children->at(i)->ToNode(compiler, current); 5416 } 5417 } else { 5418 for (int i = children->length() - 1; i >= 0; i--) { 5419 current = children->at(i)->ToNode(compiler, current); 5420 } 5421 } 5422 return current; 5423 } 5424 5425 5426 static void AddClass(const int* elmv, 5427 int elmc, 5428 ZoneList<CharacterRange>* ranges, 5429 Zone* zone) { 5430 elmc--; 5431 DCHECK(elmv[elmc] == 0x10000); 5432 for (int i = 0; i < elmc; i += 2) { 5433 DCHECK(elmv[i] < elmv[i + 1]); 5434 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone); 5435 } 5436 } 5437 5438 5439 static void AddClassNegated(const int *elmv, 5440 int elmc, 5441 ZoneList<CharacterRange>* ranges, 5442 Zone* zone) { 5443 elmc--; 5444 DCHECK(elmv[elmc] == 0x10000); 5445 DCHECK(elmv[0] != 0x0000); 5446 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit); 5447 uc16 last = 0x0000; 5448 for (int i = 0; i < elmc; i += 2) { 5449 DCHECK(last <= elmv[i] - 1); 5450 DCHECK(elmv[i] < elmv[i + 1]); 5451 ranges->Add(CharacterRange(last, elmv[i] - 1), zone); 5452 last = elmv[i + 1]; 5453 } 5454 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone); 5455 } 5456 5457 5458 void CharacterRange::AddClassEscape(uc16 type, 5459 ZoneList<CharacterRange>* ranges, 5460 Zone* zone) { 5461 switch (type) { 5462 case 's': 5463 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5464 break; 5465 case 'S': 5466 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5467 break; 5468 case 'w': 5469 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5470 break; 5471 case 'W': 5472 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5473 break; 5474 case 'd': 5475 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5476 break; 5477 case 'D': 5478 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5479 break; 5480 case '.': 5481 AddClassNegated(kLineTerminatorRanges, 5482 kLineTerminatorRangeCount, 5483 ranges, 5484 zone); 5485 break; 5486 // This is not a character range as defined by the spec but a 5487 // convenient shorthand for a character class that matches any 5488 // character. 5489 case '*': 5490 ranges->Add(CharacterRange::Everything(), zone); 5491 break; 5492 // This is the set of characters matched by the $ and ^ symbols 5493 // in multiline mode. 5494 case 'n': 5495 AddClass(kLineTerminatorRanges, 5496 kLineTerminatorRangeCount, 5497 ranges, 5498 zone); 5499 break; 5500 default: 5501 UNREACHABLE(); 5502 } 5503 } 5504 5505 5506 Vector<const int> CharacterRange::GetWordBounds() { 5507 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5508 } 5509 5510 5511 class CharacterRangeSplitter { 5512 public: 5513 CharacterRangeSplitter(ZoneList<CharacterRange>** included, 5514 ZoneList<CharacterRange>** excluded, 5515 Zone* zone) 5516 : included_(included), 5517 excluded_(excluded), 5518 zone_(zone) { } 5519 void Call(uc16 from, DispatchTable::Entry entry); 5520 5521 static const int kInBase = 0; 5522 static const int kInOverlay = 1; 5523 5524 private: 5525 ZoneList<CharacterRange>** included_; 5526 ZoneList<CharacterRange>** excluded_; 5527 Zone* zone_; 5528 }; 5529 5530 5531 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { 5532 if (!entry.out_set()->Get(kInBase)) return; 5533 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay) 5534 ? included_ 5535 : excluded_; 5536 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_); 5537 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_); 5538 } 5539 5540 5541 void CharacterRange::Split(ZoneList<CharacterRange>* base, 5542 Vector<const int> overlay, 5543 ZoneList<CharacterRange>** included, 5544 ZoneList<CharacterRange>** excluded, 5545 Zone* zone) { 5546 DCHECK_NULL(*included); 5547 DCHECK_NULL(*excluded); 5548 DispatchTable table(zone); 5549 for (int i = 0; i < base->length(); i++) 5550 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone); 5551 for (int i = 0; i < overlay.length(); i += 2) { 5552 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), 5553 CharacterRangeSplitter::kInOverlay, zone); 5554 } 5555 CharacterRangeSplitter callback(included, excluded, zone); 5556 table.ForEach(&callback); 5557 } 5558 5559 5560 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 5561 ZoneList<CharacterRange>* ranges, 5562 bool is_one_byte) { 5563 uc16 bottom = from(); 5564 uc16 top = to(); 5565 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { 5566 if (bottom > String::kMaxOneByteCharCode) return; 5567 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5568 } 5569 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5570 if (top == bottom) { 5571 // If this is a singleton we just expand the one character. 5572 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5573 for (int i = 0; i < length; i++) { 5574 uc32 chr = chars[i]; 5575 if (chr != bottom) { 5576 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5577 } 5578 } 5579 } else { 5580 // If this is a range we expand the characters block by block, 5581 // expanding contiguous subranges (blocks) one at a time. 5582 // The approach is as follows. For a given start character we 5583 // look up the remainder of the block that contains it (represented 5584 // by the end point), for instance we find 'z' if the character 5585 // is 'c'. A block is characterized by the property 5586 // that all characters uncanonicalize in the same way, except that 5587 // each entry in the result is incremented by the distance from the first 5588 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and 5589 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. 5590 // Once we've found the end point we look up its uncanonicalization 5591 // and produce a range for each element. For instance for [c-f] 5592 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only 5593 // add a range if it is not already contained in the input, so [c-f] 5594 // will be skipped but [C-F] will be added. If this range is not 5595 // completely contained in a block we do this for all the blocks 5596 // covered by the range (handling characters that is not in a block 5597 // as a "singleton block"). 5598 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5599 int pos = bottom; 5600 while (pos <= top) { 5601 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); 5602 uc16 block_end; 5603 if (length == 0) { 5604 block_end = pos; 5605 } else { 5606 DCHECK_EQ(1, length); 5607 block_end = range[0]; 5608 } 5609 int end = (block_end > top) ? top : block_end; 5610 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); 5611 for (int i = 0; i < length; i++) { 5612 uc32 c = range[i]; 5613 uc16 range_from = c - (block_end - pos); 5614 uc16 range_to = c - (block_end - end); 5615 if (!(bottom <= range_from && range_to <= top)) { 5616 ranges->Add(CharacterRange(range_from, range_to), zone); 5617 } 5618 } 5619 pos = end + 1; 5620 } 5621 } 5622 } 5623 5624 5625 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5626 DCHECK_NOT_NULL(ranges); 5627 int n = ranges->length(); 5628 if (n <= 1) return true; 5629 int max = ranges->at(0).to(); 5630 for (int i = 1; i < n; i++) { 5631 CharacterRange next_range = ranges->at(i); 5632 if (next_range.from() <= max + 1) return false; 5633 max = next_range.to(); 5634 } 5635 return true; 5636 } 5637 5638 5639 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5640 if (ranges_ == NULL) { 5641 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5642 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5643 } 5644 return ranges_; 5645 } 5646 5647 5648 // Move a number of elements in a zonelist to another position 5649 // in the same list. Handles overlapping source and target areas. 5650 static void MoveRanges(ZoneList<CharacterRange>* list, 5651 int from, 5652 int to, 5653 int count) { 5654 // Ranges are potentially overlapping. 5655 if (from < to) { 5656 for (int i = count - 1; i >= 0; i--) { 5657 list->at(to + i) = list->at(from + i); 5658 } 5659 } else { 5660 for (int i = 0; i < count; i++) { 5661 list->at(to + i) = list->at(from + i); 5662 } 5663 } 5664 } 5665 5666 5667 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 5668 int count, 5669 CharacterRange insert) { 5670 // Inserts a range into list[0..count[, which must be sorted 5671 // by from value and non-overlapping and non-adjacent, using at most 5672 // list[0..count] for the result. Returns the number of resulting 5673 // canonicalized ranges. Inserting a range may collapse existing ranges into 5674 // fewer ranges, so the return value can be anything in the range 1..count+1. 5675 uc16 from = insert.from(); 5676 uc16 to = insert.to(); 5677 int start_pos = 0; 5678 int end_pos = count; 5679 for (int i = count - 1; i >= 0; i--) { 5680 CharacterRange current = list->at(i); 5681 if (current.from() > to + 1) { 5682 end_pos = i; 5683 } else if (current.to() + 1 < from) { 5684 start_pos = i + 1; 5685 break; 5686 } 5687 } 5688 5689 // Inserted range overlaps, or is adjacent to, ranges at positions 5690 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 5691 // not affected by the insertion. 5692 // If start_pos == end_pos, the range must be inserted before start_pos. 5693 // if start_pos < end_pos, the entire range from start_pos to end_pos 5694 // must be merged with the insert range. 5695 5696 if (start_pos == end_pos) { 5697 // Insert between existing ranges at position start_pos. 5698 if (start_pos < count) { 5699 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 5700 } 5701 list->at(start_pos) = insert; 5702 return count + 1; 5703 } 5704 if (start_pos + 1 == end_pos) { 5705 // Replace single existing range at position start_pos. 5706 CharacterRange to_replace = list->at(start_pos); 5707 int new_from = Min(to_replace.from(), from); 5708 int new_to = Max(to_replace.to(), to); 5709 list->at(start_pos) = CharacterRange(new_from, new_to); 5710 return count; 5711 } 5712 // Replace a number of existing ranges from start_pos to end_pos - 1. 5713 // Move the remaining ranges down. 5714 5715 int new_from = Min(list->at(start_pos).from(), from); 5716 int new_to = Max(list->at(end_pos - 1).to(), to); 5717 if (end_pos < count) { 5718 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 5719 } 5720 list->at(start_pos) = CharacterRange(new_from, new_to); 5721 return count - (end_pos - start_pos) + 1; 5722 } 5723 5724 5725 void CharacterSet::Canonicalize() { 5726 // Special/default classes are always considered canonical. The result 5727 // of calling ranges() will be sorted. 5728 if (ranges_ == NULL) return; 5729 CharacterRange::Canonicalize(ranges_); 5730 } 5731 5732 5733 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 5734 if (character_ranges->length() <= 1) return; 5735 // Check whether ranges are already canonical (increasing, non-overlapping, 5736 // non-adjacent). 5737 int n = character_ranges->length(); 5738 int max = character_ranges->at(0).to(); 5739 int i = 1; 5740 while (i < n) { 5741 CharacterRange current = character_ranges->at(i); 5742 if (current.from() <= max + 1) { 5743 break; 5744 } 5745 max = current.to(); 5746 i++; 5747 } 5748 // Canonical until the i'th range. If that's all of them, we are done. 5749 if (i == n) return; 5750 5751 // The ranges at index i and forward are not canonicalized. Make them so by 5752 // doing the equivalent of insertion sort (inserting each into the previous 5753 // list, in order). 5754 // Notice that inserting a range can reduce the number of ranges in the 5755 // result due to combining of adjacent and overlapping ranges. 5756 int read = i; // Range to insert. 5757 int num_canonical = i; // Length of canonicalized part of list. 5758 do { 5759 num_canonical = InsertRangeInCanonicalList(character_ranges, 5760 num_canonical, 5761 character_ranges->at(read)); 5762 read++; 5763 } while (read < n); 5764 character_ranges->Rewind(num_canonical); 5765 5766 DCHECK(CharacterRange::IsCanonical(character_ranges)); 5767 } 5768 5769 5770 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 5771 ZoneList<CharacterRange>* negated_ranges, 5772 Zone* zone) { 5773 DCHECK(CharacterRange::IsCanonical(ranges)); 5774 DCHECK_EQ(0, negated_ranges->length()); 5775 int range_count = ranges->length(); 5776 uc16 from = 0; 5777 int i = 0; 5778 if (range_count > 0 && ranges->at(0).from() == 0) { 5779 from = ranges->at(0).to(); 5780 i = 1; 5781 } 5782 while (i < range_count) { 5783 CharacterRange range = ranges->at(i); 5784 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone); 5785 from = range.to(); 5786 i++; 5787 } 5788 if (from < String::kMaxUtf16CodeUnit) { 5789 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit), 5790 zone); 5791 } 5792 } 5793 5794 5795 // ------------------------------------------------------------------- 5796 // Splay tree 5797 5798 5799 OutSet* OutSet::Extend(unsigned value, Zone* zone) { 5800 if (Get(value)) 5801 return this; 5802 if (successors(zone) != NULL) { 5803 for (int i = 0; i < successors(zone)->length(); i++) { 5804 OutSet* successor = successors(zone)->at(i); 5805 if (successor->Get(value)) 5806 return successor; 5807 } 5808 } else { 5809 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 5810 } 5811 OutSet* result = new(zone) OutSet(first_, remaining_); 5812 result->Set(value, zone); 5813 successors(zone)->Add(result, zone); 5814 return result; 5815 } 5816 5817 5818 void OutSet::Set(unsigned value, Zone *zone) { 5819 if (value < kFirstLimit) { 5820 first_ |= (1 << value); 5821 } else { 5822 if (remaining_ == NULL) 5823 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 5824 if (remaining_->is_empty() || !remaining_->Contains(value)) 5825 remaining_->Add(value, zone); 5826 } 5827 } 5828 5829 5830 bool OutSet::Get(unsigned value) const { 5831 if (value < kFirstLimit) { 5832 return (first_ & (1 << value)) != 0; 5833 } else if (remaining_ == NULL) { 5834 return false; 5835 } else { 5836 return remaining_->Contains(value); 5837 } 5838 } 5839 5840 5841 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 5842 5843 5844 void DispatchTable::AddRange(CharacterRange full_range, int value, 5845 Zone* zone) { 5846 CharacterRange current = full_range; 5847 if (tree()->is_empty()) { 5848 // If this is the first range we just insert into the table. 5849 ZoneSplayTree<Config>::Locator loc; 5850 bool inserted = tree()->Insert(current.from(), &loc); 5851 DCHECK(inserted); 5852 USE(inserted); 5853 loc.set_value(Entry(current.from(), current.to(), 5854 empty()->Extend(value, zone))); 5855 return; 5856 } 5857 // First see if there is a range to the left of this one that 5858 // overlaps. 5859 ZoneSplayTree<Config>::Locator loc; 5860 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 5861 Entry* entry = &loc.value(); 5862 // If we've found a range that overlaps with this one, and it 5863 // starts strictly to the left of this one, we have to fix it 5864 // because the following code only handles ranges that start on 5865 // or after the start point of the range we're adding. 5866 if (entry->from() < current.from() && entry->to() >= current.from()) { 5867 // Snap the overlapping range in half around the start point of 5868 // the range we're adding. 5869 CharacterRange left(entry->from(), current.from() - 1); 5870 CharacterRange right(current.from(), entry->to()); 5871 // The left part of the overlapping range doesn't overlap. 5872 // Truncate the whole entry to be just the left part. 5873 entry->set_to(left.to()); 5874 // The right part is the one that overlaps. We add this part 5875 // to the map and let the next step deal with merging it with 5876 // the range we're adding. 5877 ZoneSplayTree<Config>::Locator loc; 5878 bool inserted = tree()->Insert(right.from(), &loc); 5879 DCHECK(inserted); 5880 USE(inserted); 5881 loc.set_value(Entry(right.from(), 5882 right.to(), 5883 entry->out_set())); 5884 } 5885 } 5886 while (current.is_valid()) { 5887 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 5888 (loc.value().from() <= current.to()) && 5889 (loc.value().to() >= current.from())) { 5890 Entry* entry = &loc.value(); 5891 // We have overlap. If there is space between the start point of 5892 // the range we're adding and where the overlapping range starts 5893 // then we have to add a range covering just that space. 5894 if (current.from() < entry->from()) { 5895 ZoneSplayTree<Config>::Locator ins; 5896 bool inserted = tree()->Insert(current.from(), &ins); 5897 DCHECK(inserted); 5898 USE(inserted); 5899 ins.set_value(Entry(current.from(), 5900 entry->from() - 1, 5901 empty()->Extend(value, zone))); 5902 current.set_from(entry->from()); 5903 } 5904 DCHECK_EQ(current.from(), entry->from()); 5905 // If the overlapping range extends beyond the one we want to add 5906 // we have to snap the right part off and add it separately. 5907 if (entry->to() > current.to()) { 5908 ZoneSplayTree<Config>::Locator ins; 5909 bool inserted = tree()->Insert(current.to() + 1, &ins); 5910 DCHECK(inserted); 5911 USE(inserted); 5912 ins.set_value(Entry(current.to() + 1, 5913 entry->to(), 5914 entry->out_set())); 5915 entry->set_to(current.to()); 5916 } 5917 DCHECK(entry->to() <= current.to()); 5918 // The overlapping range is now completely contained by the range 5919 // we're adding so we can just update it and move the start point 5920 // of the range we're adding just past it. 5921 entry->AddValue(value, zone); 5922 // Bail out if the last interval ended at 0xFFFF since otherwise 5923 // adding 1 will wrap around to 0. 5924 if (entry->to() == String::kMaxUtf16CodeUnit) 5925 break; 5926 DCHECK(entry->to() + 1 > current.from()); 5927 current.set_from(entry->to() + 1); 5928 } else { 5929 // There is no overlap so we can just add the range 5930 ZoneSplayTree<Config>::Locator ins; 5931 bool inserted = tree()->Insert(current.from(), &ins); 5932 DCHECK(inserted); 5933 USE(inserted); 5934 ins.set_value(Entry(current.from(), 5935 current.to(), 5936 empty()->Extend(value, zone))); 5937 break; 5938 } 5939 } 5940 } 5941 5942 5943 OutSet* DispatchTable::Get(uc16 value) { 5944 ZoneSplayTree<Config>::Locator loc; 5945 if (!tree()->FindGreatestLessThan(value, &loc)) 5946 return empty(); 5947 Entry* entry = &loc.value(); 5948 if (value <= entry->to()) 5949 return entry->out_set(); 5950 else 5951 return empty(); 5952 } 5953 5954 5955 // ------------------------------------------------------------------- 5956 // Analysis 5957 5958 5959 void Analysis::EnsureAnalyzed(RegExpNode* that) { 5960 StackLimitCheck check(isolate()); 5961 if (check.HasOverflowed()) { 5962 fail("Stack overflow"); 5963 return; 5964 } 5965 if (that->info()->been_analyzed || that->info()->being_analyzed) 5966 return; 5967 that->info()->being_analyzed = true; 5968 that->Accept(this); 5969 that->info()->being_analyzed = false; 5970 that->info()->been_analyzed = true; 5971 } 5972 5973 5974 void Analysis::VisitEnd(EndNode* that) { 5975 // nothing to do 5976 } 5977 5978 5979 void TextNode::CalculateOffsets() { 5980 int element_count = elements()->length(); 5981 // Set up the offsets of the elements relative to the start. This is a fixed 5982 // quantity since a TextNode can only contain fixed-width things. 5983 int cp_offset = 0; 5984 for (int i = 0; i < element_count; i++) { 5985 TextElement& elm = elements()->at(i); 5986 elm.set_cp_offset(cp_offset); 5987 cp_offset += elm.length(); 5988 } 5989 } 5990 5991 5992 void Analysis::VisitText(TextNode* that) { 5993 if (ignore_case_) { 5994 that->MakeCaseIndependent(isolate(), is_one_byte_); 5995 } 5996 EnsureAnalyzed(that->on_success()); 5997 if (!has_failed()) { 5998 that->CalculateOffsets(); 5999 } 6000 } 6001 6002 6003 void Analysis::VisitAction(ActionNode* that) { 6004 RegExpNode* target = that->on_success(); 6005 EnsureAnalyzed(target); 6006 if (!has_failed()) { 6007 // If the next node is interested in what it follows then this node 6008 // has to be interested too so it can pass the information on. 6009 that->info()->AddFromFollowing(target->info()); 6010 } 6011 } 6012 6013 6014 void Analysis::VisitChoice(ChoiceNode* that) { 6015 NodeInfo* info = that->info(); 6016 for (int i = 0; i < that->alternatives()->length(); i++) { 6017 RegExpNode* node = that->alternatives()->at(i).node(); 6018 EnsureAnalyzed(node); 6019 if (has_failed()) return; 6020 // Anything the following nodes need to know has to be known by 6021 // this node also, so it can pass it on. 6022 info->AddFromFollowing(node->info()); 6023 } 6024 } 6025 6026 6027 void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 6028 NodeInfo* info = that->info(); 6029 for (int i = 0; i < that->alternatives()->length(); i++) { 6030 RegExpNode* node = that->alternatives()->at(i).node(); 6031 if (node != that->loop_node()) { 6032 EnsureAnalyzed(node); 6033 if (has_failed()) return; 6034 info->AddFromFollowing(node->info()); 6035 } 6036 } 6037 // Check the loop last since it may need the value of this node 6038 // to get a correct result. 6039 EnsureAnalyzed(that->loop_node()); 6040 if (!has_failed()) { 6041 info->AddFromFollowing(that->loop_node()->info()); 6042 } 6043 } 6044 6045 6046 void Analysis::VisitBackReference(BackReferenceNode* that) { 6047 EnsureAnalyzed(that->on_success()); 6048 } 6049 6050 6051 void Analysis::VisitAssertion(AssertionNode* that) { 6052 EnsureAnalyzed(that->on_success()); 6053 } 6054 6055 6056 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6057 BoyerMooreLookahead* bm, 6058 bool not_at_start) { 6059 // Working out the set of characters that a backreference can match is too 6060 // hard, so we just say that any character can match. 6061 bm->SetRest(offset); 6062 SaveBMInfo(bm, not_at_start, offset); 6063 } 6064 6065 6066 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6067 RegExpMacroAssembler::kTableSize); 6068 6069 6070 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6071 BoyerMooreLookahead* bm, bool not_at_start) { 6072 ZoneList<GuardedAlternative>* alts = alternatives(); 6073 budget = (budget - 1) / alts->length(); 6074 for (int i = 0; i < alts->length(); i++) { 6075 GuardedAlternative& alt = alts->at(i); 6076 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6077 bm->SetRest(offset); // Give up trying to fill in info. 6078 SaveBMInfo(bm, not_at_start, offset); 6079 return; 6080 } 6081 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6082 } 6083 SaveBMInfo(bm, not_at_start, offset); 6084 } 6085 6086 6087 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6088 BoyerMooreLookahead* bm, bool not_at_start) { 6089 if (initial_offset >= bm->length()) return; 6090 int offset = initial_offset; 6091 int max_char = bm->max_char(); 6092 for (int i = 0; i < elements()->length(); i++) { 6093 if (offset >= bm->length()) { 6094 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6095 return; 6096 } 6097 TextElement text = elements()->at(i); 6098 if (text.text_type() == TextElement::ATOM) { 6099 RegExpAtom* atom = text.atom(); 6100 for (int j = 0; j < atom->length(); j++, offset++) { 6101 if (offset >= bm->length()) { 6102 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6103 return; 6104 } 6105 uc16 character = atom->data()[j]; 6106 if (bm->compiler()->ignore_case()) { 6107 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6108 int length = GetCaseIndependentLetters( 6109 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 6110 chars); 6111 for (int j = 0; j < length; j++) { 6112 bm->Set(offset, chars[j]); 6113 } 6114 } else { 6115 if (character <= max_char) bm->Set(offset, character); 6116 } 6117 } 6118 } else { 6119 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6120 RegExpCharacterClass* char_class = text.char_class(); 6121 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6122 if (char_class->is_negated()) { 6123 bm->SetAll(offset); 6124 } else { 6125 for (int k = 0; k < ranges->length(); k++) { 6126 CharacterRange& range = ranges->at(k); 6127 if (range.from() > max_char) continue; 6128 int to = Min(max_char, static_cast<int>(range.to())); 6129 bm->SetInterval(offset, Interval(range.from(), to)); 6130 } 6131 } 6132 offset++; 6133 } 6134 } 6135 if (offset >= bm->length()) { 6136 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6137 return; 6138 } 6139 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6140 true); // Not at start after a text node. 6141 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6142 } 6143 6144 6145 // ------------------------------------------------------------------- 6146 // Dispatch table construction 6147 6148 6149 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6150 AddRange(CharacterRange::Everything()); 6151 } 6152 6153 6154 void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 6155 node->set_being_calculated(true); 6156 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 6157 for (int i = 0; i < alternatives->length(); i++) { 6158 set_choice_index(i); 6159 alternatives->at(i).node()->Accept(this); 6160 } 6161 node->set_being_calculated(false); 6162 } 6163 6164 6165 class AddDispatchRange { 6166 public: 6167 explicit AddDispatchRange(DispatchTableConstructor* constructor) 6168 : constructor_(constructor) { } 6169 void Call(uc32 from, DispatchTable::Entry entry); 6170 private: 6171 DispatchTableConstructor* constructor_; 6172 }; 6173 6174 6175 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 6176 CharacterRange range(from, entry.to()); 6177 constructor_->AddRange(range); 6178 } 6179 6180 6181 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 6182 if (node->being_calculated()) 6183 return; 6184 DispatchTable* table = node->GetTable(ignore_case_); 6185 AddDispatchRange adder(this); 6186 table->ForEach(&adder); 6187 } 6188 6189 6190 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 6191 // TODO(160): Find the node that we refer back to and propagate its start 6192 // set back to here. For now we just accept anything. 6193 AddRange(CharacterRange::Everything()); 6194 } 6195 6196 6197 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 6198 RegExpNode* target = that->on_success(); 6199 target->Accept(this); 6200 } 6201 6202 6203 static int CompareRangeByFrom(const CharacterRange* a, 6204 const CharacterRange* b) { 6205 return Compare<uc16>(a->from(), b->from()); 6206 } 6207 6208 6209 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 6210 ranges->Sort(CompareRangeByFrom); 6211 uc16 last = 0; 6212 for (int i = 0; i < ranges->length(); i++) { 6213 CharacterRange range = ranges->at(i); 6214 if (last < range.from()) 6215 AddRange(CharacterRange(last, range.from() - 1)); 6216 if (range.to() >= last) { 6217 if (range.to() == String::kMaxUtf16CodeUnit) { 6218 return; 6219 } else { 6220 last = range.to() + 1; 6221 } 6222 } 6223 } 6224 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit)); 6225 } 6226 6227 6228 void DispatchTableConstructor::VisitText(TextNode* that) { 6229 TextElement elm = that->elements()->at(0); 6230 switch (elm.text_type()) { 6231 case TextElement::ATOM: { 6232 uc16 c = elm.atom()->data()[0]; 6233 AddRange(CharacterRange(c, c)); 6234 break; 6235 } 6236 case TextElement::CHAR_CLASS: { 6237 RegExpCharacterClass* tree = elm.char_class(); 6238 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 6239 if (tree->is_negated()) { 6240 AddInverse(ranges); 6241 } else { 6242 for (int i = 0; i < ranges->length(); i++) 6243 AddRange(ranges->at(i)); 6244 } 6245 break; 6246 } 6247 default: { 6248 UNIMPLEMENTED(); 6249 } 6250 } 6251 } 6252 6253 6254 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6255 RegExpNode* target = that->on_success(); 6256 target->Accept(this); 6257 } 6258 6259 6260 RegExpEngine::CompilationResult RegExpEngine::Compile( 6261 Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case, 6262 bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern, 6263 Handle<String> sample_subject, bool is_one_byte) { 6264 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6265 return IrregexpRegExpTooBig(isolate); 6266 } 6267 RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case, 6268 is_one_byte); 6269 6270 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6271 6272 // Sample some characters from the middle of the string. 6273 static const int kSampleSize = 128; 6274 6275 sample_subject = String::Flatten(sample_subject); 6276 int chars_sampled = 0; 6277 int half_way = (sample_subject->length() - kSampleSize) / 2; 6278 for (int i = Max(0, half_way); 6279 i < sample_subject->length() && chars_sampled < kSampleSize; 6280 i++, chars_sampled++) { 6281 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6282 } 6283 6284 // Wrap the body of the regexp in capture #0. 6285 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6286 0, 6287 &compiler, 6288 compiler.accept()); 6289 RegExpNode* node = captured_body; 6290 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6291 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6292 int max_length = data->tree->max_match(); 6293 if (!is_start_anchored && !is_sticky) { 6294 // Add a .*? at the beginning, outside the body capture, unless 6295 // this expression is anchored at the beginning or sticky. 6296 RegExpNode* loop_node = RegExpQuantifier::ToNode( 6297 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), 6298 &compiler, captured_body, data->contains_anchor); 6299 6300 if (data->contains_anchor) { 6301 // Unroll loop once, to take care of the case that might start 6302 // at the start of input. 6303 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6304 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6305 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6306 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6307 node = first_step_node; 6308 } else { 6309 node = loop_node; 6310 } 6311 } 6312 if (is_one_byte) { 6313 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6314 // Do it again to propagate the new nodes to places where they were not 6315 // put because they had not been calculated yet. 6316 if (node != NULL) { 6317 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6318 } 6319 } 6320 6321 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6322 data->node = node; 6323 Analysis analysis(isolate, ignore_case, is_one_byte); 6324 analysis.EnsureAnalyzed(node); 6325 if (analysis.has_failed()) { 6326 const char* error_message = analysis.error_message(); 6327 return CompilationResult(isolate, error_message); 6328 } 6329 6330 // Create the correct assembler for the architecture. 6331 #ifndef V8_INTERPRETED_REGEXP 6332 // Native regexp implementation. 6333 6334 NativeRegExpMacroAssembler::Mode mode = 6335 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 6336 : NativeRegExpMacroAssembler::UC16; 6337 6338 #if V8_TARGET_ARCH_IA32 6339 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode, 6340 (data->capture_count + 1) * 2); 6341 #elif V8_TARGET_ARCH_X64 6342 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode, 6343 (data->capture_count + 1) * 2); 6344 #elif V8_TARGET_ARCH_ARM 6345 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode, 6346 (data->capture_count + 1) * 2); 6347 #elif V8_TARGET_ARCH_ARM64 6348 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode, 6349 (data->capture_count + 1) * 2); 6350 #elif V8_TARGET_ARCH_PPC 6351 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode, 6352 (data->capture_count + 1) * 2); 6353 #elif V8_TARGET_ARCH_MIPS 6354 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6355 (data->capture_count + 1) * 2); 6356 #elif V8_TARGET_ARCH_MIPS64 6357 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6358 (data->capture_count + 1) * 2); 6359 #elif V8_TARGET_ARCH_X87 6360 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode, 6361 (data->capture_count + 1) * 2); 6362 #else 6363 #error "Unsupported architecture" 6364 #endif 6365 6366 #else // V8_INTERPRETED_REGEXP 6367 // Interpreted regexp implementation. 6368 EmbeddedVector<byte, 1024> codes; 6369 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone); 6370 #endif // V8_INTERPRETED_REGEXP 6371 6372 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern)); 6373 6374 // Inserted here, instead of in Assembler, because it depends on information 6375 // in the AST that isn't replicated in the Node structure. 6376 static const int kMaxBacksearchLimit = 1024; 6377 if (is_end_anchored && 6378 !is_start_anchored && 6379 max_length < kMaxBacksearchLimit) { 6380 macro_assembler.SetCurrentPositionFromEnd(max_length); 6381 } 6382 6383 if (is_global) { 6384 macro_assembler.set_global_mode( 6385 (data->tree->min_match() > 0) 6386 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK 6387 : RegExpMacroAssembler::GLOBAL); 6388 } 6389 6390 return compiler.Assemble(¯o_assembler, 6391 node, 6392 data->capture_count, 6393 pattern); 6394 } 6395 6396 6397 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) { 6398 Heap* heap = pattern->GetHeap(); 6399 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6400 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6401 heap->isolate()->memory_allocator()->SizeExecutable() > 6402 RegExpImpl::kRegExpExecutableMemoryLimit) { 6403 too_much = true; 6404 } 6405 return too_much; 6406 } 6407 6408 6409 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string, 6410 Object* key_pattern, 6411 FixedArray** last_match_cache, 6412 ResultsCacheType type) { 6413 FixedArray* cache; 6414 if (!key_string->IsInternalizedString()) return Smi::FromInt(0); 6415 if (type == STRING_SPLIT_SUBSTRINGS) { 6416 DCHECK(key_pattern->IsString()); 6417 if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0); 6418 cache = heap->string_split_cache(); 6419 } else { 6420 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6421 DCHECK(key_pattern->IsFixedArray()); 6422 cache = heap->regexp_multiple_cache(); 6423 } 6424 6425 uint32_t hash = key_string->Hash(); 6426 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6427 ~(kArrayEntriesPerCacheEntry - 1)); 6428 if (cache->get(index + kStringOffset) != key_string || 6429 cache->get(index + kPatternOffset) != key_pattern) { 6430 index = 6431 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6432 if (cache->get(index + kStringOffset) != key_string || 6433 cache->get(index + kPatternOffset) != key_pattern) { 6434 return Smi::FromInt(0); 6435 } 6436 } 6437 6438 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset)); 6439 return cache->get(index + kArrayOffset); 6440 } 6441 6442 6443 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, 6444 Handle<Object> key_pattern, 6445 Handle<FixedArray> value_array, 6446 Handle<FixedArray> last_match_cache, 6447 ResultsCacheType type) { 6448 Factory* factory = isolate->factory(); 6449 Handle<FixedArray> cache; 6450 if (!key_string->IsInternalizedString()) return; 6451 if (type == STRING_SPLIT_SUBSTRINGS) { 6452 DCHECK(key_pattern->IsString()); 6453 if (!key_pattern->IsInternalizedString()) return; 6454 cache = factory->string_split_cache(); 6455 } else { 6456 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6457 DCHECK(key_pattern->IsFixedArray()); 6458 cache = factory->regexp_multiple_cache(); 6459 } 6460 6461 uint32_t hash = key_string->Hash(); 6462 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6463 ~(kArrayEntriesPerCacheEntry - 1)); 6464 if (cache->get(index + kStringOffset) == Smi::FromInt(0)) { 6465 cache->set(index + kStringOffset, *key_string); 6466 cache->set(index + kPatternOffset, *key_pattern); 6467 cache->set(index + kArrayOffset, *value_array); 6468 cache->set(index + kLastMatchOffset, *last_match_cache); 6469 } else { 6470 uint32_t index2 = 6471 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6472 if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) { 6473 cache->set(index2 + kStringOffset, *key_string); 6474 cache->set(index2 + kPatternOffset, *key_pattern); 6475 cache->set(index2 + kArrayOffset, *value_array); 6476 cache->set(index2 + kLastMatchOffset, *last_match_cache); 6477 } else { 6478 cache->set(index2 + kStringOffset, Smi::FromInt(0)); 6479 cache->set(index2 + kPatternOffset, Smi::FromInt(0)); 6480 cache->set(index2 + kArrayOffset, Smi::FromInt(0)); 6481 cache->set(index2 + kLastMatchOffset, Smi::FromInt(0)); 6482 cache->set(index + kStringOffset, *key_string); 6483 cache->set(index + kPatternOffset, *key_pattern); 6484 cache->set(index + kArrayOffset, *value_array); 6485 cache->set(index + kLastMatchOffset, *last_match_cache); 6486 } 6487 } 6488 // If the array is a reasonably short list of substrings, convert it into a 6489 // list of internalized strings. 6490 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { 6491 for (int i = 0; i < value_array->length(); i++) { 6492 Handle<String> str(String::cast(value_array->get(i)), isolate); 6493 Handle<String> internalized_str = factory->InternalizeString(str); 6494 value_array->set(i, *internalized_str); 6495 } 6496 } 6497 // Convert backing store to a copy-on-write array. 6498 value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map()); 6499 } 6500 6501 6502 void RegExpResultsCache::Clear(FixedArray* cache) { 6503 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6504 cache->set(i, Smi::FromInt(0)); 6505 } 6506 } 6507 6508 } // namespace internal 6509 } // namespace v8 6510