Home | History | Annotate | Download | only in regexp
      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/regexp/jsregexp.h"
      6 
      7 #include <memory>
      8 #include <vector>
      9 
     10 #include "src/base/platform/platform.h"
     11 #include "src/code-tracer.h"
     12 #include "src/compilation-cache.h"
     13 #include "src/elements.h"
     14 #include "src/execution.h"
     15 #include "src/heap/factory.h"
     16 #include "src/isolate-inl.h"
     17 #include "src/messages.h"
     18 #include "src/ostreams.h"
     19 #include "src/regexp/interpreter-irregexp.h"
     20 #include "src/regexp/jsregexp-inl.h"
     21 #include "src/regexp/regexp-macro-assembler-irregexp.h"
     22 #include "src/regexp/regexp-macro-assembler-tracer.h"
     23 #include "src/regexp/regexp-macro-assembler.h"
     24 #include "src/regexp/regexp-parser.h"
     25 #include "src/regexp/regexp-stack.h"
     26 #include "src/runtime/runtime.h"
     27 #include "src/splay-tree-inl.h"
     28 #include "src/string-search.h"
     29 #include "src/unicode-decoder.h"
     30 #include "src/unicode-inl.h"
     31 
     32 #ifdef V8_INTL_SUPPORT
     33 #include "unicode/uniset.h"
     34 #include "unicode/utypes.h"
     35 #endif  // V8_INTL_SUPPORT
     36 
     37 #ifndef V8_INTERPRETED_REGEXP
     38 #if V8_TARGET_ARCH_IA32
     39 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
     40 #elif V8_TARGET_ARCH_X64
     41 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
     42 #elif V8_TARGET_ARCH_ARM64
     43 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
     44 #elif V8_TARGET_ARCH_ARM
     45 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
     46 #elif V8_TARGET_ARCH_PPC
     47 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
     48 #elif V8_TARGET_ARCH_S390
     49 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
     50 #elif V8_TARGET_ARCH_MIPS
     51 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
     52 #elif V8_TARGET_ARCH_MIPS64
     53 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
     54 #else
     55 #error Unsupported target architecture.
     56 #endif
     57 #endif
     58 
     59 
     60 namespace v8 {
     61 namespace internal {
     62 
     63 V8_WARN_UNUSED_RESULT
     64 static inline MaybeHandle<Object> ThrowRegExpException(
     65     Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
     66     Handle<String> error_text) {
     67   THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
     68                                           pattern, error_text),
     69                   Object);
     70 }
     71 
     72 inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
     73                                  Handle<String> error_text) {
     74   USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
     75                            error_text));
     76 }
     77 
     78 
     79 ContainedInLattice AddRange(ContainedInLattice containment,
     80                             const int* ranges,
     81                             int ranges_length,
     82                             Interval new_range) {
     83   DCHECK_EQ(1, ranges_length & 1);
     84   DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
     85   if (containment == kLatticeUnknown) return containment;
     86   bool inside = false;
     87   int last = 0;
     88   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
     89     // Consider the range from last to ranges[i].
     90     // We haven't got to the new range yet.
     91     if (ranges[i] <= new_range.from()) continue;
     92     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
     93     // inclusive, but the values in ranges are not.
     94     if (last <= new_range.from() && new_range.to() < ranges[i]) {
     95       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
     96     }
     97     return kLatticeUnknown;
     98   }
     99   return containment;
    100 }
    101 
    102 // More makes code generation slower, less makes V8 benchmark score lower.
    103 const int kMaxLookaheadForBoyerMoore = 8;
    104 // In a 3-character pattern you can maximally step forwards 3 characters
    105 // at a time, which is not always enough to pay for the extra logic.
    106 const int kPatternTooShortForBoyerMoore = 2;
    107 
    108 // Identifies the sort of regexps where the regexp engine is faster
    109 // than the code used for atom matches.
    110 static bool HasFewDifferentCharacters(Handle<String> pattern) {
    111   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
    112   if (length <= kPatternTooShortForBoyerMoore) return false;
    113   const int kMod = 128;
    114   bool character_found[kMod];
    115   int different = 0;
    116   memset(&character_found[0], 0, sizeof(character_found));
    117   for (int i = 0; i < length; i++) {
    118     int ch = (pattern->Get(i) & (kMod - 1));
    119     if (!character_found[ch]) {
    120       character_found[ch] = true;
    121       different++;
    122       // We declare a regexp low-alphabet if it has at least 3 times as many
    123       // characters as it has different characters.
    124       if (different * 3 > length) return false;
    125     }
    126   }
    127   return true;
    128 }
    129 
    130 // Generic RegExp methods. Dispatches to implementation specific methods.
    131 
    132 MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
    133                                         Handle<String> pattern,
    134                                         JSRegExp::Flags flags) {
    135   DCHECK(pattern->IsFlat());
    136 
    137   Zone zone(isolate->allocator(), ZONE_NAME);
    138   CompilationCache* compilation_cache = isolate->compilation_cache();
    139   MaybeHandle<FixedArray> maybe_cached =
    140       compilation_cache->LookupRegExp(pattern, flags);
    141   Handle<FixedArray> cached;
    142   if (maybe_cached.ToHandle(&cached)) {
    143     re->set_data(*cached);
    144     return re;
    145   }
    146 
    147   PostponeInterruptsScope postpone(isolate);
    148   RegExpCompileData parse_result;
    149   FlatStringReader reader(isolate, pattern);
    150   DCHECK(!isolate->has_pending_exception());
    151   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
    152                                  &parse_result)) {
    153     // Throw an exception if we fail to parse the pattern.
    154     return ThrowRegExpException(isolate, re, pattern, parse_result.error);
    155   }
    156 
    157   bool has_been_compiled = false;
    158 
    159   if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
    160       !HasFewDifferentCharacters(pattern)) {
    161     // Parse-tree is a single atom that is equal to the pattern.
    162     AtomCompile(isolate, re, pattern, flags, pattern);
    163     has_been_compiled = true;
    164   } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
    165              parse_result.capture_count == 0) {
    166     RegExpAtom* atom = parse_result.tree->AsAtom();
    167     Vector<const uc16> atom_pattern = atom->data();
    168     Handle<String> atom_string;
    169     ASSIGN_RETURN_ON_EXCEPTION(
    170         isolate, atom_string,
    171         isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
    172     if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
    173       AtomCompile(isolate, re, pattern, flags, atom_string);
    174       has_been_compiled = true;
    175     }
    176   }
    177   if (!has_been_compiled) {
    178     IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
    179   }
    180   DCHECK(re->data()->IsFixedArray());
    181   // Compilation succeeded so the data is set on the regexp
    182   // and we can store it in the cache.
    183   Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
    184   compilation_cache->PutRegExp(pattern, flags, data);
    185 
    186   return re;
    187 }
    188 
    189 MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
    190                                      Handle<String> subject, int index,
    191                                      Handle<RegExpMatchInfo> last_match_info) {
    192   switch (regexp->TypeTag()) {
    193     case JSRegExp::ATOM:
    194       return AtomExec(isolate, regexp, subject, index, last_match_info);
    195     case JSRegExp::IRREGEXP: {
    196       return IrregexpExec(isolate, regexp, subject, index, last_match_info);
    197     }
    198     default:
    199       UNREACHABLE();
    200   }
    201 }
    202 
    203 
    204 // RegExp Atom implementation: Simple string search using indexOf.
    205 
    206 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
    207                              Handle<String> pattern, JSRegExp::Flags flags,
    208                              Handle<String> match_pattern) {
    209   isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
    210                                         match_pattern);
    211 }
    212 
    213 static void SetAtomLastCapture(Isolate* isolate,
    214                                Handle<RegExpMatchInfo> last_match_info,
    215                                String* subject, int from, int to) {
    216   SealHandleScope shs(isolate);
    217   last_match_info->SetNumberOfCaptureRegisters(2);
    218   last_match_info->SetLastSubject(subject);
    219   last_match_info->SetLastInput(subject);
    220   last_match_info->SetCapture(0, from);
    221   last_match_info->SetCapture(1, to);
    222 }
    223 
    224 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
    225                             Handle<String> subject, int index, int32_t* output,
    226                             int output_size) {
    227   DCHECK_LE(0, index);
    228   DCHECK_LE(index, subject->length());
    229 
    230   subject = String::Flatten(isolate, subject);
    231   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
    232 
    233   String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
    234   int needle_len = needle->length();
    235   DCHECK(needle->IsFlat());
    236   DCHECK_LT(0, needle_len);
    237 
    238   if (index + needle_len > subject->length()) {
    239     return RegExpImpl::RE_FAILURE;
    240   }
    241 
    242   for (int i = 0; i < output_size; i += 2) {
    243     String::FlatContent needle_content = needle->GetFlatContent();
    244     String::FlatContent subject_content = subject->GetFlatContent();
    245     DCHECK(needle_content.IsFlat());
    246     DCHECK(subject_content.IsFlat());
    247     // dispatch on type of strings
    248     index =
    249         (needle_content.IsOneByte()
    250              ? (subject_content.IsOneByte()
    251                     ? SearchString(isolate, subject_content.ToOneByteVector(),
    252                                    needle_content.ToOneByteVector(), index)
    253                     : SearchString(isolate, subject_content.ToUC16Vector(),
    254                                    needle_content.ToOneByteVector(), index))
    255              : (subject_content.IsOneByte()
    256                     ? SearchString(isolate, subject_content.ToOneByteVector(),
    257                                    needle_content.ToUC16Vector(), index)
    258                     : SearchString(isolate, subject_content.ToUC16Vector(),
    259                                    needle_content.ToUC16Vector(), index)));
    260     if (index == -1) {
    261       return i / 2;  // Return number of matches.
    262     } else {
    263       output[i] = index;
    264       output[i+1] = index + needle_len;
    265       index += needle_len;
    266     }
    267   }
    268   return output_size / 2;
    269 }
    270 
    271 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
    272                                     Handle<String> subject, int index,
    273                                     Handle<RegExpMatchInfo> last_match_info) {
    274   static const int kNumRegisters = 2;
    275   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
    276   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
    277 
    278   int res =
    279       AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
    280 
    281   if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
    282 
    283   DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
    284   SealHandleScope shs(isolate);
    285   SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
    286                      output_registers[1]);
    287   return last_match_info;
    288 }
    289 
    290 
    291 // Irregexp implementation.
    292 
    293 // Ensures that the regexp object contains a compiled version of the
    294 // source for either one-byte or two-byte subject strings.
    295 // If the compiled version doesn't already exist, it is compiled
    296 // from the source pattern.
    297 // If compilation fails, an exception is thrown and this function
    298 // returns false.
    299 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
    300                                         Handle<String> sample_subject,
    301                                         bool is_one_byte) {
    302   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
    303 #ifdef V8_INTERPRETED_REGEXP
    304   if (compiled_code->IsByteArray()) return true;
    305 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
    306   if (compiled_code->IsCode()) return true;
    307 #endif
    308   return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
    309 }
    310 
    311 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
    312                                  Handle<String> sample_subject,
    313                                  bool is_one_byte) {
    314   // Compile the RegExp.
    315   Zone zone(isolate->allocator(), ZONE_NAME);
    316   PostponeInterruptsScope postpone(isolate);
    317 #ifdef DEBUG
    318   Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
    319   // When arriving here entry can only be a smi representing an uncompiled
    320   // regexp.
    321   DCHECK(entry->IsSmi());
    322   int entry_value = Smi::ToInt(entry);
    323   DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
    324 #endif
    325 
    326   JSRegExp::Flags flags = re->GetFlags();
    327 
    328   Handle<String> pattern(re->Pattern(), isolate);
    329   pattern = String::Flatten(isolate, pattern);
    330   RegExpCompileData compile_data;
    331   FlatStringReader reader(isolate, pattern);
    332   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
    333                                  &compile_data)) {
    334     // Throw an exception if we fail to parse the pattern.
    335     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
    336     USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
    337     return false;
    338   }
    339   RegExpEngine::CompilationResult result =
    340       RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
    341                             sample_subject, is_one_byte);
    342   if (result.error_message != nullptr) {
    343     // Unable to compile regexp.
    344     if (FLAG_abort_on_stack_or_string_length_overflow &&
    345         strncmp(result.error_message, "Stack overflow", 15) == 0) {
    346       FATAL("Aborting on stack overflow");
    347     }
    348     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
    349         CStrVector(result.error_message)).ToHandleChecked();
    350     ThrowRegExpException(isolate, re, error_message);
    351     return false;
    352   }
    353 
    354   Handle<FixedArray> data =
    355       Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
    356   data->set(JSRegExp::code_index(is_one_byte), result.code);
    357   SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
    358   int register_max = IrregexpMaxRegisterCount(*data);
    359   if (result.num_registers > register_max) {
    360     SetIrregexpMaxRegisterCount(*data, result.num_registers);
    361   }
    362 
    363   return true;
    364 }
    365 
    366 
    367 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
    368   return Smi::cast(
    369       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    370 }
    371 
    372 
    373 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
    374   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
    375 }
    376 
    377 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
    378                                            Handle<FixedArray> value) {
    379   if (value.is_null()) {
    380     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
    381   } else {
    382     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
    383   }
    384 }
    385 
    386 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
    387   return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
    388 }
    389 
    390 
    391 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
    392   return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
    393 }
    394 
    395 
    396 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
    397   return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
    398 }
    399 
    400 
    401 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
    402   return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
    403 }
    404 
    405 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
    406                                     Handle<String> pattern,
    407                                     JSRegExp::Flags flags, int capture_count) {
    408   // Initialize compiled code entries to null.
    409   isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
    410                                             flags, capture_count);
    411 }
    412 
    413 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
    414                                 Handle<String> subject) {
    415   DCHECK(subject->IsFlat());
    416 
    417   // Check representation of the underlying storage.
    418   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
    419   if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
    420 
    421 #ifdef V8_INTERPRETED_REGEXP
    422   // Byte-code regexp needs space allocated for all its registers.
    423   // The result captures are copied to the start of the registers array
    424   // if the match succeeds.  This way those registers are not clobbered
    425   // when we set the last match info from last successful match.
    426   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
    427          (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
    428 #else  // V8_INTERPRETED_REGEXP
    429   // Native regexp only needs room to output captures. Registers are handled
    430   // internally.
    431   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
    432 #endif  // V8_INTERPRETED_REGEXP
    433 }
    434 
    435 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
    436                                 Handle<String> subject, int index,
    437                                 int32_t* output, int output_size) {
    438   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
    439 
    440   DCHECK_LE(0, index);
    441   DCHECK_LE(index, subject->length());
    442   DCHECK(subject->IsFlat());
    443 
    444   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
    445 
    446 #ifndef V8_INTERPRETED_REGEXP
    447   DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
    448   do {
    449     EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
    450     Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
    451     // The stack is used to allocate registers for the compiled regexp code.
    452     // This means that in case of failure, the output registers array is left
    453     // untouched and contains the capture results from the previous successful
    454     // match.  We can use that to set the last match info lazily.
    455     NativeRegExpMacroAssembler::Result res =
    456         NativeRegExpMacroAssembler::Match(code,
    457                                           subject,
    458                                           output,
    459                                           output_size,
    460                                           index,
    461                                           isolate);
    462     if (res != NativeRegExpMacroAssembler::RETRY) {
    463       DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
    464              isolate->has_pending_exception());
    465       STATIC_ASSERT(
    466           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
    467       STATIC_ASSERT(
    468           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
    469       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
    470                     == RE_EXCEPTION);
    471       return static_cast<IrregexpResult>(res);
    472     }
    473     // If result is RETRY, the string has changed representation, and we
    474     // must restart from scratch.
    475     // In this case, it means we must make sure we are prepared to handle
    476     // the, potentially, different subject (the string can switch between
    477     // being internal and external, and even between being Latin1 and UC16,
    478     // but the characters are always the same).
    479     IrregexpPrepare(isolate, regexp, subject);
    480     is_one_byte = subject->IsOneByteRepresentationUnderneath();
    481   } while (true);
    482   UNREACHABLE();
    483 #else  // V8_INTERPRETED_REGEXP
    484 
    485   DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
    486   // We must have done EnsureCompiledIrregexp, so we can get the number of
    487   // registers.
    488   int number_of_capture_registers =
    489       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
    490   int32_t* raw_output = &output[number_of_capture_registers];
    491   // We do not touch the actual capture result registers until we know there
    492   // has been a match so that we can use those capture results to set the
    493   // last match info.
    494   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
    495     raw_output[i] = -1;
    496   }
    497   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
    498                                isolate);
    499 
    500   IrregexpResult result = IrregexpInterpreter::Match(isolate,
    501                                                      byte_codes,
    502                                                      subject,
    503                                                      raw_output,
    504                                                      index);
    505   if (result == RE_SUCCESS) {
    506     // Copy capture results to the start of the registers array.
    507     MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
    508   }
    509   if (result == RE_EXCEPTION) {
    510     DCHECK(!isolate->has_pending_exception());
    511     isolate->StackOverflow();
    512   }
    513   return result;
    514 #endif  // V8_INTERPRETED_REGEXP
    515 }
    516 
    517 MaybeHandle<Object> RegExpImpl::IrregexpExec(
    518     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
    519     int previous_index, Handle<RegExpMatchInfo> last_match_info) {
    520   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
    521 
    522   subject = String::Flatten(isolate, subject);
    523 
    524   // Prepare space for the return values.
    525 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
    526   if (FLAG_trace_regexp_bytecodes) {
    527     String* pattern = regexp->Pattern();
    528     PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
    529     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
    530   }
    531 #endif
    532   int required_registers =
    533       RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
    534   if (required_registers < 0) {
    535     // Compiling failed with an exception.
    536     DCHECK(isolate->has_pending_exception());
    537     return MaybeHandle<Object>();
    538   }
    539 
    540   int32_t* output_registers = nullptr;
    541   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    542     output_registers = NewArray<int32_t>(required_registers);
    543   }
    544   std::unique_ptr<int32_t[]> auto_release(output_registers);
    545   if (output_registers == nullptr) {
    546     output_registers = isolate->jsregexp_static_offsets_vector();
    547   }
    548 
    549   int res =
    550       RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
    551                                   output_registers, required_registers);
    552   if (res == RE_SUCCESS) {
    553     int capture_count =
    554         IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
    555     return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
    556                             output_registers);
    557   }
    558   if (res == RE_EXCEPTION) {
    559     DCHECK(isolate->has_pending_exception());
    560     return MaybeHandle<Object>();
    561   }
    562   DCHECK(res == RE_FAILURE);
    563   return isolate->factory()->null_value();
    564 }
    565 
    566 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
    567     Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
    568     Handle<String> subject, int capture_count, int32_t* match) {
    569   // This is the only place where match infos can grow. If, after executing the
    570   // regexp, RegExpExecStub finds that the match info is too small, it restarts
    571   // execution in RegExpImpl::Exec, which finally grows the match info right
    572   // here.
    573 
    574   int capture_register_count = (capture_count + 1) * 2;
    575   Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
    576       isolate, last_match_info, capture_register_count);
    577   result->SetNumberOfCaptureRegisters(capture_register_count);
    578 
    579   if (*result != *last_match_info) {
    580     // The match info has been reallocated, update the corresponding reference
    581     // on the native context.
    582     if (*last_match_info == *isolate->regexp_last_match_info()) {
    583       isolate->native_context()->set_regexp_last_match_info(*result);
    584     } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
    585       isolate->native_context()->set_regexp_internal_match_info(*result);
    586     }
    587   }
    588 
    589   DisallowHeapAllocation no_allocation;
    590   if (match != nullptr) {
    591     for (int i = 0; i < capture_register_count; i += 2) {
    592       result->SetCapture(i, match[i]);
    593       result->SetCapture(i + 1, match[i + 1]);
    594     }
    595   }
    596   result->SetLastSubject(*subject);
    597   result->SetLastInput(*subject);
    598   return result;
    599 }
    600 
    601 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
    602                                      Handle<String> subject, Isolate* isolate)
    603     : register_array_(nullptr),
    604       register_array_size_(0),
    605       regexp_(regexp),
    606       subject_(subject),
    607       isolate_(isolate) {
    608 #ifdef V8_INTERPRETED_REGEXP
    609   bool interpreted = true;
    610 #else
    611   bool interpreted = false;
    612 #endif  // V8_INTERPRETED_REGEXP
    613 
    614   if (regexp_->TypeTag() == JSRegExp::ATOM) {
    615     static const int kAtomRegistersPerMatch = 2;
    616     registers_per_match_ = kAtomRegistersPerMatch;
    617     // There is no distinction between interpreted and native for atom regexps.
    618     interpreted = false;
    619   } else {
    620     registers_per_match_ =
    621         RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
    622     if (registers_per_match_ < 0) {
    623       num_matches_ = -1;  // Signal exception.
    624       return;
    625     }
    626   }
    627 
    628   DCHECK(IsGlobal(regexp->GetFlags()));
    629   if (!interpreted) {
    630     register_array_size_ =
    631         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
    632     max_matches_ = register_array_size_ / registers_per_match_;
    633   } else {
    634     // Global loop in interpreted regexp is not implemented.  We choose
    635     // the size of the offsets vector so that it can only store one match.
    636     register_array_size_ = registers_per_match_;
    637     max_matches_ = 1;
    638   }
    639 
    640   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    641     register_array_ = NewArray<int32_t>(register_array_size_);
    642   } else {
    643     register_array_ = isolate->jsregexp_static_offsets_vector();
    644   }
    645 
    646   // Set state so that fetching the results the first time triggers a call
    647   // to the compiled regexp.
    648   current_match_index_ = max_matches_ - 1;
    649   num_matches_ = max_matches_;
    650   DCHECK_LE(2, registers_per_match_);  // Each match has at least one capture.
    651   DCHECK_GE(register_array_size_, registers_per_match_);
    652   int32_t* last_match =
    653       &register_array_[current_match_index_ * registers_per_match_];
    654   last_match[0] = -1;
    655   last_match[1] = 0;
    656 }
    657 
    658 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
    659   if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
    660       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
    661       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
    662     // Advance over the surrogate pair.
    663     return last_index + 2;
    664   }
    665   return last_index + 1;
    666 }
    667 
    668 // -------------------------------------------------------------------
    669 // Implementation of the Irregexp regular expression engine.
    670 //
    671 // The Irregexp regular expression engine is intended to be a complete
    672 // implementation of ECMAScript regular expressions.  It generates either
    673 // bytecodes or native code.
    674 
    675 //   The Irregexp regexp engine is structured in three steps.
    676 //   1) The parser generates an abstract syntax tree.  See ast.cc.
    677 //   2) From the AST a node network is created.  The nodes are all
    678 //      subclasses of RegExpNode.  The nodes represent states when
    679 //      executing a regular expression.  Several optimizations are
    680 //      performed on the node network.
    681 //   3) From the nodes we generate either byte codes or native code
    682 //      that can actually execute the regular expression (perform
    683 //      the search).  The code generation step is described in more
    684 //      detail below.
    685 
    686 // Code generation.
    687 //
    688 //   The nodes are divided into four main categories.
    689 //   * Choice nodes
    690 //        These represent places where the regular expression can
    691 //        match in more than one way.  For example on entry to an
    692 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
    693 //   * Action nodes
    694 //        These represent places where some action should be
    695 //        performed.  Examples include recording the current position
    696 //        in the input string to a register (in order to implement
    697 //        captures) or other actions on register for example in order
    698 //        to implement the counters needed for {} repetitions.
    699 //   * Matching nodes
    700 //        These attempt to match some element part of the input string.
    701 //        Examples of elements include character classes, plain strings
    702 //        or back references.
    703 //   * End nodes
    704 //        These are used to implement the actions required on finding
    705 //        a successful match or failing to find a match.
    706 //
    707 //   The code generated (whether as byte codes or native code) maintains
    708 //   some state as it runs.  This consists of the following elements:
    709 //
    710 //   * The capture registers.  Used for string captures.
    711 //   * Other registers.  Used for counters etc.
    712 //   * The current position.
    713 //   * The stack of backtracking information.  Used when a matching node
    714 //     fails to find a match and needs to try an alternative.
    715 //
    716 // Conceptual regular expression execution model:
    717 //
    718 //   There is a simple conceptual model of regular expression execution
    719 //   which will be presented first.  The actual code generated is a more
    720 //   efficient simulation of the simple conceptual model:
    721 //
    722 //   * Choice nodes are implemented as follows:
    723 //     For each choice except the last {
    724 //       push current position
    725 //       push backtrack code location
    726 //       <generate code to test for choice>
    727 //       backtrack code location:
    728 //       pop current position
    729 //     }
    730 //     <generate code to test for last choice>
    731 //
    732 //   * Actions nodes are generated as follows
    733 //     <push affected registers on backtrack stack>
    734 //     <generate code to perform action>
    735 //     push backtrack code location
    736 //     <generate code to test for following nodes>
    737 //     backtrack code location:
    738 //     <pop affected registers to restore their state>
    739 //     <pop backtrack location from stack and go to it>
    740 //
    741 //   * Matching nodes are generated as follows:
    742 //     if input string matches at current position
    743 //       update current position
    744 //       <generate code to test for following nodes>
    745 //     else
    746 //       <pop backtrack location from stack and go to it>
    747 //
    748 //   Thus it can be seen that the current position is saved and restored
    749 //   by the choice nodes, whereas the registers are saved and restored by
    750 //   by the action nodes that manipulate them.
    751 //
    752 //   The other interesting aspect of this model is that nodes are generated
    753 //   at the point where they are needed by a recursive call to Emit().  If
    754 //   the node has already been code generated then the Emit() call will
    755 //   generate a jump to the previously generated code instead.  In order to
    756 //   limit recursion it is possible for the Emit() function to put the node
    757 //   on a work list for later generation and instead generate a jump.  The
    758 //   destination of the jump is resolved later when the code is generated.
    759 //
    760 // Actual regular expression code generation.
    761 //
    762 //   Code generation is actually more complicated than the above.  In order
    763 //   to improve the efficiency of the generated code some optimizations are
    764 //   performed
    765 //
    766 //   * Choice nodes have 1-character lookahead.
    767 //     A choice node looks at the following character and eliminates some of
    768 //     the choices immediately based on that character.  This is not yet
    769 //     implemented.
    770 //   * Simple greedy loops store reduced backtracking information.
    771 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
    772 //     then need to backtrack to a point where it can match "foo".  The naive
    773 //     implementation of this would push each character position onto the
    774 //     backtracking stack, then pop them off one by one.  This would use space
    775 //     proportional to the length of the input string.  However since the "."
    776 //     can only match in one way and always has a constant length (in this case
    777 //     of 1) it suffices to store the current position on the top of the stack
    778 //     once.  Matching now becomes merely incrementing the current position and
    779 //     backtracking becomes decrementing the current position and checking the
    780 //     result against the stored current position.  This is faster and saves
    781 //     space.
    782 //   * The current state is virtualized.
    783 //     This is used to defer expensive operations until it is clear that they
    784 //     are needed and to generate code for a node more than once, allowing
    785 //     specialized an efficient versions of the code to be created. This is
    786 //     explained in the section below.
    787 //
    788 // Execution state virtualization.
    789 //
    790 //   Instead of emitting code, nodes that manipulate the state can record their
    791 //   manipulation in an object called the Trace.  The Trace object can record a
    792 //   current position offset, an optional backtrack code location on the top of
    793 //   the virtualized backtrack stack and some register changes.  When a node is
    794 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
    795 //   will emit code to bring the actual state into line with the virtual state.
    796 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
    797 //   registers).  Postponing work can save time when executing the regular
    798 //   expression since it may be found that the work never has to be done as a
    799 //   failure to match can occur.  In addition it is much faster to jump to a
    800 //   known backtrack code location than it is to pop an unknown backtrack
    801 //   location from the stack and jump there.
    802 //
    803 //   The virtual state found in the Trace affects code generation.  For example
    804 //   the virtual state contains the difference between the actual current
    805 //   position and the virtual current position, and matching code needs to use
    806 //   this offset to attempt a match in the correct location of the input
    807 //   string.  Therefore code generated for a non-trivial trace is specialized
    808 //   to that trace.  The code generator therefore has the ability to generate
    809 //   code for each node several times.  In order to limit the size of the
    810 //   generated code there is an arbitrary limit on how many specialized sets of
    811 //   code may be generated for a given node.  If the limit is reached, the
    812 //   trace is flushed and a generic version of the code for a node is emitted.
    813 //   This is subsequently used for that node.  The code emitted for non-generic
    814 //   trace is not recorded in the node and so it cannot currently be reused in
    815 //   the event that code generation is requested for an identical trace.
    816 
    817 
    818 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
    819   UNREACHABLE();
    820 }
    821 
    822 
    823 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
    824   text->AddElement(TextElement::Atom(this), zone);
    825 }
    826 
    827 
    828 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
    829   text->AddElement(TextElement::CharClass(this), zone);
    830 }
    831 
    832 
    833 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
    834   for (int i = 0; i < elements()->length(); i++)
    835     text->AddElement(elements()->at(i), zone);
    836 }
    837 
    838 
    839 TextElement TextElement::Atom(RegExpAtom* atom) {
    840   return TextElement(ATOM, atom);
    841 }
    842 
    843 
    844 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
    845   return TextElement(CHAR_CLASS, char_class);
    846 }
    847 
    848 
    849 int TextElement::length() const {
    850   switch (text_type()) {
    851     case ATOM:
    852       return atom()->length();
    853 
    854     case CHAR_CLASS:
    855       return 1;
    856   }
    857   UNREACHABLE();
    858 }
    859 
    860 
    861 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
    862   if (table_ == nullptr) {
    863     table_ = new(zone()) DispatchTable(zone());
    864     DispatchTableConstructor cons(table_, ignore_case, zone());
    865     cons.BuildTable(this);
    866   }
    867   return table_;
    868 }
    869 
    870 
    871 class FrequencyCollator {
    872  public:
    873   FrequencyCollator() : total_samples_(0) {
    874     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
    875       frequencies_[i] = CharacterFrequency(i);
    876     }
    877   }
    878 
    879   void CountCharacter(int character) {
    880     int index = (character & RegExpMacroAssembler::kTableMask);
    881     frequencies_[index].Increment();
    882     total_samples_++;
    883   }
    884 
    885   // Does not measure in percent, but rather per-128 (the table size from the
    886   // regexp macro assembler).
    887   int Frequency(int in_character) {
    888     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
    889     if (total_samples_ < 1) return 1;  // Division by zero.
    890     int freq_in_per128 =
    891         (frequencies_[in_character].counter() * 128) / total_samples_;
    892     return freq_in_per128;
    893   }
    894 
    895  private:
    896   class CharacterFrequency {
    897    public:
    898     CharacterFrequency() : counter_(0), character_(-1) { }
    899     explicit CharacterFrequency(int character)
    900         : counter_(0), character_(character) { }
    901 
    902     void Increment() { counter_++; }
    903     int counter() { return counter_; }
    904     int character() { return character_; }
    905 
    906    private:
    907     int counter_;
    908     int character_;
    909   };
    910 
    911 
    912  private:
    913   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
    914   int total_samples_;
    915 };
    916 
    917 
    918 class RegExpCompiler {
    919  public:
    920   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
    921                  bool is_one_byte);
    922 
    923   int AllocateRegister() {
    924     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
    925       reg_exp_too_big_ = true;
    926       return next_register_;
    927     }
    928     return next_register_++;
    929   }
    930 
    931   // Lookarounds to match lone surrogates for unicode character class matches
    932   // are never nested. We can therefore reuse registers.
    933   int UnicodeLookaroundStackRegister() {
    934     if (unicode_lookaround_stack_register_ == kNoRegister) {
    935       unicode_lookaround_stack_register_ = AllocateRegister();
    936     }
    937     return unicode_lookaround_stack_register_;
    938   }
    939 
    940   int UnicodeLookaroundPositionRegister() {
    941     if (unicode_lookaround_position_register_ == kNoRegister) {
    942       unicode_lookaround_position_register_ = AllocateRegister();
    943     }
    944     return unicode_lookaround_position_register_;
    945   }
    946 
    947   RegExpEngine::CompilationResult Assemble(Isolate* isolate,
    948                                            RegExpMacroAssembler* assembler,
    949                                            RegExpNode* start, int capture_count,
    950                                            Handle<String> pattern);
    951 
    952   inline void AddWork(RegExpNode* node) {
    953     if (!node->on_work_list() && !node->label()->is_bound()) {
    954       node->set_on_work_list(true);
    955       work_list_->push_back(node);
    956     }
    957   }
    958 
    959   static const int kImplementationOffset = 0;
    960   static const int kNumberOfRegistersOffset = 0;
    961   static const int kCodeOffset = 1;
    962 
    963   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
    964   EndNode* accept() { return accept_; }
    965 
    966   static const int kMaxRecursion = 100;
    967   inline int recursion_depth() { return recursion_depth_; }
    968   inline void IncrementRecursionDepth() { recursion_depth_++; }
    969   inline void DecrementRecursionDepth() { recursion_depth_--; }
    970 
    971   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
    972 
    973   inline bool one_byte() { return one_byte_; }
    974   inline bool optimize() { return optimize_; }
    975   inline void set_optimize(bool value) { optimize_ = value; }
    976   inline bool limiting_recursion() { return limiting_recursion_; }
    977   inline void set_limiting_recursion(bool value) {
    978     limiting_recursion_ = value;
    979   }
    980   bool read_backward() { return read_backward_; }
    981   void set_read_backward(bool value) { read_backward_ = value; }
    982   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
    983 
    984   int current_expansion_factor() { return current_expansion_factor_; }
    985   void set_current_expansion_factor(int value) {
    986     current_expansion_factor_ = value;
    987   }
    988 
    989   Isolate* isolate() const { return isolate_; }
    990   Zone* zone() const { return zone_; }
    991 
    992   static const int kNoRegister = -1;
    993 
    994  private:
    995   EndNode* accept_;
    996   int next_register_;
    997   int unicode_lookaround_stack_register_;
    998   int unicode_lookaround_position_register_;
    999   std::vector<RegExpNode*>* work_list_;
   1000   int recursion_depth_;
   1001   RegExpMacroAssembler* macro_assembler_;
   1002   bool one_byte_;
   1003   bool reg_exp_too_big_;
   1004   bool limiting_recursion_;
   1005   bool optimize_;
   1006   bool read_backward_;
   1007   int current_expansion_factor_;
   1008   FrequencyCollator frequency_collator_;
   1009   Isolate* isolate_;
   1010   Zone* zone_;
   1011 };
   1012 
   1013 
   1014 class RecursionCheck {
   1015  public:
   1016   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
   1017     compiler->IncrementRecursionDepth();
   1018   }
   1019   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
   1020  private:
   1021   RegExpCompiler* compiler_;
   1022 };
   1023 
   1024 
   1025 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
   1026   return RegExpEngine::CompilationResult(isolate, "RegExp too big");
   1027 }
   1028 
   1029 
   1030 // Attempts to compile the regexp using an Irregexp code generator.  Returns
   1031 // a fixed array or a null handle depending on whether it succeeded.
   1032 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
   1033                                bool one_byte)
   1034     : next_register_(2 * (capture_count + 1)),
   1035       unicode_lookaround_stack_register_(kNoRegister),
   1036       unicode_lookaround_position_register_(kNoRegister),
   1037       work_list_(nullptr),
   1038       recursion_depth_(0),
   1039       one_byte_(one_byte),
   1040       reg_exp_too_big_(false),
   1041       limiting_recursion_(false),
   1042       optimize_(FLAG_regexp_optimization),
   1043       read_backward_(false),
   1044       current_expansion_factor_(1),
   1045       frequency_collator_(),
   1046       isolate_(isolate),
   1047       zone_(zone) {
   1048   accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
   1049   DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
   1050 }
   1051 
   1052 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
   1053     Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
   1054     int capture_count, Handle<String> pattern) {
   1055 #ifdef DEBUG
   1056   if (FLAG_trace_regexp_assembler)
   1057     macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
   1058   else
   1059 #endif
   1060     macro_assembler_ = macro_assembler;
   1061 
   1062   std::vector<RegExpNode*> work_list;
   1063   work_list_ = &work_list;
   1064   Label fail;
   1065   macro_assembler_->PushBacktrack(&fail);
   1066   Trace new_trace;
   1067   start->Emit(this, &new_trace);
   1068   macro_assembler_->Bind(&fail);
   1069   macro_assembler_->Fail();
   1070   while (!work_list.empty()) {
   1071     RegExpNode* node = work_list.back();
   1072     work_list.pop_back();
   1073     node->set_on_work_list(false);
   1074     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
   1075   }
   1076   if (reg_exp_too_big_) {
   1077     macro_assembler_->AbortedCodeGeneration();
   1078     return IrregexpRegExpTooBig(isolate_);
   1079   }
   1080 
   1081   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
   1082   isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
   1083   work_list_ = nullptr;
   1084 #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP)
   1085   if (FLAG_print_code) {
   1086     CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
   1087     OFStream os(trace_scope.file());
   1088     Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
   1089   }
   1090 #endif
   1091 #ifdef DEBUG
   1092   if (FLAG_trace_regexp_assembler) {
   1093     delete macro_assembler_;
   1094   }
   1095 #endif
   1096   return RegExpEngine::CompilationResult(*code, next_register_);
   1097 }
   1098 
   1099 
   1100 bool Trace::DeferredAction::Mentions(int that) {
   1101   if (action_type() == ActionNode::CLEAR_CAPTURES) {
   1102     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
   1103     return range.Contains(that);
   1104   } else {
   1105     return reg() == that;
   1106   }
   1107 }
   1108 
   1109 
   1110 bool Trace::mentions_reg(int reg) {
   1111   for (DeferredAction* action = actions_; action != nullptr;
   1112        action = action->next()) {
   1113     if (action->Mentions(reg))
   1114       return true;
   1115   }
   1116   return false;
   1117 }
   1118 
   1119 
   1120 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
   1121   DCHECK_EQ(0, *cp_offset);
   1122   for (DeferredAction* action = actions_; action != nullptr;
   1123        action = action->next()) {
   1124     if (action->Mentions(reg)) {
   1125       if (action->action_type() == ActionNode::STORE_POSITION) {
   1126         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
   1127         return true;
   1128       } else {
   1129         return false;
   1130       }
   1131     }
   1132   }
   1133   return false;
   1134 }
   1135 
   1136 
   1137 int Trace::FindAffectedRegisters(OutSet* affected_registers,
   1138                                  Zone* zone) {
   1139   int max_register = RegExpCompiler::kNoRegister;
   1140   for (DeferredAction* action = actions_; action != nullptr;
   1141        action = action->next()) {
   1142     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
   1143       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
   1144       for (int i = range.from(); i <= range.to(); i++)
   1145         affected_registers->Set(i, zone);
   1146       if (range.to() > max_register) max_register = range.to();
   1147     } else {
   1148       affected_registers->Set(action->reg(), zone);
   1149       if (action->reg() > max_register) max_register = action->reg();
   1150     }
   1151   }
   1152   return max_register;
   1153 }
   1154 
   1155 
   1156 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
   1157                                      int max_register,
   1158                                      const OutSet& registers_to_pop,
   1159                                      const OutSet& registers_to_clear) {
   1160   for (int reg = max_register; reg >= 0; reg--) {
   1161     if (registers_to_pop.Get(reg)) {
   1162       assembler->PopRegister(reg);
   1163     } else if (registers_to_clear.Get(reg)) {
   1164       int clear_to = reg;
   1165       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
   1166         reg--;
   1167       }
   1168       assembler->ClearRegisters(reg, clear_to);
   1169     }
   1170   }
   1171 }
   1172 
   1173 
   1174 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
   1175                                    int max_register,
   1176                                    const OutSet& affected_registers,
   1177                                    OutSet* registers_to_pop,
   1178                                    OutSet* registers_to_clear,
   1179                                    Zone* zone) {
   1180   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
   1181   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
   1182 
   1183   // Count pushes performed to force a stack limit check occasionally.
   1184   int pushes = 0;
   1185 
   1186   for (int reg = 0; reg <= max_register; reg++) {
   1187     if (!affected_registers.Get(reg)) {
   1188       continue;
   1189     }
   1190 
   1191     // The chronologically first deferred action in the trace
   1192     // is used to infer the action needed to restore a register
   1193     // to its previous state (or not, if it's safe to ignore it).
   1194     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
   1195     DeferredActionUndoType undo_action = IGNORE;
   1196 
   1197     int value = 0;
   1198     bool absolute = false;
   1199     bool clear = false;
   1200     static const int kNoStore = kMinInt;
   1201     int store_position = kNoStore;
   1202     // This is a little tricky because we are scanning the actions in reverse
   1203     // historical order (newest first).
   1204     for (DeferredAction* action = actions_; action != nullptr;
   1205          action = action->next()) {
   1206       if (action->Mentions(reg)) {
   1207         switch (action->action_type()) {
   1208           case ActionNode::SET_REGISTER: {
   1209             Trace::DeferredSetRegister* psr =
   1210                 static_cast<Trace::DeferredSetRegister*>(action);
   1211             if (!absolute) {
   1212               value += psr->value();
   1213               absolute = true;
   1214             }
   1215             // SET_REGISTER is currently only used for newly introduced loop
   1216             // counters. They can have a significant previous value if they
   1217             // occur in a loop. TODO(lrn): Propagate this information, so
   1218             // we can set undo_action to IGNORE if we know there is no value to
   1219             // restore.
   1220             undo_action = RESTORE;
   1221             DCHECK_EQ(store_position, kNoStore);
   1222             DCHECK(!clear);
   1223             break;
   1224           }
   1225           case ActionNode::INCREMENT_REGISTER:
   1226             if (!absolute) {
   1227               value++;
   1228             }
   1229             DCHECK_EQ(store_position, kNoStore);
   1230             DCHECK(!clear);
   1231             undo_action = RESTORE;
   1232             break;
   1233           case ActionNode::STORE_POSITION: {
   1234             Trace::DeferredCapture* pc =
   1235                 static_cast<Trace::DeferredCapture*>(action);
   1236             if (!clear && store_position == kNoStore) {
   1237               store_position = pc->cp_offset();
   1238             }
   1239 
   1240             // For captures we know that stores and clears alternate.
   1241             // Other register, are never cleared, and if the occur
   1242             // inside a loop, they might be assigned more than once.
   1243             if (reg <= 1) {
   1244               // Registers zero and one, aka "capture zero", is
   1245               // always set correctly if we succeed. There is no
   1246               // need to undo a setting on backtrack, because we
   1247               // will set it again or fail.
   1248               undo_action = IGNORE;
   1249             } else {
   1250               undo_action = pc->is_capture() ? CLEAR : RESTORE;
   1251             }
   1252             DCHECK(!absolute);
   1253             DCHECK_EQ(value, 0);
   1254             break;
   1255           }
   1256           case ActionNode::CLEAR_CAPTURES: {
   1257             // Since we're scanning in reverse order, if we've already
   1258             // set the position we have to ignore historically earlier
   1259             // clearing operations.
   1260             if (store_position == kNoStore) {
   1261               clear = true;
   1262             }
   1263             undo_action = RESTORE;
   1264             DCHECK(!absolute);
   1265             DCHECK_EQ(value, 0);
   1266             break;
   1267           }
   1268           default:
   1269             UNREACHABLE();
   1270             break;
   1271         }
   1272       }
   1273     }
   1274     // Prepare for the undo-action (e.g., push if it's going to be popped).
   1275     if (undo_action == RESTORE) {
   1276       pushes++;
   1277       RegExpMacroAssembler::StackCheckFlag stack_check =
   1278           RegExpMacroAssembler::kNoStackLimitCheck;
   1279       if (pushes == push_limit) {
   1280         stack_check = RegExpMacroAssembler::kCheckStackLimit;
   1281         pushes = 0;
   1282       }
   1283 
   1284       assembler->PushRegister(reg, stack_check);
   1285       registers_to_pop->Set(reg, zone);
   1286     } else if (undo_action == CLEAR) {
   1287       registers_to_clear->Set(reg, zone);
   1288     }
   1289     // Perform the chronologically last action (or accumulated increment)
   1290     // for the register.
   1291     if (store_position != kNoStore) {
   1292       assembler->WriteCurrentPositionToRegister(reg, store_position);
   1293     } else if (clear) {
   1294       assembler->ClearRegisters(reg, reg);
   1295     } else if (absolute) {
   1296       assembler->SetRegister(reg, value);
   1297     } else if (value != 0) {
   1298       assembler->AdvanceRegister(reg, value);
   1299     }
   1300   }
   1301 }
   1302 
   1303 
   1304 // This is called as we come into a loop choice node and some other tricky
   1305 // nodes.  It normalizes the state of the code generator to ensure we can
   1306 // generate generic code.
   1307 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
   1308   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1309 
   1310   DCHECK(!is_trivial());
   1311 
   1312   if (actions_ == nullptr && backtrack() == nullptr) {
   1313     // Here we just have some deferred cp advances to fix and we are back to
   1314     // a normal situation.  We may also have to forget some information gained
   1315     // through a quick check that was already performed.
   1316     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
   1317     // Create a new trivial state and generate the node with that.
   1318     Trace new_state;
   1319     successor->Emit(compiler, &new_state);
   1320     return;
   1321   }
   1322 
   1323   // Generate deferred actions here along with code to undo them again.
   1324   OutSet affected_registers;
   1325 
   1326   if (backtrack() != nullptr) {
   1327     // Here we have a concrete backtrack location.  These are set up by choice
   1328     // nodes and so they indicate that we have a deferred save of the current
   1329     // position which we may need to emit here.
   1330     assembler->PushCurrentPosition();
   1331   }
   1332 
   1333   int max_register = FindAffectedRegisters(&affected_registers,
   1334                                            compiler->zone());
   1335   OutSet registers_to_pop;
   1336   OutSet registers_to_clear;
   1337   PerformDeferredActions(assembler,
   1338                          max_register,
   1339                          affected_registers,
   1340                          &registers_to_pop,
   1341                          &registers_to_clear,
   1342                          compiler->zone());
   1343   if (cp_offset_ != 0) {
   1344     assembler->AdvanceCurrentPosition(cp_offset_);
   1345   }
   1346 
   1347   // Create a new trivial state and generate the node with that.
   1348   Label undo;
   1349   assembler->PushBacktrack(&undo);
   1350   if (successor->KeepRecursing(compiler)) {
   1351     Trace new_state;
   1352     successor->Emit(compiler, &new_state);
   1353   } else {
   1354     compiler->AddWork(successor);
   1355     assembler->GoTo(successor->label());
   1356   }
   1357 
   1358   // On backtrack we need to restore state.
   1359   assembler->Bind(&undo);
   1360   RestoreAffectedRegisters(assembler,
   1361                            max_register,
   1362                            registers_to_pop,
   1363                            registers_to_clear);
   1364   if (backtrack() == nullptr) {
   1365     assembler->Backtrack();
   1366   } else {
   1367     assembler->PopCurrentPosition();
   1368     assembler->GoTo(backtrack());
   1369   }
   1370 }
   1371 
   1372 
   1373 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
   1374   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1375 
   1376   // Omit flushing the trace. We discard the entire stack frame anyway.
   1377 
   1378   if (!label()->is_bound()) {
   1379     // We are completely independent of the trace, since we ignore it,
   1380     // so this code can be used as the generic version.
   1381     assembler->Bind(label());
   1382   }
   1383 
   1384   // Throw away everything on the backtrack stack since the start
   1385   // of the negative submatch and restore the character position.
   1386   assembler->ReadCurrentPositionFromRegister(current_position_register_);
   1387   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
   1388   if (clear_capture_count_ > 0) {
   1389     // Clear any captures that might have been performed during the success
   1390     // of the body of the negative look-ahead.
   1391     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
   1392     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
   1393   }
   1394   // Now that we have unwound the stack we find at the top of the stack the
   1395   // backtrack that the BeginSubmatch node got.
   1396   assembler->Backtrack();
   1397 }
   1398 
   1399 
   1400 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   1401   if (!trace->is_trivial()) {
   1402     trace->Flush(compiler, this);
   1403     return;
   1404   }
   1405   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1406   if (!label()->is_bound()) {
   1407     assembler->Bind(label());
   1408   }
   1409   switch (action_) {
   1410     case ACCEPT:
   1411       assembler->Succeed();
   1412       return;
   1413     case BACKTRACK:
   1414       assembler->GoTo(trace->backtrack());
   1415       return;
   1416     case NEGATIVE_SUBMATCH_SUCCESS:
   1417       // This case is handled in a different virtual method.
   1418       UNREACHABLE();
   1419   }
   1420   UNIMPLEMENTED();
   1421 }
   1422 
   1423 
   1424 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
   1425   if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
   1426   guards_->Add(guard, zone);
   1427 }
   1428 
   1429 
   1430 ActionNode* ActionNode::SetRegister(int reg,
   1431                                     int val,
   1432                                     RegExpNode* on_success) {
   1433   ActionNode* result =
   1434       new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
   1435   result->data_.u_store_register.reg = reg;
   1436   result->data_.u_store_register.value = val;
   1437   return result;
   1438 }
   1439 
   1440 
   1441 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
   1442   ActionNode* result =
   1443       new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
   1444   result->data_.u_increment_register.reg = reg;
   1445   return result;
   1446 }
   1447 
   1448 
   1449 ActionNode* ActionNode::StorePosition(int reg,
   1450                                       bool is_capture,
   1451                                       RegExpNode* on_success) {
   1452   ActionNode* result =
   1453       new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
   1454   result->data_.u_position_register.reg = reg;
   1455   result->data_.u_position_register.is_capture = is_capture;
   1456   return result;
   1457 }
   1458 
   1459 
   1460 ActionNode* ActionNode::ClearCaptures(Interval range,
   1461                                       RegExpNode* on_success) {
   1462   ActionNode* result =
   1463       new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
   1464   result->data_.u_clear_captures.range_from = range.from();
   1465   result->data_.u_clear_captures.range_to = range.to();
   1466   return result;
   1467 }
   1468 
   1469 
   1470 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
   1471                                       int position_reg,
   1472                                       RegExpNode* on_success) {
   1473   ActionNode* result =
   1474       new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
   1475   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1476   result->data_.u_submatch.current_position_register = position_reg;
   1477   return result;
   1478 }
   1479 
   1480 
   1481 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
   1482                                                 int position_reg,
   1483                                                 int clear_register_count,
   1484                                                 int clear_register_from,
   1485                                                 RegExpNode* on_success) {
   1486   ActionNode* result =
   1487       new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
   1488   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1489   result->data_.u_submatch.current_position_register = position_reg;
   1490   result->data_.u_submatch.clear_register_count = clear_register_count;
   1491   result->data_.u_submatch.clear_register_from = clear_register_from;
   1492   return result;
   1493 }
   1494 
   1495 
   1496 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
   1497                                         int repetition_register,
   1498                                         int repetition_limit,
   1499                                         RegExpNode* on_success) {
   1500   ActionNode* result =
   1501       new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
   1502   result->data_.u_empty_match_check.start_register = start_register;
   1503   result->data_.u_empty_match_check.repetition_register = repetition_register;
   1504   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
   1505   return result;
   1506 }
   1507 
   1508 
   1509 #define DEFINE_ACCEPT(Type)                                          \
   1510   void Type##Node::Accept(NodeVisitor* visitor) {                    \
   1511     visitor->Visit##Type(this);                                      \
   1512   }
   1513 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
   1514 #undef DEFINE_ACCEPT
   1515 
   1516 
   1517 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
   1518   visitor->VisitLoopChoice(this);
   1519 }
   1520 
   1521 
   1522 // -------------------------------------------------------------------
   1523 // Emit code.
   1524 
   1525 
   1526 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
   1527                                Guard* guard,
   1528                                Trace* trace) {
   1529   switch (guard->op()) {
   1530     case Guard::LT:
   1531       DCHECK(!trace->mentions_reg(guard->reg()));
   1532       macro_assembler->IfRegisterGE(guard->reg(),
   1533                                     guard->value(),
   1534                                     trace->backtrack());
   1535       break;
   1536     case Guard::GEQ:
   1537       DCHECK(!trace->mentions_reg(guard->reg()));
   1538       macro_assembler->IfRegisterLT(guard->reg(),
   1539                                     guard->value(),
   1540                                     trace->backtrack());
   1541       break;
   1542   }
   1543 }
   1544 
   1545 
   1546 // Returns the number of characters in the equivalence class, omitting those
   1547 // that cannot occur in the source string because it is Latin1.
   1548 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
   1549                                      bool one_byte_subject,
   1550                                      unibrow::uchar* letters) {
   1551   int length =
   1552       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
   1553   // Unibrow returns 0 or 1 for characters where case independence is
   1554   // trivial.
   1555   if (length == 0) {
   1556     letters[0] = character;
   1557     length = 1;
   1558   }
   1559 
   1560   if (one_byte_subject) {
   1561     int new_length = 0;
   1562     for (int i = 0; i < length; i++) {
   1563       if (letters[i] <= String::kMaxOneByteCharCode) {
   1564         letters[new_length++] = letters[i];
   1565       }
   1566     }
   1567     length = new_length;
   1568   }
   1569 
   1570   return length;
   1571 }
   1572 
   1573 
   1574 static inline bool EmitSimpleCharacter(Isolate* isolate,
   1575                                        RegExpCompiler* compiler,
   1576                                        uc16 c,
   1577                                        Label* on_failure,
   1578                                        int cp_offset,
   1579                                        bool check,
   1580                                        bool preloaded) {
   1581   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1582   bool bound_checked = false;
   1583   if (!preloaded) {
   1584     assembler->LoadCurrentCharacter(
   1585         cp_offset,
   1586         on_failure,
   1587         check);
   1588     bound_checked = true;
   1589   }
   1590   assembler->CheckNotCharacter(c, on_failure);
   1591   return bound_checked;
   1592 }
   1593 
   1594 
   1595 // Only emits non-letters (things that don't have case).  Only used for case
   1596 // independent matches.
   1597 static inline bool EmitAtomNonLetter(Isolate* isolate,
   1598                                      RegExpCompiler* compiler,
   1599                                      uc16 c,
   1600                                      Label* on_failure,
   1601                                      int cp_offset,
   1602                                      bool check,
   1603                                      bool preloaded) {
   1604   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1605   bool one_byte = compiler->one_byte();
   1606   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1607   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
   1608   if (length < 1) {
   1609     // This can't match.  Must be an one-byte subject and a non-one-byte
   1610     // character.  We do not need to do anything since the one-byte pass
   1611     // already handled this.
   1612     return false;  // Bounds not checked.
   1613   }
   1614   bool checked = false;
   1615   // We handle the length > 1 case in a later pass.
   1616   if (length == 1) {
   1617     if (one_byte && c > String::kMaxOneByteCharCodeU) {
   1618       // Can't match - see above.
   1619       return false;  // Bounds not checked.
   1620     }
   1621     if (!preloaded) {
   1622       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1623       checked = check;
   1624     }
   1625     macro_assembler->CheckNotCharacter(c, on_failure);
   1626   }
   1627   return checked;
   1628 }
   1629 
   1630 
   1631 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
   1632                                       bool one_byte, uc16 c1, uc16 c2,
   1633                                       Label* on_failure) {
   1634   uc16 char_mask;
   1635   if (one_byte) {
   1636     char_mask = String::kMaxOneByteCharCode;
   1637   } else {
   1638     char_mask = String::kMaxUtf16CodeUnit;
   1639   }
   1640   uc16 exor = c1 ^ c2;
   1641   // Check whether exor has only one bit set.
   1642   if (((exor - 1) & exor) == 0) {
   1643     // If c1 and c2 differ only by one bit.
   1644     // Ecma262UnCanonicalize always gives the highest number last.
   1645     DCHECK(c2 > c1);
   1646     uc16 mask = char_mask ^ exor;
   1647     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
   1648     return true;
   1649   }
   1650   DCHECK(c2 > c1);
   1651   uc16 diff = c2 - c1;
   1652   if (((diff - 1) & diff) == 0 && c1 >= diff) {
   1653     // If the characters differ by 2^n but don't differ by one bit then
   1654     // subtract the difference from the found character, then do the or
   1655     // trick.  We avoid the theoretical case where negative numbers are
   1656     // involved in order to simplify code generation.
   1657     uc16 mask = char_mask ^ diff;
   1658     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
   1659                                                     diff,
   1660                                                     mask,
   1661                                                     on_failure);
   1662     return true;
   1663   }
   1664   return false;
   1665 }
   1666 
   1667 
   1668 typedef bool EmitCharacterFunction(Isolate* isolate,
   1669                                    RegExpCompiler* compiler,
   1670                                    uc16 c,
   1671                                    Label* on_failure,
   1672                                    int cp_offset,
   1673                                    bool check,
   1674                                    bool preloaded);
   1675 
   1676 // Only emits letters (things that have case).  Only used for case independent
   1677 // matches.
   1678 static inline bool EmitAtomLetter(Isolate* isolate,
   1679                                   RegExpCompiler* compiler,
   1680                                   uc16 c,
   1681                                   Label* on_failure,
   1682                                   int cp_offset,
   1683                                   bool check,
   1684                                   bool preloaded) {
   1685   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1686   bool one_byte = compiler->one_byte();
   1687   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1688   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
   1689   if (length <= 1) return false;
   1690   // We may not need to check against the end of the input string
   1691   // if this character lies before a character that matched.
   1692   if (!preloaded) {
   1693     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1694   }
   1695   Label ok;
   1696   DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
   1697   switch (length) {
   1698     case 2: {
   1699       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
   1700                                     chars[1], on_failure)) {
   1701       } else {
   1702         macro_assembler->CheckCharacter(chars[0], &ok);
   1703         macro_assembler->CheckNotCharacter(chars[1], on_failure);
   1704         macro_assembler->Bind(&ok);
   1705       }
   1706       break;
   1707     }
   1708     case 4:
   1709       macro_assembler->CheckCharacter(chars[3], &ok);
   1710       V8_FALLTHROUGH;
   1711     case 3:
   1712       macro_assembler->CheckCharacter(chars[0], &ok);
   1713       macro_assembler->CheckCharacter(chars[1], &ok);
   1714       macro_assembler->CheckNotCharacter(chars[2], on_failure);
   1715       macro_assembler->Bind(&ok);
   1716       break;
   1717     default:
   1718       UNREACHABLE();
   1719       break;
   1720   }
   1721   return true;
   1722 }
   1723 
   1724 
   1725 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
   1726                              int border,
   1727                              Label* fall_through,
   1728                              Label* above_or_equal,
   1729                              Label* below) {
   1730   if (below != fall_through) {
   1731     masm->CheckCharacterLT(border, below);
   1732     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
   1733   } else {
   1734     masm->CheckCharacterGT(border - 1, above_or_equal);
   1735   }
   1736 }
   1737 
   1738 
   1739 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
   1740                                    int first,
   1741                                    int last,
   1742                                    Label* fall_through,
   1743                                    Label* in_range,
   1744                                    Label* out_of_range) {
   1745   if (in_range == fall_through) {
   1746     if (first == last) {
   1747       masm->CheckNotCharacter(first, out_of_range);
   1748     } else {
   1749       masm->CheckCharacterNotInRange(first, last, out_of_range);
   1750     }
   1751   } else {
   1752     if (first == last) {
   1753       masm->CheckCharacter(first, in_range);
   1754     } else {
   1755       masm->CheckCharacterInRange(first, last, in_range);
   1756     }
   1757     if (out_of_range != fall_through) masm->GoTo(out_of_range);
   1758   }
   1759 }
   1760 
   1761 
   1762 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
   1763 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
   1764 static void EmitUseLookupTable(
   1765     RegExpMacroAssembler* masm,
   1766     ZoneList<int>* ranges,
   1767     int start_index,
   1768     int end_index,
   1769     int min_char,
   1770     Label* fall_through,
   1771     Label* even_label,
   1772     Label* odd_label) {
   1773   static const int kSize = RegExpMacroAssembler::kTableSize;
   1774   static const int kMask = RegExpMacroAssembler::kTableMask;
   1775 
   1776   int base = (min_char & ~kMask);
   1777   USE(base);
   1778 
   1779   // Assert that everything is on one kTableSize page.
   1780   for (int i = start_index; i <= end_index; i++) {
   1781     DCHECK_EQ(ranges->at(i) & ~kMask, base);
   1782   }
   1783   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
   1784 
   1785   char templ[kSize];
   1786   Label* on_bit_set;
   1787   Label* on_bit_clear;
   1788   int bit;
   1789   if (even_label == fall_through) {
   1790     on_bit_set = odd_label;
   1791     on_bit_clear = even_label;
   1792     bit = 1;
   1793   } else {
   1794     on_bit_set = even_label;
   1795     on_bit_clear = odd_label;
   1796     bit = 0;
   1797   }
   1798   for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
   1799     templ[i] = bit;
   1800   }
   1801   int j = 0;
   1802   bit ^= 1;
   1803   for (int i = start_index; i < end_index; i++) {
   1804     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
   1805       templ[j] = bit;
   1806     }
   1807     bit ^= 1;
   1808   }
   1809   for (int i = j; i < kSize; i++) {
   1810     templ[i] = bit;
   1811   }
   1812   Factory* factory = masm->isolate()->factory();
   1813   // TODO(erikcorry): Cache these.
   1814   Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
   1815   for (int i = 0; i < kSize; i++) {
   1816     ba->set(i, templ[i]);
   1817   }
   1818   masm->CheckBitInTable(ba, on_bit_set);
   1819   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
   1820 }
   1821 
   1822 
   1823 static void CutOutRange(RegExpMacroAssembler* masm,
   1824                         ZoneList<int>* ranges,
   1825                         int start_index,
   1826                         int end_index,
   1827                         int cut_index,
   1828                         Label* even_label,
   1829                         Label* odd_label) {
   1830   bool odd = (((cut_index - start_index) & 1) == 1);
   1831   Label* in_range_label = odd ? odd_label : even_label;
   1832   Label dummy;
   1833   EmitDoubleBoundaryTest(masm,
   1834                          ranges->at(cut_index),
   1835                          ranges->at(cut_index + 1) - 1,
   1836                          &dummy,
   1837                          in_range_label,
   1838                          &dummy);
   1839   DCHECK(!dummy.is_linked());
   1840   // Cut out the single range by rewriting the array.  This creates a new
   1841   // range that is a merger of the two ranges on either side of the one we
   1842   // are cutting out.  The oddity of the labels is preserved.
   1843   for (int j = cut_index; j > start_index; j--) {
   1844     ranges->at(j) = ranges->at(j - 1);
   1845   }
   1846   for (int j = cut_index + 1; j < end_index; j++) {
   1847     ranges->at(j) = ranges->at(j + 1);
   1848   }
   1849 }
   1850 
   1851 
   1852 // Unicode case.  Split the search space into kSize spaces that are handled
   1853 // with recursion.
   1854 static void SplitSearchSpace(ZoneList<int>* ranges,
   1855                              int start_index,
   1856                              int end_index,
   1857                              int* new_start_index,
   1858                              int* new_end_index,
   1859                              int* border) {
   1860   static const int kSize = RegExpMacroAssembler::kTableSize;
   1861   static const int kMask = RegExpMacroAssembler::kTableMask;
   1862 
   1863   int first = ranges->at(start_index);
   1864   int last = ranges->at(end_index) - 1;
   1865 
   1866   *new_start_index = start_index;
   1867   *border = (ranges->at(start_index) & ~kMask) + kSize;
   1868   while (*new_start_index < end_index) {
   1869     if (ranges->at(*new_start_index) > *border) break;
   1870     (*new_start_index)++;
   1871   }
   1872   // new_start_index is the index of the first edge that is beyond the
   1873   // current kSize space.
   1874 
   1875   // For very large search spaces we do a binary chop search of the non-Latin1
   1876   // space instead of just going to the end of the current kSize space.  The
   1877   // heuristics are complicated a little by the fact that any 128-character
   1878   // encoding space can be quickly tested with a table lookup, so we don't
   1879   // wish to do binary chop search at a smaller granularity than that.  A
   1880   // 128-character space can take up a lot of space in the ranges array if,
   1881   // for example, we only want to match every second character (eg. the lower
   1882   // case characters on some Unicode pages).
   1883   int binary_chop_index = (end_index + start_index) / 2;
   1884   // The first test ensures that we get to the code that handles the Latin1
   1885   // range with a single not-taken branch, speeding up this important
   1886   // character range (even non-Latin1 charset-based text has spaces and
   1887   // punctuation).
   1888   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
   1889       end_index - start_index > (*new_start_index - start_index) * 2 &&
   1890       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
   1891       ranges->at(binary_chop_index) >= first + 2 * kSize) {
   1892     int scan_forward_for_section_border = binary_chop_index;;
   1893     int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
   1894 
   1895     while (scan_forward_for_section_border < end_index) {
   1896       if (ranges->at(scan_forward_for_section_border) > new_border) {
   1897         *new_start_index = scan_forward_for_section_border;
   1898         *border = new_border;
   1899         break;
   1900       }
   1901       scan_forward_for_section_border++;
   1902     }
   1903   }
   1904 
   1905   DCHECK(*new_start_index > start_index);
   1906   *new_end_index = *new_start_index - 1;
   1907   if (ranges->at(*new_end_index) == *border) {
   1908     (*new_end_index)--;
   1909   }
   1910   if (*border >= ranges->at(end_index)) {
   1911     *border = ranges->at(end_index);
   1912     *new_start_index = end_index;  // Won't be used.
   1913     *new_end_index = end_index - 1;
   1914   }
   1915 }
   1916 
   1917 // Gets a series of segment boundaries representing a character class.  If the
   1918 // character is in the range between an even and an odd boundary (counting from
   1919 // start_index) then go to even_label, otherwise go to odd_label.  We already
   1920 // know that the character is in the range of min_char to max_char inclusive.
   1921 // Either label can be nullptr indicating backtracking.  Either label can also
   1922 // be equal to the fall_through label.
   1923 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
   1924                              int start_index, int end_index, uc32 min_char,
   1925                              uc32 max_char, Label* fall_through,
   1926                              Label* even_label, Label* odd_label) {
   1927   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
   1928   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
   1929 
   1930   int first = ranges->at(start_index);
   1931   int last = ranges->at(end_index) - 1;
   1932 
   1933   DCHECK_LT(min_char, first);
   1934 
   1935   // Just need to test if the character is before or on-or-after
   1936   // a particular character.
   1937   if (start_index == end_index) {
   1938     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
   1939     return;
   1940   }
   1941 
   1942   // Another almost trivial case:  There is one interval in the middle that is
   1943   // different from the end intervals.
   1944   if (start_index + 1 == end_index) {
   1945     EmitDoubleBoundaryTest(
   1946         masm, first, last, fall_through, even_label, odd_label);
   1947     return;
   1948   }
   1949 
   1950   // It's not worth using table lookup if there are very few intervals in the
   1951   // character class.
   1952   if (end_index - start_index <= 6) {
   1953     // It is faster to test for individual characters, so we look for those
   1954     // first, then try arbitrary ranges in the second round.
   1955     static int kNoCutIndex = -1;
   1956     int cut = kNoCutIndex;
   1957     for (int i = start_index; i < end_index; i++) {
   1958       if (ranges->at(i) == ranges->at(i + 1) - 1) {
   1959         cut = i;
   1960         break;
   1961       }
   1962     }
   1963     if (cut == kNoCutIndex) cut = start_index;
   1964     CutOutRange(
   1965         masm, ranges, start_index, end_index, cut, even_label, odd_label);
   1966     DCHECK_GE(end_index - start_index, 2);
   1967     GenerateBranches(masm,
   1968                      ranges,
   1969                      start_index + 1,
   1970                      end_index - 1,
   1971                      min_char,
   1972                      max_char,
   1973                      fall_through,
   1974                      even_label,
   1975                      odd_label);
   1976     return;
   1977   }
   1978 
   1979   // If there are a lot of intervals in the regexp, then we will use tables to
   1980   // determine whether the character is inside or outside the character class.
   1981   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
   1982 
   1983   if ((max_char >> kBits) == (min_char >> kBits)) {
   1984     EmitUseLookupTable(masm,
   1985                        ranges,
   1986                        start_index,
   1987                        end_index,
   1988                        min_char,
   1989                        fall_through,
   1990                        even_label,
   1991                        odd_label);
   1992     return;
   1993   }
   1994 
   1995   if ((min_char >> kBits) != (first >> kBits)) {
   1996     masm->CheckCharacterLT(first, odd_label);
   1997     GenerateBranches(masm,
   1998                      ranges,
   1999                      start_index + 1,
   2000                      end_index,
   2001                      first,
   2002                      max_char,
   2003                      fall_through,
   2004                      odd_label,
   2005                      even_label);
   2006     return;
   2007   }
   2008 
   2009   int new_start_index = 0;
   2010   int new_end_index = 0;
   2011   int border = 0;
   2012 
   2013   SplitSearchSpace(ranges,
   2014                    start_index,
   2015                    end_index,
   2016                    &new_start_index,
   2017                    &new_end_index,
   2018                    &border);
   2019 
   2020   Label handle_rest;
   2021   Label* above = &handle_rest;
   2022   if (border == last + 1) {
   2023     // We didn't find any section that started after the limit, so everything
   2024     // above the border is one of the terminal labels.
   2025     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
   2026     DCHECK(new_end_index == end_index - 1);
   2027   }
   2028 
   2029   DCHECK_LE(start_index, new_end_index);
   2030   DCHECK_LE(new_start_index, end_index);
   2031   DCHECK_LT(start_index, new_start_index);
   2032   DCHECK_LT(new_end_index, end_index);
   2033   DCHECK(new_end_index + 1 == new_start_index ||
   2034          (new_end_index + 2 == new_start_index &&
   2035           border == ranges->at(new_end_index + 1)));
   2036   DCHECK_LT(min_char, border - 1);
   2037   DCHECK_LT(border, max_char);
   2038   DCHECK_LT(ranges->at(new_end_index), border);
   2039   DCHECK(border < ranges->at(new_start_index) ||
   2040          (border == ranges->at(new_start_index) &&
   2041           new_start_index == end_index &&
   2042           new_end_index == end_index - 1 &&
   2043           border == last + 1));
   2044   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
   2045 
   2046   masm->CheckCharacterGT(border - 1, above);
   2047   Label dummy;
   2048   GenerateBranches(masm,
   2049                    ranges,
   2050                    start_index,
   2051                    new_end_index,
   2052                    min_char,
   2053                    border - 1,
   2054                    &dummy,
   2055                    even_label,
   2056                    odd_label);
   2057   if (handle_rest.is_linked()) {
   2058     masm->Bind(&handle_rest);
   2059     bool flip = (new_start_index & 1) != (start_index & 1);
   2060     GenerateBranches(masm,
   2061                      ranges,
   2062                      new_start_index,
   2063                      end_index,
   2064                      border,
   2065                      max_char,
   2066                      &dummy,
   2067                      flip ? odd_label : even_label,
   2068                      flip ? even_label : odd_label);
   2069   }
   2070 }
   2071 
   2072 
   2073 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
   2074                           RegExpCharacterClass* cc, bool one_byte,
   2075                           Label* on_failure, int cp_offset, bool check_offset,
   2076                           bool preloaded, Zone* zone) {
   2077   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
   2078   CharacterRange::Canonicalize(ranges);
   2079 
   2080   int max_char;
   2081   if (one_byte) {
   2082     max_char = String::kMaxOneByteCharCode;
   2083   } else {
   2084     max_char = String::kMaxUtf16CodeUnit;
   2085   }
   2086 
   2087   int range_count = ranges->length();
   2088 
   2089   int last_valid_range = range_count - 1;
   2090   while (last_valid_range >= 0) {
   2091     CharacterRange& range = ranges->at(last_valid_range);
   2092     if (range.from() <= max_char) {
   2093       break;
   2094     }
   2095     last_valid_range--;
   2096   }
   2097 
   2098   if (last_valid_range < 0) {
   2099     if (!cc->is_negated()) {
   2100       macro_assembler->GoTo(on_failure);
   2101     }
   2102     if (check_offset) {
   2103       macro_assembler->CheckPosition(cp_offset, on_failure);
   2104     }
   2105     return;
   2106   }
   2107 
   2108   if (last_valid_range == 0 &&
   2109       ranges->at(0).IsEverything(max_char)) {
   2110     if (cc->is_negated()) {
   2111       macro_assembler->GoTo(on_failure);
   2112     } else {
   2113       // This is a common case hit by non-anchored expressions.
   2114       if (check_offset) {
   2115         macro_assembler->CheckPosition(cp_offset, on_failure);
   2116       }
   2117     }
   2118     return;
   2119   }
   2120 
   2121   if (!preloaded) {
   2122     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
   2123   }
   2124 
   2125   if (cc->is_standard(zone) &&
   2126       macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
   2127                                                   on_failure)) {
   2128       return;
   2129   }
   2130 
   2131 
   2132   // A new list with ascending entries.  Each entry is a code unit
   2133   // where there is a boundary between code units that are part of
   2134   // the class and code units that are not.  Normally we insert an
   2135   // entry at zero which goes to the failure label, but if there
   2136   // was already one there we fall through for success on that entry.
   2137   // Subsequent entries have alternating meaning (success/failure).
   2138   ZoneList<int>* range_boundaries =
   2139       new(zone) ZoneList<int>(last_valid_range, zone);
   2140 
   2141   bool zeroth_entry_is_failure = !cc->is_negated();
   2142 
   2143   for (int i = 0; i <= last_valid_range; i++) {
   2144     CharacterRange& range = ranges->at(i);
   2145     if (range.from() == 0) {
   2146       DCHECK_EQ(i, 0);
   2147       zeroth_entry_is_failure = !zeroth_entry_is_failure;
   2148     } else {
   2149       range_boundaries->Add(range.from(), zone);
   2150     }
   2151     range_boundaries->Add(range.to() + 1, zone);
   2152   }
   2153   int end_index = range_boundaries->length() - 1;
   2154   if (range_boundaries->at(end_index) > max_char) {
   2155     end_index--;
   2156   }
   2157 
   2158   Label fall_through;
   2159   GenerateBranches(macro_assembler,
   2160                    range_boundaries,
   2161                    0,  // start_index.
   2162                    end_index,
   2163                    0,  // min_char.
   2164                    max_char,
   2165                    &fall_through,
   2166                    zeroth_entry_is_failure ? &fall_through : on_failure,
   2167                    zeroth_entry_is_failure ? on_failure : &fall_through);
   2168   macro_assembler->Bind(&fall_through);
   2169 }
   2170 
   2171 
   2172 RegExpNode::~RegExpNode() {
   2173 }
   2174 
   2175 
   2176 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
   2177                                                   Trace* trace) {
   2178   // If we are generating a greedy loop then don't stop and don't reuse code.
   2179   if (trace->stop_node() != nullptr) {
   2180     return CONTINUE;
   2181   }
   2182 
   2183   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2184   if (trace->is_trivial()) {
   2185     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
   2186       // If a generic version is already scheduled to be generated or we have
   2187       // recursed too deeply then just generate a jump to that code.
   2188       macro_assembler->GoTo(&label_);
   2189       // This will queue it up for generation of a generic version if it hasn't
   2190       // already been queued.
   2191       compiler->AddWork(this);
   2192       return DONE;
   2193     }
   2194     // Generate generic version of the node and bind the label for later use.
   2195     macro_assembler->Bind(&label_);
   2196     return CONTINUE;
   2197   }
   2198 
   2199   // We are being asked to make a non-generic version.  Keep track of how many
   2200   // non-generic versions we generate so as not to overdo it.
   2201   trace_count_++;
   2202   if (KeepRecursing(compiler) && compiler->optimize() &&
   2203       trace_count_ < kMaxCopiesCodeGenerated) {
   2204     return CONTINUE;
   2205   }
   2206 
   2207   // If we get here code has been generated for this node too many times or
   2208   // recursion is too deep.  Time to switch to a generic version.  The code for
   2209   // generic versions above can handle deep recursion properly.
   2210   bool was_limiting = compiler->limiting_recursion();
   2211   compiler->set_limiting_recursion(true);
   2212   trace->Flush(compiler, this);
   2213   compiler->set_limiting_recursion(was_limiting);
   2214   return DONE;
   2215 }
   2216 
   2217 
   2218 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
   2219   return !compiler->limiting_recursion() &&
   2220          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
   2221 }
   2222 
   2223 
   2224 int ActionNode::EatsAtLeast(int still_to_find,
   2225                             int budget,
   2226                             bool not_at_start) {
   2227   if (budget <= 0) return 0;
   2228   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
   2229   return on_success()->EatsAtLeast(still_to_find,
   2230                                    budget - 1,
   2231                                    not_at_start);
   2232 }
   2233 
   2234 
   2235 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   2236                               BoyerMooreLookahead* bm, bool not_at_start) {
   2237   if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
   2238     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
   2239   }
   2240   SaveBMInfo(bm, not_at_start, offset);
   2241 }
   2242 
   2243 
   2244 int AssertionNode::EatsAtLeast(int still_to_find,
   2245                                int budget,
   2246                                bool not_at_start) {
   2247   if (budget <= 0) return 0;
   2248   // If we know we are not at the start and we are asked "how many characters
   2249   // will you match if you succeed?" then we can answer anything since false
   2250   // implies false.  So lets just return the max answer (still_to_find) since
   2251   // that won't prevent us from preloading a lot of characters for the other
   2252   // branches in the node graph.
   2253   if (assertion_type() == AT_START && not_at_start) return still_to_find;
   2254   return on_success()->EatsAtLeast(still_to_find,
   2255                                    budget - 1,
   2256                                    not_at_start);
   2257 }
   2258 
   2259 
   2260 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   2261                                  BoyerMooreLookahead* bm, bool not_at_start) {
   2262   // Match the behaviour of EatsAtLeast on this node.
   2263   if (assertion_type() == AT_START && not_at_start) return;
   2264   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
   2265   SaveBMInfo(bm, not_at_start, offset);
   2266 }
   2267 
   2268 
   2269 int BackReferenceNode::EatsAtLeast(int still_to_find,
   2270                                    int budget,
   2271                                    bool not_at_start) {
   2272   if (read_backward()) return 0;
   2273   if (budget <= 0) return 0;
   2274   return on_success()->EatsAtLeast(still_to_find,
   2275                                    budget - 1,
   2276                                    not_at_start);
   2277 }
   2278 
   2279 
   2280 int TextNode::EatsAtLeast(int still_to_find,
   2281                           int budget,
   2282                           bool not_at_start) {
   2283   if (read_backward()) return 0;
   2284   int answer = Length();
   2285   if (answer >= still_to_find) return answer;
   2286   if (budget <= 0) return answer;
   2287   // We are not at start after this node so we set the last argument to 'true'.
   2288   return answer + on_success()->EatsAtLeast(still_to_find - answer,
   2289                                             budget - 1,
   2290                                             true);
   2291 }
   2292 
   2293 
   2294 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
   2295                                               bool not_at_start) {
   2296   if (budget <= 0) return 0;
   2297   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   2298   // afterwards.
   2299   RegExpNode* node = alternatives_->at(1).node();
   2300   return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
   2301 }
   2302 
   2303 
   2304 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
   2305     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
   2306     bool not_at_start) {
   2307   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   2308   // afterwards.
   2309   RegExpNode* node = alternatives_->at(1).node();
   2310   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
   2311 }
   2312 
   2313 
   2314 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
   2315                                   int budget,
   2316                                   RegExpNode* ignore_this_node,
   2317                                   bool not_at_start) {
   2318   if (budget <= 0) return 0;
   2319   int min = 100;
   2320   int choice_count = alternatives_->length();
   2321   budget = (budget - 1) / choice_count;
   2322   for (int i = 0; i < choice_count; i++) {
   2323     RegExpNode* node = alternatives_->at(i).node();
   2324     if (node == ignore_this_node) continue;
   2325     int node_eats_at_least =
   2326         node->EatsAtLeast(still_to_find, budget, not_at_start);
   2327     if (node_eats_at_least < min) min = node_eats_at_least;
   2328     if (min == 0) return 0;
   2329   }
   2330   return min;
   2331 }
   2332 
   2333 
   2334 int LoopChoiceNode::EatsAtLeast(int still_to_find,
   2335                                 int budget,
   2336                                 bool not_at_start) {
   2337   return EatsAtLeastHelper(still_to_find,
   2338                            budget - 1,
   2339                            loop_node_,
   2340                            not_at_start);
   2341 }
   2342 
   2343 
   2344 int ChoiceNode::EatsAtLeast(int still_to_find,
   2345                             int budget,
   2346                             bool not_at_start) {
   2347   return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
   2348 }
   2349 
   2350 
   2351 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
   2352 static inline uint32_t SmearBitsRight(uint32_t v) {
   2353   v |= v >> 1;
   2354   v |= v >> 2;
   2355   v |= v >> 4;
   2356   v |= v >> 8;
   2357   v |= v >> 16;
   2358   return v;
   2359 }
   2360 
   2361 
   2362 bool QuickCheckDetails::Rationalize(bool asc) {
   2363   bool found_useful_op = false;
   2364   uint32_t char_mask;
   2365   if (asc) {
   2366     char_mask = String::kMaxOneByteCharCode;
   2367   } else {
   2368     char_mask = String::kMaxUtf16CodeUnit;
   2369   }
   2370   mask_ = 0;
   2371   value_ = 0;
   2372   int char_shift = 0;
   2373   for (int i = 0; i < characters_; i++) {
   2374     Position* pos = &positions_[i];
   2375     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
   2376       found_useful_op = true;
   2377     }
   2378     mask_ |= (pos->mask & char_mask) << char_shift;
   2379     value_ |= (pos->value & char_mask) << char_shift;
   2380     char_shift += asc ? 8 : 16;
   2381   }
   2382   return found_useful_op;
   2383 }
   2384 
   2385 
   2386 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
   2387                                 Trace* bounds_check_trace,
   2388                                 Trace* trace,
   2389                                 bool preload_has_checked_bounds,
   2390                                 Label* on_possible_success,
   2391                                 QuickCheckDetails* details,
   2392                                 bool fall_through_on_failure) {
   2393   if (details->characters() == 0) return false;
   2394   GetQuickCheckDetails(
   2395       details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
   2396   if (details->cannot_match()) return false;
   2397   if (!details->Rationalize(compiler->one_byte())) return false;
   2398   DCHECK(details->characters() == 1 ||
   2399          compiler->macro_assembler()->CanReadUnaligned());
   2400   uint32_t mask = details->mask();
   2401   uint32_t value = details->value();
   2402 
   2403   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2404 
   2405   if (trace->characters_preloaded() != details->characters()) {
   2406     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
   2407     // We are attempting to preload the minimum number of characters
   2408     // any choice would eat, so if the bounds check fails, then none of the
   2409     // choices can succeed, so we can just immediately backtrack, rather
   2410     // than go to the next choice.
   2411     assembler->LoadCurrentCharacter(trace->cp_offset(),
   2412                                     bounds_check_trace->backtrack(),
   2413                                     !preload_has_checked_bounds,
   2414                                     details->characters());
   2415   }
   2416 
   2417 
   2418   bool need_mask = true;
   2419 
   2420   if (details->characters() == 1) {
   2421     // If number of characters preloaded is 1 then we used a byte or 16 bit
   2422     // load so the value is already masked down.
   2423     uint32_t char_mask;
   2424     if (compiler->one_byte()) {
   2425       char_mask = String::kMaxOneByteCharCode;
   2426     } else {
   2427       char_mask = String::kMaxUtf16CodeUnit;
   2428     }
   2429     if ((mask & char_mask) == char_mask) need_mask = false;
   2430     mask &= char_mask;
   2431   } else {
   2432     // For 2-character preloads in one-byte mode or 1-character preloads in
   2433     // two-byte mode we also use a 16 bit load with zero extend.
   2434     static const uint32_t kTwoByteMask = 0xFFFF;
   2435     static const uint32_t kFourByteMask = 0xFFFFFFFF;
   2436     if (details->characters() == 2 && compiler->one_byte()) {
   2437       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
   2438     } else if (details->characters() == 1 && !compiler->one_byte()) {
   2439       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
   2440     } else {
   2441       if (mask == kFourByteMask) need_mask = false;
   2442     }
   2443   }
   2444 
   2445   if (fall_through_on_failure) {
   2446     if (need_mask) {
   2447       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
   2448     } else {
   2449       assembler->CheckCharacter(value, on_possible_success);
   2450     }
   2451   } else {
   2452     if (need_mask) {
   2453       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
   2454     } else {
   2455       assembler->CheckNotCharacter(value, trace->backtrack());
   2456     }
   2457   }
   2458   return true;
   2459 }
   2460 
   2461 
   2462 // Here is the meat of GetQuickCheckDetails (see also the comment on the
   2463 // super-class in the .h file).
   2464 //
   2465 // We iterate along the text object, building up for each character a
   2466 // mask and value that can be used to test for a quick failure to match.
   2467 // The masks and values for the positions will be combined into a single
   2468 // machine word for the current character width in order to be used in
   2469 // generating a quick check.
   2470 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2471                                     RegExpCompiler* compiler,
   2472                                     int characters_filled_in,
   2473                                     bool not_at_start) {
   2474   // Do not collect any quick check details if the text node reads backward,
   2475   // since it reads in the opposite direction than we use for quick checks.
   2476   if (read_backward()) return;
   2477   Isolate* isolate = compiler->macro_assembler()->isolate();
   2478   DCHECK(characters_filled_in < details->characters());
   2479   int characters = details->characters();
   2480   int char_mask;
   2481   if (compiler->one_byte()) {
   2482     char_mask = String::kMaxOneByteCharCode;
   2483   } else {
   2484     char_mask = String::kMaxUtf16CodeUnit;
   2485   }
   2486   for (int k = 0; k < elements()->length(); k++) {
   2487     TextElement elm = elements()->at(k);
   2488     if (elm.text_type() == TextElement::ATOM) {
   2489       Vector<const uc16> quarks = elm.atom()->data();
   2490       for (int i = 0; i < characters && i < quarks.length(); i++) {
   2491         QuickCheckDetails::Position* pos =
   2492             details->positions(characters_filled_in);
   2493         uc16 c = quarks[i];
   2494         if (elm.atom()->ignore_case()) {
   2495           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   2496           int length = GetCaseIndependentLetters(isolate, c,
   2497                                                  compiler->one_byte(), chars);
   2498           if (length == 0) {
   2499             // This can happen because all case variants are non-Latin1, but we
   2500             // know the input is Latin1.
   2501             details->set_cannot_match();
   2502             pos->determines_perfectly = false;
   2503             return;
   2504           }
   2505           if (length == 1) {
   2506             // This letter has no case equivalents, so it's nice and simple
   2507             // and the mask-compare will determine definitely whether we have
   2508             // a match at this character position.
   2509             pos->mask = char_mask;
   2510             pos->value = c;
   2511             pos->determines_perfectly = true;
   2512           } else {
   2513             uint32_t common_bits = char_mask;
   2514             uint32_t bits = chars[0];
   2515             for (int j = 1; j < length; j++) {
   2516               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
   2517               common_bits ^= differing_bits;
   2518               bits &= common_bits;
   2519             }
   2520             // If length is 2 and common bits has only one zero in it then
   2521             // our mask and compare instruction will determine definitely
   2522             // whether we have a match at this character position.  Otherwise
   2523             // it can only be an approximate check.
   2524             uint32_t one_zero = (common_bits | ~char_mask);
   2525             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
   2526               pos->determines_perfectly = true;
   2527             }
   2528             pos->mask = common_bits;
   2529             pos->value = bits;
   2530           }
   2531         } else {
   2532           // Don't ignore case.  Nice simple case where the mask-compare will
   2533           // determine definitely whether we have a match at this character
   2534           // position.
   2535           if (c > char_mask) {
   2536             details->set_cannot_match();
   2537             pos->determines_perfectly = false;
   2538             return;
   2539           }
   2540           pos->mask = char_mask;
   2541           pos->value = c;
   2542           pos->determines_perfectly = true;
   2543         }
   2544         characters_filled_in++;
   2545         DCHECK(characters_filled_in <= details->characters());
   2546         if (characters_filled_in == details->characters()) {
   2547           return;
   2548         }
   2549       }
   2550     } else {
   2551       QuickCheckDetails::Position* pos =
   2552           details->positions(characters_filled_in);
   2553       RegExpCharacterClass* tree = elm.char_class();
   2554       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
   2555       DCHECK(!ranges->is_empty());
   2556       if (tree->is_negated()) {
   2557         // A quick check uses multi-character mask and compare.  There is no
   2558         // useful way to incorporate a negative char class into this scheme
   2559         // so we just conservatively create a mask and value that will always
   2560         // succeed.
   2561         pos->mask = 0;
   2562         pos->value = 0;
   2563       } else {
   2564         int first_range = 0;
   2565         while (ranges->at(first_range).from() > char_mask) {
   2566           first_range++;
   2567           if (first_range == ranges->length()) {
   2568             details->set_cannot_match();
   2569             pos->determines_perfectly = false;
   2570             return;
   2571           }
   2572         }
   2573         CharacterRange range = ranges->at(first_range);
   2574         uc16 from = range.from();
   2575         uc16 to = range.to();
   2576         if (to > char_mask) {
   2577           to = char_mask;
   2578         }
   2579         uint32_t differing_bits = (from ^ to);
   2580         // A mask and compare is only perfect if the differing bits form a
   2581         // number like 00011111 with one single block of trailing 1s.
   2582         if ((differing_bits & (differing_bits + 1)) == 0 &&
   2583              from + differing_bits == to) {
   2584           pos->determines_perfectly = true;
   2585         }
   2586         uint32_t common_bits = ~SmearBitsRight(differing_bits);
   2587         uint32_t bits = (from & common_bits);
   2588         for (int i = first_range + 1; i < ranges->length(); i++) {
   2589           CharacterRange range = ranges->at(i);
   2590           uc16 from = range.from();
   2591           uc16 to = range.to();
   2592           if (from > char_mask) continue;
   2593           if (to > char_mask) to = char_mask;
   2594           // Here we are combining more ranges into the mask and compare
   2595           // value.  With each new range the mask becomes more sparse and
   2596           // so the chances of a false positive rise.  A character class
   2597           // with multiple ranges is assumed never to be equivalent to a
   2598           // mask and compare operation.
   2599           pos->determines_perfectly = false;
   2600           uint32_t new_common_bits = (from ^ to);
   2601           new_common_bits = ~SmearBitsRight(new_common_bits);
   2602           common_bits &= new_common_bits;
   2603           bits &= new_common_bits;
   2604           uint32_t differing_bits = (from & common_bits) ^ bits;
   2605           common_bits ^= differing_bits;
   2606           bits &= common_bits;
   2607         }
   2608         pos->mask = common_bits;
   2609         pos->value = bits;
   2610       }
   2611       characters_filled_in++;
   2612       DCHECK(characters_filled_in <= details->characters());
   2613       if (characters_filled_in == details->characters()) {
   2614         return;
   2615       }
   2616     }
   2617   }
   2618   DCHECK(characters_filled_in != details->characters());
   2619   if (!details->cannot_match()) {
   2620     on_success()-> GetQuickCheckDetails(details,
   2621                                         compiler,
   2622                                         characters_filled_in,
   2623                                         true);
   2624   }
   2625 }
   2626 
   2627 
   2628 void QuickCheckDetails::Clear() {
   2629   for (int i = 0; i < characters_; i++) {
   2630     positions_[i].mask = 0;
   2631     positions_[i].value = 0;
   2632     positions_[i].determines_perfectly = false;
   2633   }
   2634   characters_ = 0;
   2635 }
   2636 
   2637 
   2638 void QuickCheckDetails::Advance(int by, bool one_byte) {
   2639   if (by >= characters_ || by < 0) {
   2640     DCHECK_IMPLIES(by < 0, characters_ == 0);
   2641     Clear();
   2642     return;
   2643   }
   2644   DCHECK_LE(characters_ - by, 4);
   2645   DCHECK_LE(characters_, 4);
   2646   for (int i = 0; i < characters_ - by; i++) {
   2647     positions_[i] = positions_[by + i];
   2648   }
   2649   for (int i = characters_ - by; i < characters_; i++) {
   2650     positions_[i].mask = 0;
   2651     positions_[i].value = 0;
   2652     positions_[i].determines_perfectly = false;
   2653   }
   2654   characters_ -= by;
   2655   // We could change mask_ and value_ here but we would never advance unless
   2656   // they had already been used in a check and they won't be used again because
   2657   // it would gain us nothing.  So there's no point.
   2658 }
   2659 
   2660 
   2661 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
   2662   DCHECK(characters_ == other->characters_);
   2663   if (other->cannot_match_) {
   2664     return;
   2665   }
   2666   if (cannot_match_) {
   2667     *this = *other;
   2668     return;
   2669   }
   2670   for (int i = from_index; i < characters_; i++) {
   2671     QuickCheckDetails::Position* pos = positions(i);
   2672     QuickCheckDetails::Position* other_pos = other->positions(i);
   2673     if (pos->mask != other_pos->mask ||
   2674         pos->value != other_pos->value ||
   2675         !other_pos->determines_perfectly) {
   2676       // Our mask-compare operation will be approximate unless we have the
   2677       // exact same operation on both sides of the alternation.
   2678       pos->determines_perfectly = false;
   2679     }
   2680     pos->mask &= other_pos->mask;
   2681     pos->value &= pos->mask;
   2682     other_pos->value &= pos->mask;
   2683     uc16 differing_bits = (pos->value ^ other_pos->value);
   2684     pos->mask &= ~differing_bits;
   2685     pos->value &= pos->mask;
   2686   }
   2687 }
   2688 
   2689 
   2690 class VisitMarker {
   2691  public:
   2692   explicit VisitMarker(NodeInfo* info) : info_(info) {
   2693     DCHECK(!info->visited);
   2694     info->visited = true;
   2695   }
   2696   ~VisitMarker() {
   2697     info_->visited = false;
   2698   }
   2699  private:
   2700   NodeInfo* info_;
   2701 };
   2702 
   2703 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
   2704   if (info()->replacement_calculated) return replacement();
   2705   if (depth < 0) return this;
   2706   DCHECK(!info()->visited);
   2707   VisitMarker marker(info());
   2708   return FilterSuccessor(depth - 1);
   2709 }
   2710 
   2711 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
   2712   RegExpNode* next = on_success_->FilterOneByte(depth - 1);
   2713   if (next == nullptr) return set_replacement(nullptr);
   2714   on_success_ = next;
   2715   return set_replacement(this);
   2716 }
   2717 
   2718 // We need to check for the following characters: 0x39C 0x3BC 0x178.
   2719 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
   2720   // TODO(dcarney): this could be a lot more efficient.
   2721   return range.Contains(0x039C) || range.Contains(0x03BC) ||
   2722          range.Contains(0x0178);
   2723 }
   2724 
   2725 
   2726 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
   2727   for (int i = 0; i < ranges->length(); i++) {
   2728     // TODO(dcarney): this could be a lot more efficient.
   2729     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
   2730   }
   2731   return false;
   2732 }
   2733 
   2734 RegExpNode* TextNode::FilterOneByte(int depth) {
   2735   if (info()->replacement_calculated) return replacement();
   2736   if (depth < 0) return this;
   2737   DCHECK(!info()->visited);
   2738   VisitMarker marker(info());
   2739   int element_count = elements()->length();
   2740   for (int i = 0; i < element_count; i++) {
   2741     TextElement elm = elements()->at(i);
   2742     if (elm.text_type() == TextElement::ATOM) {
   2743       Vector<const uc16> quarks = elm.atom()->data();
   2744       for (int j = 0; j < quarks.length(); j++) {
   2745         uint16_t c = quarks[j];
   2746         if (elm.atom()->ignore_case()) {
   2747           c = unibrow::Latin1::TryConvertToLatin1(c);
   2748         }
   2749         if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
   2750         // Replace quark in case we converted to Latin-1.
   2751         uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
   2752         writable_quarks[j] = c;
   2753       }
   2754     } else {
   2755       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
   2756       RegExpCharacterClass* cc = elm.char_class();
   2757       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
   2758       CharacterRange::Canonicalize(ranges);
   2759       // Now they are in order so we only need to look at the first.
   2760       int range_count = ranges->length();
   2761       if (cc->is_negated()) {
   2762         if (range_count != 0 &&
   2763             ranges->at(0).from() == 0 &&
   2764             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
   2765           // This will be handled in a later filter.
   2766           if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
   2767             continue;
   2768           return set_replacement(nullptr);
   2769         }
   2770       } else {
   2771         if (range_count == 0 ||
   2772             ranges->at(0).from() > String::kMaxOneByteCharCode) {
   2773           // This will be handled in a later filter.
   2774           if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
   2775             continue;
   2776           return set_replacement(nullptr);
   2777         }
   2778       }
   2779     }
   2780   }
   2781   return FilterSuccessor(depth - 1);
   2782 }
   2783 
   2784 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
   2785   if (info()->replacement_calculated) return replacement();
   2786   if (depth < 0) return this;
   2787   if (info()->visited) return this;
   2788   {
   2789     VisitMarker marker(info());
   2790 
   2791     RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
   2792     // If we can't continue after the loop then there is no sense in doing the
   2793     // loop.
   2794     if (continue_replacement == nullptr) return set_replacement(nullptr);
   2795   }
   2796 
   2797   return ChoiceNode::FilterOneByte(depth - 1);
   2798 }
   2799 
   2800 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
   2801   if (info()->replacement_calculated) return replacement();
   2802   if (depth < 0) return this;
   2803   if (info()->visited) return this;
   2804   VisitMarker marker(info());
   2805   int choice_count = alternatives_->length();
   2806 
   2807   for (int i = 0; i < choice_count; i++) {
   2808     GuardedAlternative alternative = alternatives_->at(i);
   2809     if (alternative.guards() != nullptr &&
   2810         alternative.guards()->length() != 0) {
   2811       set_replacement(this);
   2812       return this;
   2813     }
   2814   }
   2815 
   2816   int surviving = 0;
   2817   RegExpNode* survivor = nullptr;
   2818   for (int i = 0; i < choice_count; i++) {
   2819     GuardedAlternative alternative = alternatives_->at(i);
   2820     RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
   2821     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
   2822     if (replacement != nullptr) {
   2823       alternatives_->at(i).set_node(replacement);
   2824       surviving++;
   2825       survivor = replacement;
   2826     }
   2827   }
   2828   if (surviving < 2) return set_replacement(survivor);
   2829 
   2830   set_replacement(this);
   2831   if (surviving == choice_count) {
   2832     return this;
   2833   }
   2834   // Only some of the nodes survived the filtering.  We need to rebuild the
   2835   // alternatives list.
   2836   ZoneList<GuardedAlternative>* new_alternatives =
   2837       new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
   2838   for (int i = 0; i < choice_count; i++) {
   2839     RegExpNode* replacement =
   2840         alternatives_->at(i).node()->FilterOneByte(depth - 1);
   2841     if (replacement != nullptr) {
   2842       alternatives_->at(i).set_node(replacement);
   2843       new_alternatives->Add(alternatives_->at(i), zone());
   2844     }
   2845   }
   2846   alternatives_ = new_alternatives;
   2847   return this;
   2848 }
   2849 
   2850 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
   2851   if (info()->replacement_calculated) return replacement();
   2852   if (depth < 0) return this;
   2853   if (info()->visited) return this;
   2854   VisitMarker marker(info());
   2855   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   2856   // afterwards.
   2857   RegExpNode* node = alternatives_->at(1).node();
   2858   RegExpNode* replacement = node->FilterOneByte(depth - 1);
   2859   if (replacement == nullptr) return set_replacement(nullptr);
   2860   alternatives_->at(1).set_node(replacement);
   2861 
   2862   RegExpNode* neg_node = alternatives_->at(0).node();
   2863   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
   2864   // If the negative lookahead is always going to fail then
   2865   // we don't need to check it.
   2866   if (neg_replacement == nullptr) return set_replacement(replacement);
   2867   alternatives_->at(0).set_node(neg_replacement);
   2868   return set_replacement(this);
   2869 }
   2870 
   2871 
   2872 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2873                                           RegExpCompiler* compiler,
   2874                                           int characters_filled_in,
   2875                                           bool not_at_start) {
   2876   if (body_can_be_zero_length_ || info()->visited) return;
   2877   VisitMarker marker(info());
   2878   return ChoiceNode::GetQuickCheckDetails(details,
   2879                                           compiler,
   2880                                           characters_filled_in,
   2881                                           not_at_start);
   2882 }
   2883 
   2884 
   2885 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   2886                                   BoyerMooreLookahead* bm, bool not_at_start) {
   2887   if (body_can_be_zero_length_ || budget <= 0) {
   2888     bm->SetRest(offset);
   2889     SaveBMInfo(bm, not_at_start, offset);
   2890     return;
   2891   }
   2892   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
   2893   SaveBMInfo(bm, not_at_start, offset);
   2894 }
   2895 
   2896 
   2897 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2898                                       RegExpCompiler* compiler,
   2899                                       int characters_filled_in,
   2900                                       bool not_at_start) {
   2901   not_at_start = (not_at_start || not_at_start_);
   2902   int choice_count = alternatives_->length();
   2903   DCHECK_LT(0, choice_count);
   2904   alternatives_->at(0).node()->GetQuickCheckDetails(details,
   2905                                                     compiler,
   2906                                                     characters_filled_in,
   2907                                                     not_at_start);
   2908   for (int i = 1; i < choice_count; i++) {
   2909     QuickCheckDetails new_details(details->characters());
   2910     RegExpNode* node = alternatives_->at(i).node();
   2911     node->GetQuickCheckDetails(&new_details, compiler,
   2912                                characters_filled_in,
   2913                                not_at_start);
   2914     // Here we merge the quick match details of the two branches.
   2915     details->Merge(&new_details, characters_filled_in);
   2916   }
   2917 }
   2918 
   2919 
   2920 // Check for [0-9A-Z_a-z].
   2921 static void EmitWordCheck(RegExpMacroAssembler* assembler,
   2922                           Label* word,
   2923                           Label* non_word,
   2924                           bool fall_through_on_word) {
   2925   if (assembler->CheckSpecialCharacterClass(
   2926           fall_through_on_word ? 'w' : 'W',
   2927           fall_through_on_word ? non_word : word)) {
   2928     // Optimized implementation available.
   2929     return;
   2930   }
   2931   assembler->CheckCharacterGT('z', non_word);
   2932   assembler->CheckCharacterLT('0', non_word);
   2933   assembler->CheckCharacterGT('a' - 1, word);
   2934   assembler->CheckCharacterLT('9' + 1, word);
   2935   assembler->CheckCharacterLT('A', non_word);
   2936   assembler->CheckCharacterLT('Z' + 1, word);
   2937   if (fall_through_on_word) {
   2938     assembler->CheckNotCharacter('_', non_word);
   2939   } else {
   2940     assembler->CheckCharacter('_', word);
   2941   }
   2942 }
   2943 
   2944 
   2945 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
   2946 // that matches newline or the start of input).
   2947 static void EmitHat(RegExpCompiler* compiler,
   2948                     RegExpNode* on_success,
   2949                     Trace* trace) {
   2950   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2951   // We will be loading the previous character into the current character
   2952   // register.
   2953   Trace new_trace(*trace);
   2954   new_trace.InvalidateCurrentCharacter();
   2955 
   2956   Label ok;
   2957   if (new_trace.cp_offset() == 0) {
   2958     // The start of input counts as a newline in this context, so skip to
   2959     // ok if we are at the start.
   2960     assembler->CheckAtStart(&ok);
   2961   }
   2962   // We already checked that we are not at the start of input so it must be
   2963   // OK to load the previous character.
   2964   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
   2965                                   new_trace.backtrack(),
   2966                                   false);
   2967   if (!assembler->CheckSpecialCharacterClass('n',
   2968                                              new_trace.backtrack())) {
   2969     // Newline means \n, \r, 0x2028 or 0x2029.
   2970     if (!compiler->one_byte()) {
   2971       assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
   2972     }
   2973     assembler->CheckCharacter('\n', &ok);
   2974     assembler->CheckNotCharacter('\r', new_trace.backtrack());
   2975   }
   2976   assembler->Bind(&ok);
   2977   on_success->Emit(compiler, &new_trace);
   2978 }
   2979 
   2980 
   2981 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
   2982 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
   2983   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2984   Isolate* isolate = assembler->isolate();
   2985   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
   2986   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
   2987   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
   2988   if (lookahead == nullptr) {
   2989     int eats_at_least =
   2990         Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
   2991                                                     kRecursionBudget,
   2992                                                     not_at_start));
   2993     if (eats_at_least >= 1) {
   2994       BoyerMooreLookahead* bm =
   2995           new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
   2996       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
   2997       if (bm->at(0)->is_non_word())
   2998         next_is_word_character = Trace::FALSE_VALUE;
   2999       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
   3000     }
   3001   } else {
   3002     if (lookahead->at(0)->is_non_word())
   3003       next_is_word_character = Trace::FALSE_VALUE;
   3004     if (lookahead->at(0)->is_word())
   3005       next_is_word_character = Trace::TRUE_VALUE;
   3006   }
   3007   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
   3008   if (next_is_word_character == Trace::UNKNOWN) {
   3009     Label before_non_word;
   3010     Label before_word;
   3011     if (trace->characters_preloaded() != 1) {
   3012       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
   3013     }
   3014     // Fall through on non-word.
   3015     EmitWordCheck(assembler, &before_word, &before_non_word, false);
   3016     // Next character is not a word character.
   3017     assembler->Bind(&before_non_word);
   3018     Label ok;
   3019     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
   3020     assembler->GoTo(&ok);
   3021 
   3022     assembler->Bind(&before_word);
   3023     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
   3024     assembler->Bind(&ok);
   3025   } else if (next_is_word_character == Trace::TRUE_VALUE) {
   3026     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
   3027   } else {
   3028     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
   3029     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
   3030   }
   3031 }
   3032 
   3033 
   3034 void AssertionNode::BacktrackIfPrevious(
   3035     RegExpCompiler* compiler,
   3036     Trace* trace,
   3037     AssertionNode::IfPrevious backtrack_if_previous) {
   3038   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3039   Trace new_trace(*trace);
   3040   new_trace.InvalidateCurrentCharacter();
   3041 
   3042   Label fall_through, dummy;
   3043 
   3044   Label* non_word = backtrack_if_previous == kIsNonWord ?
   3045                     new_trace.backtrack() :
   3046                     &fall_through;
   3047   Label* word = backtrack_if_previous == kIsNonWord ?
   3048                 &fall_through :
   3049                 new_trace.backtrack();
   3050 
   3051   if (new_trace.cp_offset() == 0) {
   3052     // The start of input counts as a non-word character, so the question is
   3053     // decided if we are at the start.
   3054     assembler->CheckAtStart(non_word);
   3055   }
   3056   // We already checked that we are not at the start of input so it must be
   3057   // OK to load the previous character.
   3058   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
   3059   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
   3060 
   3061   assembler->Bind(&fall_through);
   3062   on_success()->Emit(compiler, &new_trace);
   3063 }
   3064 
   3065 
   3066 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
   3067                                          RegExpCompiler* compiler,
   3068                                          int filled_in,
   3069                                          bool not_at_start) {
   3070   if (assertion_type_ == AT_START && not_at_start) {
   3071     details->set_cannot_match();
   3072     return;
   3073   }
   3074   return on_success()->GetQuickCheckDetails(details,
   3075                                             compiler,
   3076                                             filled_in,
   3077                                             not_at_start);
   3078 }
   3079 
   3080 
   3081 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3082   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3083   switch (assertion_type_) {
   3084     case AT_END: {
   3085       Label ok;
   3086       assembler->CheckPosition(trace->cp_offset(), &ok);
   3087       assembler->GoTo(trace->backtrack());
   3088       assembler->Bind(&ok);
   3089       break;
   3090     }
   3091     case AT_START: {
   3092       if (trace->at_start() == Trace::FALSE_VALUE) {
   3093         assembler->GoTo(trace->backtrack());
   3094         return;
   3095       }
   3096       if (trace->at_start() == Trace::UNKNOWN) {
   3097         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
   3098         Trace at_start_trace = *trace;
   3099         at_start_trace.set_at_start(Trace::TRUE_VALUE);
   3100         on_success()->Emit(compiler, &at_start_trace);
   3101         return;
   3102       }
   3103     }
   3104     break;
   3105     case AFTER_NEWLINE:
   3106       EmitHat(compiler, on_success(), trace);
   3107       return;
   3108     case AT_BOUNDARY:
   3109     case AT_NON_BOUNDARY: {
   3110       EmitBoundaryCheck(compiler, trace);
   3111       return;
   3112     }
   3113   }
   3114   on_success()->Emit(compiler, trace);
   3115 }
   3116 
   3117 
   3118 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
   3119   if (quick_check == nullptr) return false;
   3120   if (offset >= quick_check->characters()) return false;
   3121   return quick_check->positions(offset)->determines_perfectly;
   3122 }
   3123 
   3124 
   3125 static void UpdateBoundsCheck(int index, int* checked_up_to) {
   3126   if (index > *checked_up_to) {
   3127     *checked_up_to = index;
   3128   }
   3129 }
   3130 
   3131 
   3132 // We call this repeatedly to generate code for each pass over the text node.
   3133 // The passes are in increasing order of difficulty because we hope one
   3134 // of the first passes will fail in which case we are saved the work of the
   3135 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
   3136 // we will check the '%' in the first pass, the case independent 'a' in the
   3137 // second pass and the character class in the last pass.
   3138 //
   3139 // The passes are done from right to left, so for example to test for /bar/
   3140 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
   3141 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
   3142 // bounds check most of the time.  In the example we only need to check for
   3143 // end-of-input when loading the putative 'r'.
   3144 //
   3145 // A slight complication involves the fact that the first character may already
   3146 // be fetched into a register by the previous node.  In this case we want to
   3147 // do the test for that character first.  We do this in separate passes.  The
   3148 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
   3149 // pass has been performed then subsequent passes will have true in
   3150 // first_element_checked to indicate that that character does not need to be
   3151 // checked again.
   3152 //
   3153 // In addition to all this we are passed a Trace, which can
   3154 // contain an AlternativeGeneration object.  In this AlternativeGeneration
   3155 // object we can see details of any quick check that was already passed in
   3156 // order to get to the code we are now generating.  The quick check can involve
   3157 // loading characters, which means we do not need to recheck the bounds
   3158 // up to the limit the quick check already checked.  In addition the quick
   3159 // check can have involved a mask and compare operation which may simplify
   3160 // or obviate the need for further checks at some character positions.
   3161 void TextNode::TextEmitPass(RegExpCompiler* compiler,
   3162                             TextEmitPassType pass,
   3163                             bool preloaded,
   3164                             Trace* trace,
   3165                             bool first_element_checked,
   3166                             int* checked_up_to) {
   3167   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3168   Isolate* isolate = assembler->isolate();
   3169   bool one_byte = compiler->one_byte();
   3170   Label* backtrack = trace->backtrack();
   3171   QuickCheckDetails* quick_check = trace->quick_check_performed();
   3172   int element_count = elements()->length();
   3173   int backward_offset = read_backward() ? -Length() : 0;
   3174   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
   3175     TextElement elm = elements()->at(i);
   3176     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
   3177     if (elm.text_type() == TextElement::ATOM) {
   3178       if (SkipPass(pass, elm.atom()->ignore_case())) continue;
   3179       Vector<const uc16> quarks = elm.atom()->data();
   3180       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
   3181         if (first_element_checked && i == 0 && j == 0) continue;
   3182         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
   3183         EmitCharacterFunction* emit_function = nullptr;
   3184         uc16 quark = quarks[j];
   3185         if (elm.atom()->ignore_case()) {
   3186           // Everywhere else we assume that a non-Latin-1 character cannot match
   3187           // a Latin-1 character. Avoid the cases where this is assumption is
   3188           // invalid by using the Latin1 equivalent instead.
   3189           quark = unibrow::Latin1::TryConvertToLatin1(quark);
   3190         }
   3191         switch (pass) {
   3192           case NON_LATIN1_MATCH:
   3193             DCHECK(one_byte);
   3194             if (quark > String::kMaxOneByteCharCode) {
   3195               assembler->GoTo(backtrack);
   3196               return;
   3197             }
   3198             break;
   3199           case NON_LETTER_CHARACTER_MATCH:
   3200             emit_function = &EmitAtomNonLetter;
   3201             break;
   3202           case SIMPLE_CHARACTER_MATCH:
   3203             emit_function = &EmitSimpleCharacter;
   3204             break;
   3205           case CASE_CHARACTER_MATCH:
   3206             emit_function = &EmitAtomLetter;
   3207             break;
   3208           default:
   3209             break;
   3210         }
   3211         if (emit_function != nullptr) {
   3212           bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
   3213           bool bound_checked =
   3214               emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
   3215                             bounds_check, preloaded);
   3216           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
   3217         }
   3218       }
   3219     } else {
   3220       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
   3221       if (pass == CHARACTER_CLASS_MATCH) {
   3222         if (first_element_checked && i == 0) continue;
   3223         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
   3224         RegExpCharacterClass* cc = elm.char_class();
   3225         bool bounds_check = *checked_up_to < cp_offset || read_backward();
   3226         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
   3227                       bounds_check, preloaded, zone());
   3228         UpdateBoundsCheck(cp_offset, checked_up_to);
   3229       }
   3230     }
   3231   }
   3232 }
   3233 
   3234 
   3235 int TextNode::Length() {
   3236   TextElement elm = elements()->last();
   3237   DCHECK_LE(0, elm.cp_offset());
   3238   return elm.cp_offset() + elm.length();
   3239 }
   3240 
   3241 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
   3242   if (ignore_case) {
   3243     return pass == SIMPLE_CHARACTER_MATCH;
   3244   } else {
   3245     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
   3246   }
   3247 }
   3248 
   3249 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
   3250                                              ZoneList<CharacterRange>* ranges,
   3251                                              bool read_backward,
   3252                                              RegExpNode* on_success,
   3253                                              JSRegExp::Flags flags) {
   3254   DCHECK_NOT_NULL(ranges);
   3255   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
   3256   elms->Add(TextElement::CharClass(
   3257                 new (zone) RegExpCharacterClass(zone, ranges, flags)),
   3258             zone);
   3259   return new (zone) TextNode(elms, read_backward, on_success);
   3260 }
   3261 
   3262 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
   3263                                            CharacterRange trail,
   3264                                            bool read_backward,
   3265                                            RegExpNode* on_success,
   3266                                            JSRegExp::Flags flags) {
   3267   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
   3268   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
   3269   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
   3270   elms->Add(TextElement::CharClass(
   3271                 new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
   3272             zone);
   3273   elms->Add(TextElement::CharClass(
   3274                 new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
   3275             zone);
   3276   return new (zone) TextNode(elms, read_backward, on_success);
   3277 }
   3278 
   3279 
   3280 // This generates the code to match a text node.  A text node can contain
   3281 // straight character sequences (possibly to be matched in a case-independent
   3282 // way) and character classes.  For efficiency we do not do this in a single
   3283 // pass from left to right.  Instead we pass over the text node several times,
   3284 // emitting code for some character positions every time.  See the comment on
   3285 // TextEmitPass for details.
   3286 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3287   LimitResult limit_result = LimitVersions(compiler, trace);
   3288   if (limit_result == DONE) return;
   3289   DCHECK(limit_result == CONTINUE);
   3290 
   3291   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
   3292     compiler->SetRegExpTooBig();
   3293     return;
   3294   }
   3295 
   3296   if (compiler->one_byte()) {
   3297     int dummy = 0;
   3298     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
   3299   }
   3300 
   3301   bool first_elt_done = false;
   3302   int bound_checked_to = trace->cp_offset() - 1;
   3303   bound_checked_to += trace->bound_checked_up_to();
   3304 
   3305   // If a character is preloaded into the current character register then
   3306   // check that now.
   3307   if (trace->characters_preloaded() == 1) {
   3308     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   3309       TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
   3310                    false, &bound_checked_to);
   3311     }
   3312     first_elt_done = true;
   3313   }
   3314 
   3315   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   3316     TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
   3317                  first_elt_done, &bound_checked_to);
   3318   }
   3319 
   3320   Trace successor_trace(*trace);
   3321   // If we advance backward, we may end up at the start.
   3322   successor_trace.AdvanceCurrentPositionInTrace(
   3323       read_backward() ? -Length() : Length(), compiler);
   3324   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
   3325                                                : Trace::FALSE_VALUE);
   3326   RecursionCheck rc(compiler);
   3327   on_success()->Emit(compiler, &successor_trace);
   3328 }
   3329 
   3330 
   3331 void Trace::InvalidateCurrentCharacter() {
   3332   characters_preloaded_ = 0;
   3333 }
   3334 
   3335 
   3336 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
   3337   // We don't have an instruction for shifting the current character register
   3338   // down or for using a shifted value for anything so lets just forget that
   3339   // we preloaded any characters into it.
   3340   characters_preloaded_ = 0;
   3341   // Adjust the offsets of the quick check performed information.  This
   3342   // information is used to find out what we already determined about the
   3343   // characters by means of mask and compare.
   3344   quick_check_performed_.Advance(by, compiler->one_byte());
   3345   cp_offset_ += by;
   3346   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
   3347     compiler->SetRegExpTooBig();
   3348     cp_offset_ = 0;
   3349   }
   3350   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
   3351 }
   3352 
   3353 
   3354 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
   3355   int element_count = elements()->length();
   3356   for (int i = 0; i < element_count; i++) {
   3357     TextElement elm = elements()->at(i);
   3358     if (elm.text_type() == TextElement::CHAR_CLASS) {
   3359       RegExpCharacterClass* cc = elm.char_class();
   3360 #ifdef V8_INTL_SUPPORT
   3361       bool case_equivalents_already_added =
   3362           NeedsUnicodeCaseEquivalents(cc->flags());
   3363 #else
   3364       bool case_equivalents_already_added = false;
   3365 #endif
   3366       if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
   3367         // None of the standard character classes is different in the case
   3368         // independent case and it slows us down if we don't know that.
   3369         if (cc->is_standard(zone())) continue;
   3370         ZoneList<CharacterRange>* ranges = cc->ranges(zone());
   3371         CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
   3372                                            is_one_byte);
   3373       }
   3374     }
   3375   }
   3376 }
   3377 
   3378 
   3379 int TextNode::GreedyLoopTextLength() { return Length(); }
   3380 
   3381 
   3382 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
   3383     RegExpCompiler* compiler) {
   3384   if (read_backward()) return nullptr;
   3385   if (elements()->length() != 1) return nullptr;
   3386   TextElement elm = elements()->at(0);
   3387   if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
   3388   RegExpCharacterClass* node = elm.char_class();
   3389   ZoneList<CharacterRange>* ranges = node->ranges(zone());
   3390   CharacterRange::Canonicalize(ranges);
   3391   if (node->is_negated()) {
   3392     return ranges->length() == 0 ? on_success() : nullptr;
   3393   }
   3394   if (ranges->length() != 1) return nullptr;
   3395   uint32_t max_char;
   3396   if (compiler->one_byte()) {
   3397     max_char = String::kMaxOneByteCharCode;
   3398   } else {
   3399     max_char = String::kMaxUtf16CodeUnit;
   3400   }
   3401   return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
   3402 }
   3403 
   3404 
   3405 // Finds the fixed match length of a sequence of nodes that goes from
   3406 // this alternative and back to this choice node.  If there are variable
   3407 // length nodes or other complications in the way then return a sentinel
   3408 // value indicating that a greedy loop cannot be constructed.
   3409 int ChoiceNode::GreedyLoopTextLengthForAlternative(
   3410     GuardedAlternative* alternative) {
   3411   int length = 0;
   3412   RegExpNode* node = alternative->node();
   3413   // Later we will generate code for all these text nodes using recursion
   3414   // so we have to limit the max number.
   3415   int recursion_depth = 0;
   3416   while (node != this) {
   3417     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
   3418       return kNodeIsTooComplexForGreedyLoops;
   3419     }
   3420     int node_length = node->GreedyLoopTextLength();
   3421     if (node_length == kNodeIsTooComplexForGreedyLoops) {
   3422       return kNodeIsTooComplexForGreedyLoops;
   3423     }
   3424     length += node_length;
   3425     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
   3426     node = seq_node->on_success();
   3427   }
   3428   return read_backward() ? -length : length;
   3429 }
   3430 
   3431 
   3432 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
   3433   DCHECK_NULL(loop_node_);
   3434   AddAlternative(alt);
   3435   loop_node_ = alt.node();
   3436 }
   3437 
   3438 
   3439 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
   3440   DCHECK_NULL(continue_node_);
   3441   AddAlternative(alt);
   3442   continue_node_ = alt.node();
   3443 }
   3444 
   3445 
   3446 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3447   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3448   if (trace->stop_node() == this) {
   3449     // Back edge of greedy optimized loop node graph.
   3450     int text_length =
   3451         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
   3452     DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
   3453     // Update the counter-based backtracking info on the stack.  This is an
   3454     // optimization for greedy loops (see below).
   3455     DCHECK(trace->cp_offset() == text_length);
   3456     macro_assembler->AdvanceCurrentPosition(text_length);
   3457     macro_assembler->GoTo(trace->loop_label());
   3458     return;
   3459   }
   3460   DCHECK_NULL(trace->stop_node());
   3461   if (!trace->is_trivial()) {
   3462     trace->Flush(compiler, this);
   3463     return;
   3464   }
   3465   ChoiceNode::Emit(compiler, trace);
   3466 }
   3467 
   3468 
   3469 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
   3470                                            int eats_at_least) {
   3471   int preload_characters = Min(4, eats_at_least);
   3472   DCHECK_LE(preload_characters, 4);
   3473   if (compiler->macro_assembler()->CanReadUnaligned()) {
   3474     bool one_byte = compiler->one_byte();
   3475     if (one_byte) {
   3476       // We can't preload 3 characters because there is no machine instruction
   3477       // to do that.  We can't just load 4 because we could be reading
   3478       // beyond the end of the string, which could cause a memory fault.
   3479       if (preload_characters == 3) preload_characters = 2;
   3480     } else {
   3481       if (preload_characters > 2) preload_characters = 2;
   3482     }
   3483   } else {
   3484     if (preload_characters > 1) preload_characters = 1;
   3485   }
   3486   return preload_characters;
   3487 }
   3488 
   3489 
   3490 // This class is used when generating the alternatives in a choice node.  It
   3491 // records the way the alternative is being code generated.
   3492 class AlternativeGeneration: public Malloced {
   3493  public:
   3494   AlternativeGeneration()
   3495       : possible_success(),
   3496         expects_preload(false),
   3497         after(),
   3498         quick_check_details() { }
   3499   Label possible_success;
   3500   bool expects_preload;
   3501   Label after;
   3502   QuickCheckDetails quick_check_details;
   3503 };
   3504 
   3505 
   3506 // Creates a list of AlternativeGenerations.  If the list has a reasonable
   3507 // size then it is on the stack, otherwise the excess is on the heap.
   3508 class AlternativeGenerationList {
   3509  public:
   3510   AlternativeGenerationList(int count, Zone* zone)
   3511       : alt_gens_(count, zone) {
   3512     for (int i = 0; i < count && i < kAFew; i++) {
   3513       alt_gens_.Add(a_few_alt_gens_ + i, zone);
   3514     }
   3515     for (int i = kAFew; i < count; i++) {
   3516       alt_gens_.Add(new AlternativeGeneration(), zone);
   3517     }
   3518   }
   3519   ~AlternativeGenerationList() {
   3520     for (int i = kAFew; i < alt_gens_.length(); i++) {
   3521       delete alt_gens_[i];
   3522       alt_gens_[i] = nullptr;
   3523     }
   3524   }
   3525 
   3526   AlternativeGeneration* at(int i) {
   3527     return alt_gens_[i];
   3528   }
   3529 
   3530  private:
   3531   static const int kAFew = 10;
   3532   ZoneList<AlternativeGeneration*> alt_gens_;
   3533   AlternativeGeneration a_few_alt_gens_[kAFew];
   3534 };
   3535 
   3536 
   3537 static const uc32 kRangeEndMarker = 0x110000;
   3538 
   3539 // The '2' variant is has inclusive from and exclusive to.
   3540 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
   3541 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
   3542 static const int kSpaceRanges[] = {
   3543     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
   3544     0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
   3545     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
   3546 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
   3547 
   3548 static const int kWordRanges[] = {
   3549     '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
   3550 static const int kWordRangeCount = arraysize(kWordRanges);
   3551 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
   3552 static const int kDigitRangeCount = arraysize(kDigitRanges);
   3553 static const int kSurrogateRanges[] = {
   3554     kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
   3555 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
   3556 static const int kLineTerminatorRanges[] = {
   3557     0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
   3558 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
   3559 
   3560 void BoyerMoorePositionInfo::Set(int character) {
   3561   SetInterval(Interval(character, character));
   3562 }
   3563 
   3564 
   3565 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
   3566   s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
   3567   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
   3568   d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
   3569   surrogate_ =
   3570       AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
   3571   if (interval.to() - interval.from() >= kMapSize - 1) {
   3572     if (map_count_ != kMapSize) {
   3573       map_count_ = kMapSize;
   3574       for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
   3575     }
   3576     return;
   3577   }
   3578   for (int i = interval.from(); i <= interval.to(); i++) {
   3579     int mod_character = (i & kMask);
   3580     if (!map_->at(mod_character)) {
   3581       map_count_++;
   3582       map_->at(mod_character) = true;
   3583     }
   3584     if (map_count_ == kMapSize) return;
   3585   }
   3586 }
   3587 
   3588 
   3589 void BoyerMoorePositionInfo::SetAll() {
   3590   s_ = w_ = d_ = kLatticeUnknown;
   3591   if (map_count_ != kMapSize) {
   3592     map_count_ = kMapSize;
   3593     for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
   3594   }
   3595 }
   3596 
   3597 
   3598 BoyerMooreLookahead::BoyerMooreLookahead(
   3599     int length, RegExpCompiler* compiler, Zone* zone)
   3600     : length_(length),
   3601       compiler_(compiler) {
   3602   if (compiler->one_byte()) {
   3603     max_char_ = String::kMaxOneByteCharCode;
   3604   } else {
   3605     max_char_ = String::kMaxUtf16CodeUnit;
   3606   }
   3607   bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
   3608   for (int i = 0; i < length; i++) {
   3609     bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
   3610   }
   3611 }
   3612 
   3613 
   3614 // Find the longest range of lookahead that has the fewest number of different
   3615 // characters that can occur at a given position.  Since we are optimizing two
   3616 // different parameters at once this is a tradeoff.
   3617 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
   3618   int biggest_points = 0;
   3619   // If more than 32 characters out of 128 can occur it is unlikely that we can
   3620   // be lucky enough to step forwards much of the time.
   3621   const int kMaxMax = 32;
   3622   for (int max_number_of_chars = 4;
   3623        max_number_of_chars < kMaxMax;
   3624        max_number_of_chars *= 2) {
   3625     biggest_points =
   3626         FindBestInterval(max_number_of_chars, biggest_points, from, to);
   3627   }
   3628   if (biggest_points == 0) return false;
   3629   return true;
   3630 }
   3631 
   3632 
   3633 // Find the highest-points range between 0 and length_ where the character
   3634 // information is not too vague.  'Too vague' means that there are more than
   3635 // max_number_of_chars that can occur at this position.  Calculates the number
   3636 // of points as the product of width-of-the-range and
   3637 // probability-of-finding-one-of-the-characters, where the probability is
   3638 // calculated using the frequency distribution of the sample subject string.
   3639 int BoyerMooreLookahead::FindBestInterval(
   3640     int max_number_of_chars, int old_biggest_points, int* from, int* to) {
   3641   int biggest_points = old_biggest_points;
   3642   static const int kSize = RegExpMacroAssembler::kTableSize;
   3643   for (int i = 0; i < length_; ) {
   3644     while (i < length_ && Count(i) > max_number_of_chars) i++;
   3645     if (i == length_) break;
   3646     int remembered_from = i;
   3647     bool union_map[kSize];
   3648     for (int j = 0; j < kSize; j++) union_map[j] = false;
   3649     while (i < length_ && Count(i) <= max_number_of_chars) {
   3650       BoyerMoorePositionInfo* map = bitmaps_->at(i);
   3651       for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
   3652       i++;
   3653     }
   3654     int frequency = 0;
   3655     for (int j = 0; j < kSize; j++) {
   3656       if (union_map[j]) {
   3657         // Add 1 to the frequency to give a small per-character boost for
   3658         // the cases where our sampling is not good enough and many
   3659         // characters have a frequency of zero.  This means the frequency
   3660         // can theoretically be up to 2*kSize though we treat it mostly as
   3661         // a fraction of kSize.
   3662         frequency += compiler_->frequency_collator()->Frequency(j) + 1;
   3663       }
   3664     }
   3665     // We use the probability of skipping times the distance we are skipping to
   3666     // judge the effectiveness of this.  Actually we have a cut-off:  By
   3667     // dividing by 2 we switch off the skipping if the probability of skipping
   3668     // is less than 50%.  This is because the multibyte mask-and-compare
   3669     // skipping in quickcheck is more likely to do well on this case.
   3670     bool in_quickcheck_range =
   3671         ((i - remembered_from < 4) ||
   3672          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
   3673     // Called 'probability' but it is only a rough estimate and can actually
   3674     // be outside the 0-kSize range.
   3675     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
   3676     int points = (i - remembered_from) * probability;
   3677     if (points > biggest_points) {
   3678       *from = remembered_from;
   3679       *to = i - 1;
   3680       biggest_points = points;
   3681     }
   3682   }
   3683   return biggest_points;
   3684 }
   3685 
   3686 
   3687 // Take all the characters that will not prevent a successful match if they
   3688 // occur in the subject string in the range between min_lookahead and
   3689 // max_lookahead (inclusive) measured from the current position.  If the
   3690 // character at max_lookahead offset is not one of these characters, then we
   3691 // can safely skip forwards by the number of characters in the range.
   3692 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
   3693                                       int max_lookahead,
   3694                                       Handle<ByteArray> boolean_skip_table) {
   3695   const int kSize = RegExpMacroAssembler::kTableSize;
   3696 
   3697   const int kSkipArrayEntry = 0;
   3698   const int kDontSkipArrayEntry = 1;
   3699 
   3700   for (int i = 0; i < kSize; i++) {
   3701     boolean_skip_table->set(i, kSkipArrayEntry);
   3702   }
   3703   int skip = max_lookahead + 1 - min_lookahead;
   3704 
   3705   for (int i = max_lookahead; i >= min_lookahead; i--) {
   3706     BoyerMoorePositionInfo* map = bitmaps_->at(i);
   3707     for (int j = 0; j < kSize; j++) {
   3708       if (map->at(j)) {
   3709         boolean_skip_table->set(j, kDontSkipArrayEntry);
   3710       }
   3711     }
   3712   }
   3713 
   3714   return skip;
   3715 }
   3716 
   3717 
   3718 // See comment above on the implementation of GetSkipTable.
   3719 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
   3720   const int kSize = RegExpMacroAssembler::kTableSize;
   3721 
   3722   int min_lookahead = 0;
   3723   int max_lookahead = 0;
   3724 
   3725   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
   3726 
   3727   bool found_single_character = false;
   3728   int single_character = 0;
   3729   for (int i = max_lookahead; i >= min_lookahead; i--) {
   3730     BoyerMoorePositionInfo* map = bitmaps_->at(i);
   3731     if (map->map_count() > 1 ||
   3732         (found_single_character && map->map_count() != 0)) {
   3733       found_single_character = false;
   3734       break;
   3735     }
   3736     for (int j = 0; j < kSize; j++) {
   3737       if (map->at(j)) {
   3738         found_single_character = true;
   3739         single_character = j;
   3740         break;
   3741       }
   3742     }
   3743   }
   3744 
   3745   int lookahead_width = max_lookahead + 1 - min_lookahead;
   3746 
   3747   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
   3748     // The mask-compare can probably handle this better.
   3749     return;
   3750   }
   3751 
   3752   if (found_single_character) {
   3753     Label cont, again;
   3754     masm->Bind(&again);
   3755     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
   3756     if (max_char_ > kSize) {
   3757       masm->CheckCharacterAfterAnd(single_character,
   3758                                    RegExpMacroAssembler::kTableMask,
   3759                                    &cont);
   3760     } else {
   3761       masm->CheckCharacter(single_character, &cont);
   3762     }
   3763     masm->AdvanceCurrentPosition(lookahead_width);
   3764     masm->GoTo(&again);
   3765     masm->Bind(&cont);
   3766     return;
   3767   }
   3768 
   3769   Factory* factory = masm->isolate()->factory();
   3770   Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
   3771   int skip_distance = GetSkipTable(
   3772       min_lookahead, max_lookahead, boolean_skip_table);
   3773   DCHECK_NE(0, skip_distance);
   3774 
   3775   Label cont, again;
   3776   masm->Bind(&again);
   3777   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
   3778   masm->CheckBitInTable(boolean_skip_table, &cont);
   3779   masm->AdvanceCurrentPosition(skip_distance);
   3780   masm->GoTo(&again);
   3781   masm->Bind(&cont);
   3782 }
   3783 
   3784 
   3785 /* Code generation for choice nodes.
   3786  *
   3787  * We generate quick checks that do a mask and compare to eliminate a
   3788  * choice.  If the quick check succeeds then it jumps to the continuation to
   3789  * do slow checks and check subsequent nodes.  If it fails (the common case)
   3790  * it falls through to the next choice.
   3791  *
   3792  * Here is the desired flow graph.  Nodes directly below each other imply
   3793  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
   3794  * 3 doesn't have a quick check so we have to call the slow check.
   3795  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
   3796  * regexp continuation is generated directly after the Sn node, up to the
   3797  * next GoTo if we decide to reuse some already generated code.  Some
   3798  * nodes expect preload_characters to be preloaded into the current
   3799  * character register.  R nodes do this preloading.  Vertices are marked
   3800  * F for failures and S for success (possible success in the case of quick
   3801  * nodes).  L, V, < and > are used as arrow heads.
   3802  *
   3803  * ----------> R
   3804  *             |
   3805  *             V
   3806  *            Q1 -----> S1
   3807  *             |   S   /
   3808  *            F|      /
   3809  *             |    F/
   3810  *             |    /
   3811  *             |   R
   3812  *             |  /
   3813  *             V L
   3814  *            Q2 -----> S2
   3815  *             |   S   /
   3816  *            F|      /
   3817  *             |    F/
   3818  *             |    /
   3819  *             |   R
   3820  *             |  /
   3821  *             V L
   3822  *            S3
   3823  *             |
   3824  *            F|
   3825  *             |
   3826  *             R
   3827  *             |
   3828  * backtrack   V
   3829  * <----------Q4
   3830  *   \    F    |
   3831  *    \        |S
   3832  *     \   F   V
   3833  *      \-----S4
   3834  *
   3835  * For greedy loops we push the current position, then generate the code that
   3836  * eats the input specially in EmitGreedyLoop.  The other choice (the
   3837  * continuation) is generated by the normal code in EmitChoices, and steps back
   3838  * in the input to the starting position when it fails to match.  The loop code
   3839  * looks like this (U is the unwind code that steps back in the greedy loop).
   3840  *
   3841  *              _____
   3842  *             /     \
   3843  *             V     |
   3844  * ----------> S1    |
   3845  *            /|     |
   3846  *           / |S    |
   3847  *         F/  \_____/
   3848  *         /
   3849  *        |<-----
   3850  *        |      \
   3851  *        V       |S
   3852  *        Q2 ---> U----->backtrack
   3853  *        |  F   /
   3854  *       S|     /
   3855  *        V  F /
   3856  *        S2--/
   3857  */
   3858 
   3859 GreedyLoopState::GreedyLoopState(bool not_at_start) {
   3860   counter_backtrack_trace_.set_backtrack(&label_);
   3861   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
   3862 }
   3863 
   3864 
   3865 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
   3866 #ifdef DEBUG
   3867   int choice_count = alternatives_->length();
   3868   for (int i = 0; i < choice_count - 1; i++) {
   3869     GuardedAlternative alternative = alternatives_->at(i);
   3870     ZoneList<Guard*>* guards = alternative.guards();
   3871     int guard_count = (guards == nullptr) ? 0 : guards->length();
   3872     for (int j = 0; j < guard_count; j++) {
   3873       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
   3874     }
   3875   }
   3876 #endif
   3877 }
   3878 
   3879 
   3880 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
   3881                               Trace* current_trace,
   3882                               PreloadState* state) {
   3883     if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
   3884       // Save some time by looking at most one machine word ahead.
   3885       state->eats_at_least_ =
   3886           EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
   3887                       current_trace->at_start() == Trace::FALSE_VALUE);
   3888     }
   3889     state->preload_characters_ =
   3890         CalculatePreloadCharacters(compiler, state->eats_at_least_);
   3891 
   3892     state->preload_is_current_ =
   3893         (current_trace->characters_preloaded() == state->preload_characters_);
   3894     state->preload_has_checked_bounds_ = state->preload_is_current_;
   3895 }
   3896 
   3897 
   3898 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3899   int choice_count = alternatives_->length();
   3900 
   3901   if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
   3902     alternatives_->at(0).node()->Emit(compiler, trace);
   3903     return;
   3904   }
   3905 
   3906   AssertGuardsMentionRegisters(trace);
   3907 
   3908   LimitResult limit_result = LimitVersions(compiler, trace);
   3909   if (limit_result == DONE) return;
   3910   DCHECK(limit_result == CONTINUE);
   3911 
   3912   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
   3913   // other choice nodes we only flush if we are out of code size budget.
   3914   if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
   3915     trace->Flush(compiler, this);
   3916     return;
   3917   }
   3918 
   3919   RecursionCheck rc(compiler);
   3920 
   3921   PreloadState preload;
   3922   preload.init();
   3923   GreedyLoopState greedy_loop_state(not_at_start());
   3924 
   3925   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
   3926   AlternativeGenerationList alt_gens(choice_count, zone());
   3927 
   3928   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
   3929     trace = EmitGreedyLoop(compiler,
   3930                            trace,
   3931                            &alt_gens,
   3932                            &preload,
   3933                            &greedy_loop_state,
   3934                            text_length);
   3935   } else {
   3936     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
   3937     // match the traces produced pre-cleanup.
   3938     Label second_choice;
   3939     compiler->macro_assembler()->Bind(&second_choice);
   3940 
   3941     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
   3942 
   3943     EmitChoices(compiler,
   3944                 &alt_gens,
   3945                 0,
   3946                 trace,
   3947                 &preload);
   3948   }
   3949 
   3950   // At this point we need to generate slow checks for the alternatives where
   3951   // the quick check was inlined.  We can recognize these because the associated
   3952   // label was bound.
   3953   int new_flush_budget = trace->flush_budget() / choice_count;
   3954   for (int i = 0; i < choice_count; i++) {
   3955     AlternativeGeneration* alt_gen = alt_gens.at(i);
   3956     Trace new_trace(*trace);
   3957     // If there are actions to be flushed we have to limit how many times
   3958     // they are flushed.  Take the budget of the parent trace and distribute
   3959     // it fairly amongst the children.
   3960     if (new_trace.actions() != nullptr) {
   3961       new_trace.set_flush_budget(new_flush_budget);
   3962     }
   3963     bool next_expects_preload =
   3964         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
   3965     EmitOutOfLineContinuation(compiler,
   3966                               &new_trace,
   3967                               alternatives_->at(i),
   3968                               alt_gen,
   3969                               preload.preload_characters_,
   3970                               next_expects_preload);
   3971   }
   3972 }
   3973 
   3974 
   3975 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
   3976                                   Trace* trace,
   3977                                   AlternativeGenerationList* alt_gens,
   3978                                   PreloadState* preload,
   3979                                   GreedyLoopState* greedy_loop_state,
   3980                                   int text_length) {
   3981   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3982   // Here we have special handling for greedy loops containing only text nodes
   3983   // and other simple nodes.  These are handled by pushing the current
   3984   // position on the stack and then incrementing the current position each
   3985   // time around the switch.  On backtrack we decrement the current position
   3986   // and check it against the pushed value.  This avoids pushing backtrack
   3987   // information for each iteration of the loop, which could take up a lot of
   3988   // space.
   3989   DCHECK(trace->stop_node() == nullptr);
   3990   macro_assembler->PushCurrentPosition();
   3991   Label greedy_match_failed;
   3992   Trace greedy_match_trace;
   3993   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
   3994   greedy_match_trace.set_backtrack(&greedy_match_failed);
   3995   Label loop_label;
   3996   macro_assembler->Bind(&loop_label);
   3997   greedy_match_trace.set_stop_node(this);
   3998   greedy_match_trace.set_loop_label(&loop_label);
   3999   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
   4000   macro_assembler->Bind(&greedy_match_failed);
   4001 
   4002   Label second_choice;  // For use in greedy matches.
   4003   macro_assembler->Bind(&second_choice);
   4004 
   4005   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
   4006 
   4007   EmitChoices(compiler,
   4008               alt_gens,
   4009               1,
   4010               new_trace,
   4011               preload);
   4012 
   4013   macro_assembler->Bind(greedy_loop_state->label());
   4014   // If we have unwound to the bottom then backtrack.
   4015   macro_assembler->CheckGreedyLoop(trace->backtrack());
   4016   // Otherwise try the second priority at an earlier position.
   4017   macro_assembler->AdvanceCurrentPosition(-text_length);
   4018   macro_assembler->GoTo(&second_choice);
   4019   return new_trace;
   4020 }
   4021 
   4022 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
   4023                                               Trace* trace) {
   4024   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
   4025   if (alternatives_->length() != 2) return eats_at_least;
   4026 
   4027   GuardedAlternative alt1 = alternatives_->at(1);
   4028   if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
   4029     return eats_at_least;
   4030   }
   4031   RegExpNode* eats_anything_node = alt1.node();
   4032   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
   4033     return eats_at_least;
   4034   }
   4035 
   4036   // Really we should be creating a new trace when we execute this function,
   4037   // but there is no need, because the code it generates cannot backtrack, and
   4038   // we always arrive here with a trivial trace (since it's the entry to a
   4039   // loop.  That also implies that there are no preloaded characters, which is
   4040   // good, because it means we won't be violating any assumptions by
   4041   // overwriting those characters with new load instructions.
   4042   DCHECK(trace->is_trivial());
   4043 
   4044   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   4045   Isolate* isolate = macro_assembler->isolate();
   4046   // At this point we know that we are at a non-greedy loop that will eat
   4047   // any character one at a time.  Any non-anchored regexp has such a
   4048   // loop prepended to it in order to find where it starts.  We look for
   4049   // a pattern of the form ...abc... where we can look 6 characters ahead
   4050   // and step forwards 3 if the character is not one of abc.  Abc need
   4051   // not be atoms, they can be any reasonably limited character class or
   4052   // small alternation.
   4053   BoyerMooreLookahead* bm = bm_info(false);
   4054   if (bm == nullptr) {
   4055     eats_at_least = Min(kMaxLookaheadForBoyerMoore,
   4056                         EatsAtLeast(kMaxLookaheadForBoyerMoore,
   4057                                     kRecursionBudget,
   4058                                     false));
   4059     if (eats_at_least >= 1) {
   4060       bm = new(zone()) BoyerMooreLookahead(eats_at_least,
   4061                                            compiler,
   4062                                            zone());
   4063       GuardedAlternative alt0 = alternatives_->at(0);
   4064       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
   4065     }
   4066   }
   4067   if (bm != nullptr) {
   4068     bm->EmitSkipInstructions(macro_assembler);
   4069   }
   4070   return eats_at_least;
   4071 }
   4072 
   4073 
   4074 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
   4075                              AlternativeGenerationList* alt_gens,
   4076                              int first_choice,
   4077                              Trace* trace,
   4078                              PreloadState* preload) {
   4079   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   4080   SetUpPreLoad(compiler, trace, preload);
   4081 
   4082   // For now we just call all choices one after the other.  The idea ultimately
   4083   // is to use the Dispatch table to try only the relevant ones.
   4084   int choice_count = alternatives_->length();
   4085 
   4086   int new_flush_budget = trace->flush_budget() / choice_count;
   4087 
   4088   for (int i = first_choice; i < choice_count; i++) {
   4089     bool is_last = i == choice_count - 1;
   4090     bool fall_through_on_failure = !is_last;
   4091     GuardedAlternative alternative = alternatives_->at(i);
   4092     AlternativeGeneration* alt_gen = alt_gens->at(i);
   4093     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
   4094     ZoneList<Guard*>* guards = alternative.guards();
   4095     int guard_count = (guards == nullptr) ? 0 : guards->length();
   4096     Trace new_trace(*trace);
   4097     new_trace.set_characters_preloaded(preload->preload_is_current_ ?
   4098                                          preload->preload_characters_ :
   4099                                          0);
   4100     if (preload->preload_has_checked_bounds_) {
   4101       new_trace.set_bound_checked_up_to(preload->preload_characters_);
   4102     }
   4103     new_trace.quick_check_performed()->Clear();
   4104     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
   4105     if (!is_last) {
   4106       new_trace.set_backtrack(&alt_gen->after);
   4107     }
   4108     alt_gen->expects_preload = preload->preload_is_current_;
   4109     bool generate_full_check_inline = false;
   4110     if (compiler->optimize() &&
   4111         try_to_emit_quick_check_for_alternative(i == 0) &&
   4112         alternative.node()->EmitQuickCheck(
   4113             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
   4114             &alt_gen->possible_success, &alt_gen->quick_check_details,
   4115             fall_through_on_failure)) {
   4116       // Quick check was generated for this choice.
   4117       preload->preload_is_current_ = true;
   4118       preload->preload_has_checked_bounds_ = true;
   4119       // If we generated the quick check to fall through on possible success,
   4120       // we now need to generate the full check inline.
   4121       if (!fall_through_on_failure) {
   4122         macro_assembler->Bind(&alt_gen->possible_success);
   4123         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   4124         new_trace.set_characters_preloaded(preload->preload_characters_);
   4125         new_trace.set_bound_checked_up_to(preload->preload_characters_);
   4126         generate_full_check_inline = true;
   4127       }
   4128     } else if (alt_gen->quick_check_details.cannot_match()) {
   4129       if (!fall_through_on_failure) {
   4130         macro_assembler->GoTo(trace->backtrack());
   4131       }
   4132       continue;
   4133     } else {
   4134       // No quick check was generated.  Put the full code here.
   4135       // If this is not the first choice then there could be slow checks from
   4136       // previous cases that go here when they fail.  There's no reason to
   4137       // insist that they preload characters since the slow check we are about
   4138       // to generate probably can't use it.
   4139       if (i != first_choice) {
   4140         alt_gen->expects_preload = false;
   4141         new_trace.InvalidateCurrentCharacter();
   4142       }
   4143       generate_full_check_inline = true;
   4144     }
   4145     if (generate_full_check_inline) {
   4146       if (new_trace.actions() != nullptr) {
   4147         new_trace.set_flush_budget(new_flush_budget);
   4148       }
   4149       for (int j = 0; j < guard_count; j++) {
   4150         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
   4151       }
   4152       alternative.node()->Emit(compiler, &new_trace);
   4153       preload->preload_is_current_ = false;
   4154     }
   4155     macro_assembler->Bind(&alt_gen->after);
   4156   }
   4157 }
   4158 
   4159 
   4160 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
   4161                                            Trace* trace,
   4162                                            GuardedAlternative alternative,
   4163                                            AlternativeGeneration* alt_gen,
   4164                                            int preload_characters,
   4165                                            bool next_expects_preload) {
   4166   if (!alt_gen->possible_success.is_linked()) return;
   4167 
   4168   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   4169   macro_assembler->Bind(&alt_gen->possible_success);
   4170   Trace out_of_line_trace(*trace);
   4171   out_of_line_trace.set_characters_preloaded(preload_characters);
   4172   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   4173   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
   4174   ZoneList<Guard*>* guards = alternative.guards();
   4175   int guard_count = (guards == nullptr) ? 0 : guards->length();
   4176   if (next_expects_preload) {
   4177     Label reload_current_char;
   4178     out_of_line_trace.set_backtrack(&reload_current_char);
   4179     for (int j = 0; j < guard_count; j++) {
   4180       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   4181     }
   4182     alternative.node()->Emit(compiler, &out_of_line_trace);
   4183     macro_assembler->Bind(&reload_current_char);
   4184     // Reload the current character, since the next quick check expects that.
   4185     // We don't need to check bounds here because we only get into this
   4186     // code through a quick check which already did the checked load.
   4187     macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
   4188                                           preload_characters);
   4189     macro_assembler->GoTo(&(alt_gen->after));
   4190   } else {
   4191     out_of_line_trace.set_backtrack(&(alt_gen->after));
   4192     for (int j = 0; j < guard_count; j++) {
   4193       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   4194     }
   4195     alternative.node()->Emit(compiler, &out_of_line_trace);
   4196   }
   4197 }
   4198 
   4199 
   4200 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   4201   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   4202   LimitResult limit_result = LimitVersions(compiler, trace);
   4203   if (limit_result == DONE) return;
   4204   DCHECK(limit_result == CONTINUE);
   4205 
   4206   RecursionCheck rc(compiler);
   4207 
   4208   switch (action_type_) {
   4209     case STORE_POSITION: {
   4210       Trace::DeferredCapture
   4211           new_capture(data_.u_position_register.reg,
   4212                       data_.u_position_register.is_capture,
   4213                       trace);
   4214       Trace new_trace = *trace;
   4215       new_trace.add_action(&new_capture);
   4216       on_success()->Emit(compiler, &new_trace);
   4217       break;
   4218     }
   4219     case INCREMENT_REGISTER: {
   4220       Trace::DeferredIncrementRegister
   4221           new_increment(data_.u_increment_register.reg);
   4222       Trace new_trace = *trace;
   4223       new_trace.add_action(&new_increment);
   4224       on_success()->Emit(compiler, &new_trace);
   4225       break;
   4226     }
   4227     case SET_REGISTER: {
   4228       Trace::DeferredSetRegister
   4229           new_set(data_.u_store_register.reg, data_.u_store_register.value);
   4230       Trace new_trace = *trace;
   4231       new_trace.add_action(&new_set);
   4232       on_success()->Emit(compiler, &new_trace);
   4233       break;
   4234     }
   4235     case CLEAR_CAPTURES: {
   4236       Trace::DeferredClearCaptures
   4237         new_capture(Interval(data_.u_clear_captures.range_from,
   4238                              data_.u_clear_captures.range_to));
   4239       Trace new_trace = *trace;
   4240       new_trace.add_action(&new_capture);
   4241       on_success()->Emit(compiler, &new_trace);
   4242       break;
   4243     }
   4244     case BEGIN_SUBMATCH:
   4245       if (!trace->is_trivial()) {
   4246         trace->Flush(compiler, this);
   4247       } else {
   4248         assembler->WriteCurrentPositionToRegister(
   4249             data_.u_submatch.current_position_register, 0);
   4250         assembler->WriteStackPointerToRegister(
   4251             data_.u_submatch.stack_pointer_register);
   4252         on_success()->Emit(compiler, trace);
   4253       }
   4254       break;
   4255     case EMPTY_MATCH_CHECK: {
   4256       int start_pos_reg = data_.u_empty_match_check.start_register;
   4257       int stored_pos = 0;
   4258       int rep_reg = data_.u_empty_match_check.repetition_register;
   4259       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
   4260       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
   4261       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
   4262         // If we know we haven't advanced and there is no minimum we
   4263         // can just backtrack immediately.
   4264         assembler->GoTo(trace->backtrack());
   4265       } else if (know_dist && stored_pos < trace->cp_offset()) {
   4266         // If we know we've advanced we can generate the continuation
   4267         // immediately.
   4268         on_success()->Emit(compiler, trace);
   4269       } else if (!trace->is_trivial()) {
   4270         trace->Flush(compiler, this);
   4271       } else {
   4272         Label skip_empty_check;
   4273         // If we have a minimum number of repetitions we check the current
   4274         // number first and skip the empty check if it's not enough.
   4275         if (has_minimum) {
   4276           int limit = data_.u_empty_match_check.repetition_limit;
   4277           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
   4278         }
   4279         // If the match is empty we bail out, otherwise we fall through
   4280         // to the on-success continuation.
   4281         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
   4282                                    trace->backtrack());
   4283         assembler->Bind(&skip_empty_check);
   4284         on_success()->Emit(compiler, trace);
   4285       }
   4286       break;
   4287     }
   4288     case POSITIVE_SUBMATCH_SUCCESS: {
   4289       if (!trace->is_trivial()) {
   4290         trace->Flush(compiler, this);
   4291         return;
   4292       }
   4293       assembler->ReadCurrentPositionFromRegister(
   4294           data_.u_submatch.current_position_register);
   4295       assembler->ReadStackPointerFromRegister(
   4296           data_.u_submatch.stack_pointer_register);
   4297       int clear_register_count = data_.u_submatch.clear_register_count;
   4298       if (clear_register_count == 0) {
   4299         on_success()->Emit(compiler, trace);
   4300         return;
   4301       }
   4302       int clear_registers_from = data_.u_submatch.clear_register_from;
   4303       Label clear_registers_backtrack;
   4304       Trace new_trace = *trace;
   4305       new_trace.set_backtrack(&clear_registers_backtrack);
   4306       on_success()->Emit(compiler, &new_trace);
   4307 
   4308       assembler->Bind(&clear_registers_backtrack);
   4309       int clear_registers_to = clear_registers_from + clear_register_count - 1;
   4310       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
   4311 
   4312       DCHECK(trace->backtrack() == nullptr);
   4313       assembler->Backtrack();
   4314       return;
   4315     }
   4316     default:
   4317       UNREACHABLE();
   4318   }
   4319 }
   4320 
   4321 
   4322 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   4323   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   4324   if (!trace->is_trivial()) {
   4325     trace->Flush(compiler, this);
   4326     return;
   4327   }
   4328 
   4329   LimitResult limit_result = LimitVersions(compiler, trace);
   4330   if (limit_result == DONE) return;
   4331   DCHECK(limit_result == CONTINUE);
   4332 
   4333   RecursionCheck rc(compiler);
   4334 
   4335   DCHECK_EQ(start_reg_ + 1, end_reg_);
   4336   if (IgnoreCase(flags_)) {
   4337     assembler->CheckNotBackReferenceIgnoreCase(
   4338         start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
   4339   } else {
   4340     assembler->CheckNotBackReference(start_reg_, read_backward(),
   4341                                      trace->backtrack());
   4342   }
   4343   // We are going to advance backward, so we may end up at the start.
   4344   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
   4345 
   4346   // Check that the back reference does not end inside a surrogate pair.
   4347   if (IsUnicode(flags_) && !compiler->one_byte()) {
   4348     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
   4349   }
   4350   on_success()->Emit(compiler, trace);
   4351 }
   4352 
   4353 
   4354 // -------------------------------------------------------------------
   4355 // Dot/dotty output
   4356 
   4357 
   4358 #ifdef DEBUG
   4359 
   4360 
   4361 class DotPrinter: public NodeVisitor {
   4362  public:
   4363   DotPrinter(std::ostream& os, bool ignore_case)  // NOLINT
   4364       : os_(os),
   4365         ignore_case_(ignore_case) {}
   4366   void PrintNode(const char* label, RegExpNode* node);
   4367   void Visit(RegExpNode* node);
   4368   void PrintAttributes(RegExpNode* from);
   4369   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
   4370 #define DECLARE_VISIT(Type)                                          \
   4371   virtual void Visit##Type(Type##Node* that);
   4372 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   4373 #undef DECLARE_VISIT
   4374  private:
   4375   std::ostream& os_;
   4376   bool ignore_case_;
   4377 };
   4378 
   4379 
   4380 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
   4381   os_ << "digraph G {\n  graph [label=\"";
   4382   for (int i = 0; label[i]; i++) {
   4383     switch (label[i]) {
   4384       case '\\':
   4385         os_ << "\\\\";
   4386         break;
   4387       case '"':
   4388         os_ << "\"";
   4389         break;
   4390       default:
   4391         os_ << label[i];
   4392         break;
   4393     }
   4394   }
   4395   os_ << "\"];\n";
   4396   Visit(node);
   4397   os_ << "}" << std::endl;
   4398 }
   4399 
   4400 
   4401 void DotPrinter::Visit(RegExpNode* node) {
   4402   if (node->info()->visited) return;
   4403   node->info()->visited = true;
   4404   node->Accept(this);
   4405 }
   4406 
   4407 
   4408 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
   4409   os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
   4410   Visit(on_failure);
   4411 }
   4412 
   4413 
   4414 class TableEntryBodyPrinter {
   4415  public:
   4416   TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice)  // NOLINT
   4417       : os_(os),
   4418         choice_(choice) {}
   4419   void Call(uc16 from, DispatchTable::Entry entry) {
   4420     OutSet* out_set = entry.out_set();
   4421     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   4422       if (out_set->Get(i)) {
   4423         os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
   4424             << choice()->alternatives()->at(i).node() << ";\n";
   4425       }
   4426     }
   4427   }
   4428  private:
   4429   ChoiceNode* choice() { return choice_; }
   4430   std::ostream& os_;
   4431   ChoiceNode* choice_;
   4432 };
   4433 
   4434 
   4435 class TableEntryHeaderPrinter {
   4436  public:
   4437   explicit TableEntryHeaderPrinter(std::ostream& os)  // NOLINT
   4438       : first_(true),
   4439         os_(os) {}
   4440   void Call(uc16 from, DispatchTable::Entry entry) {
   4441     if (first_) {
   4442       first_ = false;
   4443     } else {
   4444       os_ << "|";
   4445     }
   4446     os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
   4447     OutSet* out_set = entry.out_set();
   4448     int priority = 0;
   4449     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   4450       if (out_set->Get(i)) {
   4451         if (priority > 0) os_ << "|";
   4452         os_ << "<s" << from << "o" << i << "> " << priority;
   4453         priority++;
   4454       }
   4455     }
   4456     os_ << "}}";
   4457   }
   4458 
   4459  private:
   4460   bool first_;
   4461   std::ostream& os_;
   4462 };
   4463 
   4464 
   4465 class AttributePrinter {
   4466  public:
   4467   explicit AttributePrinter(std::ostream& os)  // NOLINT
   4468       : os_(os),
   4469         first_(true) {}
   4470   void PrintSeparator() {
   4471     if (first_) {
   4472       first_ = false;
   4473     } else {
   4474       os_ << "|";
   4475     }
   4476   }
   4477   void PrintBit(const char* name, bool value) {
   4478     if (!value) return;
   4479     PrintSeparator();
   4480     os_ << "{" << name << "}";
   4481   }
   4482   void PrintPositive(const char* name, int value) {
   4483     if (value < 0) return;
   4484     PrintSeparator();
   4485     os_ << "{" << name << "|" << value << "}";
   4486   }
   4487 
   4488  private:
   4489   std::ostream& os_;
   4490   bool first_;
   4491 };
   4492 
   4493 
   4494 void DotPrinter::PrintAttributes(RegExpNode* that) {
   4495   os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
   4496       << "margin=0.1, fontsize=10, label=\"{";
   4497   AttributePrinter printer(os_);
   4498   NodeInfo* info = that->info();
   4499   printer.PrintBit("NI", info->follows_newline_interest);
   4500   printer.PrintBit("WI", info->follows_word_interest);
   4501   printer.PrintBit("SI", info->follows_start_interest);
   4502   Label* label = that->label();
   4503   if (label->is_bound())
   4504     printer.PrintPositive("@", label->pos());
   4505   os_ << "}\"];\n"
   4506       << "  a" << that << " -> n" << that
   4507       << " [style=dashed, color=grey, arrowhead=none];\n";
   4508 }
   4509 
   4510 
   4511 static const bool kPrintDispatchTable = false;
   4512 void DotPrinter::VisitChoice(ChoiceNode* that) {
   4513   if (kPrintDispatchTable) {
   4514     os_ << "  n" << that << " [shape=Mrecord, label=\"";
   4515     TableEntryHeaderPrinter header_printer(os_);
   4516     that->GetTable(ignore_case_)->ForEach(&header_printer);
   4517     os_ << "\"]\n";
   4518     PrintAttributes(that);
   4519     TableEntryBodyPrinter body_printer(os_, that);
   4520     that->GetTable(ignore_case_)->ForEach(&body_printer);
   4521   } else {
   4522     os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
   4523     for (int i = 0; i < that->alternatives()->length(); i++) {
   4524       GuardedAlternative alt = that->alternatives()->at(i);
   4525       os_ << "  n" << that << " -> n" << alt.node();
   4526     }
   4527   }
   4528   for (int i = 0; i < that->alternatives()->length(); i++) {
   4529     GuardedAlternative alt = that->alternatives()->at(i);
   4530     alt.node()->Accept(this);
   4531   }
   4532 }
   4533 
   4534 
   4535 void DotPrinter::VisitText(TextNode* that) {
   4536   Zone* zone = that->zone();
   4537   os_ << "  n" << that << " [label=\"";
   4538   for (int i = 0; i < that->elements()->length(); i++) {
   4539     if (i > 0) os_ << " ";
   4540     TextElement elm = that->elements()->at(i);
   4541     switch (elm.text_type()) {
   4542       case TextElement::ATOM: {
   4543         Vector<const uc16> data = elm.atom()->data();
   4544         for (int i = 0; i < data.length(); i++) {
   4545           os_ << static_cast<char>(data[i]);
   4546         }
   4547         break;
   4548       }
   4549       case TextElement::CHAR_CLASS: {
   4550         RegExpCharacterClass* node = elm.char_class();
   4551         os_ << "[";
   4552         if (node->is_negated()) os_ << "^";
   4553         for (int j = 0; j < node->ranges(zone)->length(); j++) {
   4554           CharacterRange range = node->ranges(zone)->at(j);
   4555           os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
   4556         }
   4557         os_ << "]";
   4558         break;
   4559       }
   4560       default:
   4561         UNREACHABLE();
   4562     }
   4563   }
   4564   os_ << "\", shape=box, peripheries=2];\n";
   4565   PrintAttributes(that);
   4566   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
   4567   Visit(that->on_success());
   4568 }
   4569 
   4570 
   4571 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
   4572   os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
   4573       << that->end_register() << "\", shape=doubleoctagon];\n";
   4574   PrintAttributes(that);
   4575   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
   4576   Visit(that->on_success());
   4577 }
   4578 
   4579 
   4580 void DotPrinter::VisitEnd(EndNode* that) {
   4581   os_ << "  n" << that << " [style=bold, shape=point];\n";
   4582   PrintAttributes(that);
   4583 }
   4584 
   4585 
   4586 void DotPrinter::VisitAssertion(AssertionNode* that) {
   4587   os_ << "  n" << that << " [";
   4588   switch (that->assertion_type()) {
   4589     case AssertionNode::AT_END:
   4590       os_ << "label=\"$\", shape=septagon";
   4591       break;
   4592     case AssertionNode::AT_START:
   4593       os_ << "label=\"^\", shape=septagon";
   4594       break;
   4595     case AssertionNode::AT_BOUNDARY:
   4596       os_ << "label=\"\\b\", shape=septagon";
   4597       break;
   4598     case AssertionNode::AT_NON_BOUNDARY:
   4599       os_ << "label=\"\\B\", shape=septagon";
   4600       break;
   4601     case AssertionNode::AFTER_NEWLINE:
   4602       os_ << "label=\"(?<=\\n)\", shape=septagon";
   4603       break;
   4604   }
   4605   os_ << "];\n";
   4606   PrintAttributes(that);
   4607   RegExpNode* successor = that->on_success();
   4608   os_ << "  n" << that << " -> n" << successor << ";\n";
   4609   Visit(successor);
   4610 }
   4611 
   4612 
   4613 void DotPrinter::VisitAction(ActionNode* that) {
   4614   os_ << "  n" << that << " [";
   4615   switch (that->action_type_) {
   4616     case ActionNode::SET_REGISTER:
   4617       os_ << "label=\"$" << that->data_.u_store_register.reg
   4618           << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
   4619       break;
   4620     case ActionNode::INCREMENT_REGISTER:
   4621       os_ << "label=\"$" << that->data_.u_increment_register.reg
   4622           << "++\", shape=octagon";
   4623       break;
   4624     case ActionNode::STORE_POSITION:
   4625       os_ << "label=\"$" << that->data_.u_position_register.reg
   4626           << ":=$pos\", shape=octagon";
   4627       break;
   4628     case ActionNode::BEGIN_SUBMATCH:
   4629       os_ << "label=\"$" << that->data_.u_submatch.current_position_register
   4630           << ":=$pos,begin\", shape=septagon";
   4631       break;
   4632     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
   4633       os_ << "label=\"escape\", shape=septagon";
   4634       break;
   4635     case ActionNode::EMPTY_MATCH_CHECK:
   4636       os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
   4637           << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
   4638           << "<" << that->data_.u_empty_match_check.repetition_limit
   4639           << "?\", shape=septagon";
   4640       break;
   4641     case ActionNode::CLEAR_CAPTURES: {
   4642       os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
   4643           << " to $" << that->data_.u_clear_captures.range_to
   4644           << "\", shape=septagon";
   4645       break;
   4646     }
   4647   }
   4648   os_ << "];\n";
   4649   PrintAttributes(that);
   4650   RegExpNode* successor = that->on_success();
   4651   os_ << "  n" << that << " -> n" << successor << ";\n";
   4652   Visit(successor);
   4653 }
   4654 
   4655 
   4656 class DispatchTableDumper {
   4657  public:
   4658   explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
   4659   void Call(uc16 key, DispatchTable::Entry entry);
   4660  private:
   4661   std::ostream& os_;
   4662 };
   4663 
   4664 
   4665 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
   4666   os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
   4667   OutSet* set = entry.out_set();
   4668   bool first = true;
   4669   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   4670     if (set->Get(i)) {
   4671       if (first) {
   4672         first = false;
   4673       } else {
   4674         os_ << ", ";
   4675       }
   4676       os_ << i;
   4677     }
   4678   }
   4679   os_ << "}\n";
   4680 }
   4681 
   4682 
   4683 void DispatchTable::Dump() {
   4684   OFStream os(stderr);
   4685   DispatchTableDumper dumper(os);
   4686   tree()->ForEach(&dumper);
   4687 }
   4688 
   4689 
   4690 void RegExpEngine::DotPrint(const char* label,
   4691                             RegExpNode* node,
   4692                             bool ignore_case) {
   4693   StdoutStream os;
   4694   DotPrinter printer(os, ignore_case);
   4695   printer.PrintNode(label, node);
   4696 }
   4697 
   4698 
   4699 #endif  // DEBUG
   4700 
   4701 
   4702 // -------------------------------------------------------------------
   4703 // Tree to graph conversion
   4704 
   4705 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
   4706                                RegExpNode* on_success) {
   4707   ZoneList<TextElement>* elms =
   4708       new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
   4709   elms->Add(TextElement::Atom(this), compiler->zone());
   4710   return new (compiler->zone())
   4711       TextNode(elms, compiler->read_backward(), on_success);
   4712 }
   4713 
   4714 
   4715 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
   4716                                RegExpNode* on_success) {
   4717   return new (compiler->zone())
   4718       TextNode(elements(), compiler->read_backward(), on_success);
   4719 }
   4720 
   4721 
   4722 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
   4723                                  const int* special_class,
   4724                                  int length) {
   4725   length--;  // Remove final marker.
   4726   DCHECK_EQ(kRangeEndMarker, special_class[length]);
   4727   DCHECK_NE(0, ranges->length());
   4728   DCHECK_NE(0, length);
   4729   DCHECK_NE(0, special_class[0]);
   4730   if (ranges->length() != (length >> 1) + 1) {
   4731     return false;
   4732   }
   4733   CharacterRange range = ranges->at(0);
   4734   if (range.from() != 0) {
   4735     return false;
   4736   }
   4737   for (int i = 0; i < length; i += 2) {
   4738     if (special_class[i] != (range.to() + 1)) {
   4739       return false;
   4740     }
   4741     range = ranges->at((i >> 1) + 1);
   4742     if (special_class[i+1] != range.from()) {
   4743       return false;
   4744     }
   4745   }
   4746   if (range.to() != String::kMaxCodePoint) {
   4747     return false;
   4748   }
   4749   return true;
   4750 }
   4751 
   4752 
   4753 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
   4754                           const int* special_class,
   4755                           int length) {
   4756   length--;  // Remove final marker.
   4757   DCHECK_EQ(kRangeEndMarker, special_class[length]);
   4758   if (ranges->length() * 2 != length) {
   4759     return false;
   4760   }
   4761   for (int i = 0; i < length; i += 2) {
   4762     CharacterRange range = ranges->at(i >> 1);
   4763     if (range.from() != special_class[i] ||
   4764         range.to() != special_class[i + 1] - 1) {
   4765       return false;
   4766     }
   4767   }
   4768   return true;
   4769 }
   4770 
   4771 
   4772 bool RegExpCharacterClass::is_standard(Zone* zone) {
   4773   // TODO(lrn): Remove need for this function, by not throwing away information
   4774   // along the way.
   4775   if (is_negated()) {
   4776     return false;
   4777   }
   4778   if (set_.is_standard()) {
   4779     return true;
   4780   }
   4781   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
   4782     set_.set_standard_set_type('s');
   4783     return true;
   4784   }
   4785   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
   4786     set_.set_standard_set_type('S');
   4787     return true;
   4788   }
   4789   if (CompareInverseRanges(set_.ranges(zone),
   4790                            kLineTerminatorRanges,
   4791                            kLineTerminatorRangeCount)) {
   4792     set_.set_standard_set_type('.');
   4793     return true;
   4794   }
   4795   if (CompareRanges(set_.ranges(zone),
   4796                     kLineTerminatorRanges,
   4797                     kLineTerminatorRangeCount)) {
   4798     set_.set_standard_set_type('n');
   4799     return true;
   4800   }
   4801   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
   4802     set_.set_standard_set_type('w');
   4803     return true;
   4804   }
   4805   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
   4806     set_.set_standard_set_type('W');
   4807     return true;
   4808   }
   4809   return false;
   4810 }
   4811 
   4812 
   4813 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
   4814                                            ZoneList<CharacterRange>* base)
   4815     : zone_(zone),
   4816       table_(zone),
   4817       bmp_(nullptr),
   4818       lead_surrogates_(nullptr),
   4819       trail_surrogates_(nullptr),
   4820       non_bmp_(nullptr) {
   4821   // The unicode range splitter categorizes given character ranges into:
   4822   // - Code points from the BMP representable by one code unit.
   4823   // - Code points outside the BMP that need to be split into surrogate pairs.
   4824   // - Lone lead surrogates.
   4825   // - Lone trail surrogates.
   4826   // Lone surrogates are valid code points, even though no actual characters.
   4827   // They require special matching to make sure we do not split surrogate pairs.
   4828   // We use the dispatch table to accomplish this. The base range is split up
   4829   // by the table by the overlay ranges, and the Call callback is used to
   4830   // filter and collect ranges for each category.
   4831   for (int i = 0; i < base->length(); i++) {
   4832     table_.AddRange(base->at(i), kBase, zone_);
   4833   }
   4834   // Add overlay ranges.
   4835   table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
   4836                   kBmpCodePoints, zone_);
   4837   table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
   4838                   kLeadSurrogates, zone_);
   4839   table_.AddRange(
   4840       CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
   4841       kTrailSurrogates, zone_);
   4842   table_.AddRange(
   4843       CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
   4844       kBmpCodePoints, zone_);
   4845   table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
   4846                   kNonBmpCodePoints, zone_);
   4847   table_.ForEach(this);
   4848 }
   4849 
   4850 
   4851 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
   4852   OutSet* outset = entry.out_set();
   4853   if (!outset->Get(kBase)) return;
   4854   ZoneList<CharacterRange>** target = nullptr;
   4855   if (outset->Get(kBmpCodePoints)) {
   4856     target = &bmp_;
   4857   } else if (outset->Get(kLeadSurrogates)) {
   4858     target = &lead_surrogates_;
   4859   } else if (outset->Get(kTrailSurrogates)) {
   4860     target = &trail_surrogates_;
   4861   } else {
   4862     DCHECK(outset->Get(kNonBmpCodePoints));
   4863     target = &non_bmp_;
   4864   }
   4865   if (*target == nullptr)
   4866     *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
   4867   (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
   4868 }
   4869 
   4870 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
   4871                       RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
   4872   ZoneList<CharacterRange>* bmp = splitter->bmp();
   4873   if (bmp == nullptr) return;
   4874   JSRegExp::Flags default_flags = JSRegExp::Flags();
   4875   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
   4876       compiler->zone(), bmp, compiler->read_backward(), on_success,
   4877       default_flags)));
   4878 }
   4879 
   4880 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
   4881                              RegExpNode* on_success,
   4882                              UnicodeRangeSplitter* splitter) {
   4883   ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
   4884   if (non_bmp == nullptr) return;
   4885   DCHECK(!compiler->one_byte());
   4886   Zone* zone = compiler->zone();
   4887   JSRegExp::Flags default_flags = JSRegExp::Flags();
   4888   CharacterRange::Canonicalize(non_bmp);
   4889   for (int i = 0; i < non_bmp->length(); i++) {
   4890     // Match surrogate pair.
   4891     // E.g. [\u10005-\u11005] becomes
   4892     //      \ud800[\udc05-\udfff]|
   4893     //      [\ud801-\ud803][\udc00-\udfff]|
   4894     //      \ud804[\udc00-\udc05]
   4895     uc32 from = non_bmp->at(i).from();
   4896     uc32 to = non_bmp->at(i).to();
   4897     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
   4898     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
   4899     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
   4900     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
   4901     if (from_l == to_l) {
   4902       // The lead surrogate is the same.
   4903       result->AddAlternative(
   4904           GuardedAlternative(TextNode::CreateForSurrogatePair(
   4905               zone, CharacterRange::Singleton(from_l),
   4906               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
   4907               on_success, default_flags)));
   4908     } else {
   4909       if (from_t != kTrailSurrogateStart) {
   4910         // Add [from_l][from_t-\udfff]
   4911         result->AddAlternative(
   4912             GuardedAlternative(TextNode::CreateForSurrogatePair(
   4913                 zone, CharacterRange::Singleton(from_l),
   4914                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
   4915                 compiler->read_backward(), on_success, default_flags)));
   4916         from_l++;
   4917       }
   4918       if (to_t != kTrailSurrogateEnd) {
   4919         // Add [to_l][\udc00-to_t]
   4920         result->AddAlternative(
   4921             GuardedAlternative(TextNode::CreateForSurrogatePair(
   4922                 zone, CharacterRange::Singleton(to_l),
   4923                 CharacterRange::Range(kTrailSurrogateStart, to_t),
   4924                 compiler->read_backward(), on_success, default_flags)));
   4925         to_l--;
   4926       }
   4927       if (from_l <= to_l) {
   4928         // Add [from_l-to_l][\udc00-\udfff]
   4929         result->AddAlternative(
   4930             GuardedAlternative(TextNode::CreateForSurrogatePair(
   4931                 zone, CharacterRange::Range(from_l, to_l),
   4932                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
   4933                 compiler->read_backward(), on_success, default_flags)));
   4934       }
   4935     }
   4936   }
   4937 }
   4938 
   4939 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
   4940     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
   4941     ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
   4942     JSRegExp::Flags flags) {
   4943   Zone* zone = compiler->zone();
   4944   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
   4945       zone, match, read_backward, on_success, flags);
   4946   int stack_register = compiler->UnicodeLookaroundStackRegister();
   4947   int position_register = compiler->UnicodeLookaroundPositionRegister();
   4948   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
   4949                                        position_register);
   4950   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
   4951       zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
   4952   return lookaround.ForMatch(negative_match);
   4953 }
   4954 
   4955 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
   4956     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
   4957     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
   4958     bool read_backward, JSRegExp::Flags flags) {
   4959   Zone* zone = compiler->zone();
   4960   int stack_register = compiler->UnicodeLookaroundStackRegister();
   4961   int position_register = compiler->UnicodeLookaroundPositionRegister();
   4962   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
   4963                                        position_register);
   4964   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
   4965       zone, lookahead, read_backward, lookaround.on_match_success(), flags);
   4966   return TextNode::CreateForCharacterRanges(
   4967       zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
   4968 }
   4969 
   4970 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
   4971                            RegExpNode* on_success,
   4972                            UnicodeRangeSplitter* splitter) {
   4973   JSRegExp::Flags default_flags = JSRegExp::Flags();
   4974   ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
   4975   if (lead_surrogates == nullptr) return;
   4976   Zone* zone = compiler->zone();
   4977   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
   4978   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
   4979       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
   4980 
   4981   RegExpNode* match;
   4982   if (compiler->read_backward()) {
   4983     // Reading backward. Assert that reading forward, there is no trail
   4984     // surrogate, and then backward match the lead surrogate.
   4985     match = NegativeLookaroundAgainstReadDirectionAndMatch(
   4986         compiler, trail_surrogates, lead_surrogates, on_success, true,
   4987         default_flags);
   4988   } else {
   4989     // Reading forward. Forward match the lead surrogate and assert that
   4990     // no trail surrogate follows.
   4991     match = MatchAndNegativeLookaroundInReadDirection(
   4992         compiler, lead_surrogates, trail_surrogates, on_success, false,
   4993         default_flags);
   4994   }
   4995   result->AddAlternative(GuardedAlternative(match));
   4996 }
   4997 
   4998 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
   4999                             RegExpNode* on_success,
   5000                             UnicodeRangeSplitter* splitter) {
   5001   JSRegExp::Flags default_flags = JSRegExp::Flags();
   5002   ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
   5003   if (trail_surrogates == nullptr) return;
   5004   Zone* zone = compiler->zone();
   5005   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
   5006   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
   5007       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
   5008 
   5009   RegExpNode* match;
   5010   if (compiler->read_backward()) {
   5011     // Reading backward. Backward match the trail surrogate and assert that no
   5012     // lead surrogate precedes it.
   5013     match = MatchAndNegativeLookaroundInReadDirection(
   5014         compiler, trail_surrogates, lead_surrogates, on_success, true,
   5015         default_flags);
   5016   } else {
   5017     // Reading forward. Assert that reading backward, there is no lead
   5018     // surrogate, and then forward match the trail surrogate.
   5019     match = NegativeLookaroundAgainstReadDirectionAndMatch(
   5020         compiler, lead_surrogates, trail_surrogates, on_success, false,
   5021         default_flags);
   5022   }
   5023   result->AddAlternative(GuardedAlternative(match));
   5024 }
   5025 
   5026 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
   5027                               RegExpNode* on_success) {
   5028   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
   5029   DCHECK(!compiler->read_backward());
   5030   Zone* zone = compiler->zone();
   5031   // Advance any character. If the character happens to be a lead surrogate and
   5032   // we advanced into the middle of a surrogate pair, it will work out, as
   5033   // nothing will match from there. We will have to advance again, consuming
   5034   // the associated trail surrogate.
   5035   ZoneList<CharacterRange>* range = CharacterRange::List(
   5036       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
   5037   JSRegExp::Flags default_flags = JSRegExp::Flags();
   5038   return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
   5039                                             default_flags);
   5040 }
   5041 
   5042 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
   5043 #ifdef V8_INTL_SUPPORT
   5044   DCHECK(CharacterRange::IsCanonical(ranges));
   5045 
   5046   // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
   5047   // See also https://crbug.com/v8/6727.
   5048   // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
   5049   // which we use frequently internally. But large ranges can also easily be
   5050   // created by the user. We might want to have a more general caching mechanism
   5051   // for such ranges.
   5052   if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
   5053 
   5054   // Use ICU to compute the case fold closure over the ranges.
   5055   icu::UnicodeSet set;
   5056   for (int i = 0; i < ranges->length(); i++) {
   5057     set.add(ranges->at(i).from(), ranges->at(i).to());
   5058   }
   5059   ranges->Clear();
   5060   set.closeOver(USET_CASE_INSENSITIVE);
   5061   // Full case mapping map single characters to multiple characters.
   5062   // Those are represented as strings in the set. Remove them so that
   5063   // we end up with only simple and common case mappings.
   5064   set.removeAllStrings();
   5065   for (int i = 0; i < set.getRangeCount(); i++) {
   5066     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
   5067                 zone);
   5068   }
   5069   // No errors and everything we collected have been ranges.
   5070   CharacterRange::Canonicalize(ranges);
   5071 #endif  // V8_INTL_SUPPORT
   5072 }
   5073 
   5074 
   5075 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
   5076                                          RegExpNode* on_success) {
   5077   set_.Canonicalize();
   5078   Zone* zone = compiler->zone();
   5079   ZoneList<CharacterRange>* ranges = this->ranges(zone);
   5080   if (NeedsUnicodeCaseEquivalents(flags_)) {
   5081     AddUnicodeCaseEquivalents(ranges, zone);
   5082   }
   5083   if (IsUnicode(flags_) && !compiler->one_byte() &&
   5084       !contains_split_surrogate()) {
   5085     if (is_negated()) {
   5086       ZoneList<CharacterRange>* negated =
   5087           new (zone) ZoneList<CharacterRange>(2, zone);
   5088       CharacterRange::Negate(ranges, negated, zone);
   5089       ranges = negated;
   5090     }
   5091     if (ranges->length() == 0) {
   5092       JSRegExp::Flags default_flags;
   5093       RegExpCharacterClass* fail =
   5094           new (zone) RegExpCharacterClass(zone, ranges, default_flags);
   5095       return new (zone) TextNode(fail, compiler->read_backward(), on_success);
   5096     }
   5097     if (standard_type() == '*') {
   5098       return UnanchoredAdvance(compiler, on_success);
   5099     } else {
   5100       ChoiceNode* result = new (zone) ChoiceNode(2, zone);
   5101       UnicodeRangeSplitter splitter(zone, ranges);
   5102       AddBmpCharacters(compiler, result, on_success, &splitter);
   5103       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
   5104       AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
   5105       AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
   5106       return result;
   5107     }
   5108   } else {
   5109     return new (zone) TextNode(this, compiler->read_backward(), on_success);
   5110   }
   5111 }
   5112 
   5113 
   5114 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
   5115   RegExpAtom* atom1 = (*a)->AsAtom();
   5116   RegExpAtom* atom2 = (*b)->AsAtom();
   5117   uc16 character1 = atom1->data().at(0);
   5118   uc16 character2 = atom2->data().at(0);
   5119   if (character1 < character2) return -1;
   5120   if (character1 > character2) return 1;
   5121   return 0;
   5122 }
   5123 
   5124 
   5125 static unibrow::uchar Canonical(
   5126     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
   5127     unibrow::uchar c) {
   5128   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
   5129   int length = canonicalize->get(c, '\0', chars);
   5130   DCHECK_LE(length, 1);
   5131   unibrow::uchar canonical = c;
   5132   if (length == 1) canonical = chars[0];
   5133   return canonical;
   5134 }
   5135 
   5136 
   5137 int CompareFirstCharCaseIndependent(
   5138     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
   5139     RegExpTree* const* a, RegExpTree* const* b) {
   5140   RegExpAtom* atom1 = (*a)->AsAtom();
   5141   RegExpAtom* atom2 = (*b)->AsAtom();
   5142   unibrow::uchar character1 = atom1->data().at(0);
   5143   unibrow::uchar character2 = atom2->data().at(0);
   5144   if (character1 == character2) return 0;
   5145   if (character1 >= 'a' || character2 >= 'a') {
   5146     character1 = Canonical(canonicalize, character1);
   5147     character2 = Canonical(canonicalize, character2);
   5148   }
   5149   return static_cast<int>(character1) - static_cast<int>(character2);
   5150 }
   5151 
   5152 
   5153 // We can stable sort runs of atoms, since the order does not matter if they
   5154 // start with different characters.
   5155 // Returns true if any consecutive atoms were found.
   5156 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
   5157   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   5158   int length = alternatives->length();
   5159   bool found_consecutive_atoms = false;
   5160   for (int i = 0; i < length; i++) {
   5161     while (i < length) {
   5162       RegExpTree* alternative = alternatives->at(i);
   5163       if (alternative->IsAtom()) break;
   5164       i++;
   5165     }
   5166     // i is length or it is the index of an atom.
   5167     if (i == length) break;
   5168     int first_atom = i;
   5169     JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
   5170     i++;
   5171     while (i < length) {
   5172       RegExpTree* alternative = alternatives->at(i);
   5173       if (!alternative->IsAtom()) break;
   5174       if (alternative->AsAtom()->flags() != flags) break;
   5175       i++;
   5176     }
   5177     // Sort atoms to get ones with common prefixes together.
   5178     // This step is more tricky if we are in a case-independent regexp,
   5179     // because it would change /is|I/ to /I|is/, and order matters when
   5180     // the regexp parts don't match only disjoint starting points. To fix
   5181     // this we have a version of CompareFirstChar that uses case-
   5182     // independent character classes for comparison.
   5183     DCHECK_LT(first_atom, alternatives->length());
   5184     DCHECK_LE(i, alternatives->length());
   5185     DCHECK_LE(first_atom, i);
   5186     if (IgnoreCase(flags)) {
   5187       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
   5188           compiler->isolate()->regexp_macro_assembler_canonicalize();
   5189       auto compare_closure =
   5190           [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
   5191             return CompareFirstCharCaseIndependent(canonicalize, a, b);
   5192           };
   5193       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
   5194     } else {
   5195       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
   5196     }
   5197     if (i - first_atom > 1) found_consecutive_atoms = true;
   5198   }
   5199   return found_consecutive_atoms;
   5200 }
   5201 
   5202 
   5203 // Optimizes ab|ac|az to a(?:b|c|d).
   5204 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
   5205   Zone* zone = compiler->zone();
   5206   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   5207   int length = alternatives->length();
   5208 
   5209   int write_posn = 0;
   5210   int i = 0;
   5211   while (i < length) {
   5212     RegExpTree* alternative = alternatives->at(i);
   5213     if (!alternative->IsAtom()) {
   5214       alternatives->at(write_posn++) = alternatives->at(i);
   5215       i++;
   5216       continue;
   5217     }
   5218     RegExpAtom* const atom = alternative->AsAtom();
   5219     JSRegExp::Flags flags = atom->flags();
   5220     unibrow::uchar common_prefix = atom->data().at(0);
   5221     int first_with_prefix = i;
   5222     int prefix_length = atom->length();
   5223     i++;
   5224     while (i < length) {
   5225       alternative = alternatives->at(i);
   5226       if (!alternative->IsAtom()) break;
   5227       RegExpAtom* const atom = alternative->AsAtom();
   5228       if (atom->flags() != flags) break;
   5229       unibrow::uchar new_prefix = atom->data().at(0);
   5230       if (new_prefix != common_prefix) {
   5231         if (!IgnoreCase(flags)) break;
   5232         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
   5233             compiler->isolate()->regexp_macro_assembler_canonicalize();
   5234         new_prefix = Canonical(canonicalize, new_prefix);
   5235         common_prefix = Canonical(canonicalize, common_prefix);
   5236         if (new_prefix != common_prefix) break;
   5237       }
   5238       prefix_length = Min(prefix_length, atom->length());
   5239       i++;
   5240     }
   5241     if (i > first_with_prefix + 2) {
   5242       // Found worthwhile run of alternatives with common prefix of at least one
   5243       // character.  The sorting function above did not sort on more than one
   5244       // character for reasons of correctness, but there may still be a longer
   5245       // common prefix if the terms were similar or presorted in the input.
   5246       // Find out how long the common prefix is.
   5247       int run_length = i - first_with_prefix;
   5248       RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
   5249       for (int j = 1; j < run_length && prefix_length > 1; j++) {
   5250         RegExpAtom* old_atom =
   5251             alternatives->at(j + first_with_prefix)->AsAtom();
   5252         for (int k = 1; k < prefix_length; k++) {
   5253           if (atom->data().at(k) != old_atom->data().at(k)) {
   5254             prefix_length = k;
   5255             break;
   5256           }
   5257         }
   5258       }
   5259       RegExpAtom* prefix = new (zone)
   5260           RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
   5261       ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
   5262       pair->Add(prefix, zone);
   5263       ZoneList<RegExpTree*>* suffixes =
   5264           new (zone) ZoneList<RegExpTree*>(run_length, zone);
   5265       for (int j = 0; j < run_length; j++) {
   5266         RegExpAtom* old_atom =
   5267             alternatives->at(j + first_with_prefix)->AsAtom();
   5268         int len = old_atom->length();
   5269         if (len == prefix_length) {
   5270           suffixes->Add(new (zone) RegExpEmpty(), zone);
   5271         } else {
   5272           RegExpTree* suffix = new (zone) RegExpAtom(
   5273               old_atom->data().SubVector(prefix_length, old_atom->length()),
   5274               flags);
   5275           suffixes->Add(suffix, zone);
   5276         }
   5277       }
   5278       pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
   5279       alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
   5280     } else {
   5281       // Just copy any non-worthwhile alternatives.
   5282       for (int j = first_with_prefix; j < i; j++) {
   5283         alternatives->at(write_posn++) = alternatives->at(j);
   5284       }
   5285     }
   5286   }
   5287   alternatives->Rewind(write_posn);  // Trim end of array.
   5288 }
   5289 
   5290 
   5291 // Optimizes b|c|z to [bcz].
   5292 void RegExpDisjunction::FixSingleCharacterDisjunctions(
   5293     RegExpCompiler* compiler) {
   5294   Zone* zone = compiler->zone();
   5295   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   5296   int length = alternatives->length();
   5297 
   5298   int write_posn = 0;
   5299   int i = 0;
   5300   while (i < length) {
   5301     RegExpTree* alternative = alternatives->at(i);
   5302     if (!alternative->IsAtom()) {
   5303       alternatives->at(write_posn++) = alternatives->at(i);
   5304       i++;
   5305       continue;
   5306     }
   5307     RegExpAtom* const atom = alternative->AsAtom();
   5308     if (atom->length() != 1) {
   5309       alternatives->at(write_posn++) = alternatives->at(i);
   5310       i++;
   5311       continue;
   5312     }
   5313     JSRegExp::Flags flags = atom->flags();
   5314     DCHECK_IMPLIES(IsUnicode(flags),
   5315                    !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
   5316     bool contains_trail_surrogate =
   5317         unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
   5318     int first_in_run = i;
   5319     i++;
   5320     // Find a run of single-character atom alternatives that have identical
   5321     // flags (case independence and unicode-ness).
   5322     while (i < length) {
   5323       alternative = alternatives->at(i);
   5324       if (!alternative->IsAtom()) break;
   5325       RegExpAtom* const atom = alternative->AsAtom();
   5326       if (atom->length() != 1) break;
   5327       if (atom->flags() != flags) break;
   5328       DCHECK_IMPLIES(IsUnicode(flags),
   5329                      !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
   5330       contains_trail_surrogate |=
   5331           unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
   5332       i++;
   5333     }
   5334     if (i > first_in_run + 1) {
   5335       // Found non-trivial run of single-character alternatives.
   5336       int run_length = i - first_in_run;
   5337       ZoneList<CharacterRange>* ranges =
   5338           new (zone) ZoneList<CharacterRange>(2, zone);
   5339       for (int j = 0; j < run_length; j++) {
   5340         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
   5341         DCHECK_EQ(old_atom->length(), 1);
   5342         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
   5343       }
   5344       RegExpCharacterClass::CharacterClassFlags character_class_flags;
   5345       if (IsUnicode(flags) && contains_trail_surrogate) {
   5346         character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
   5347       }
   5348       alternatives->at(write_posn++) = new (zone)
   5349           RegExpCharacterClass(zone, ranges, flags, character_class_flags);
   5350     } else {
   5351       // Just copy any trivial alternatives.
   5352       for (int j = first_in_run; j < i; j++) {
   5353         alternatives->at(write_posn++) = alternatives->at(j);
   5354       }
   5355     }
   5356   }
   5357   alternatives->Rewind(write_posn);  // Trim end of array.
   5358 }
   5359 
   5360 
   5361 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
   5362                                       RegExpNode* on_success) {
   5363   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   5364 
   5365   if (alternatives->length() > 2) {
   5366     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
   5367     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
   5368     FixSingleCharacterDisjunctions(compiler);
   5369     if (alternatives->length() == 1) {
   5370       return alternatives->at(0)->ToNode(compiler, on_success);
   5371     }
   5372   }
   5373 
   5374   int length = alternatives->length();
   5375 
   5376   ChoiceNode* result =
   5377       new(compiler->zone()) ChoiceNode(length, compiler->zone());
   5378   for (int i = 0; i < length; i++) {
   5379     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
   5380                                                                on_success));
   5381     result->AddAlternative(alternative);
   5382   }
   5383   return result;
   5384 }
   5385 
   5386 
   5387 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
   5388                                      RegExpNode* on_success) {
   5389   return ToNode(min(),
   5390                 max(),
   5391                 is_greedy(),
   5392                 body(),
   5393                 compiler,
   5394                 on_success);
   5395 }
   5396 
   5397 
   5398 // Scoped object to keep track of how much we unroll quantifier loops in the
   5399 // regexp graph generator.
   5400 class RegExpExpansionLimiter {
   5401  public:
   5402   static const int kMaxExpansionFactor = 6;
   5403   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
   5404       : compiler_(compiler),
   5405         saved_expansion_factor_(compiler->current_expansion_factor()),
   5406         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
   5407     DCHECK_LT(0, factor);
   5408     if (ok_to_expand_) {
   5409       if (factor > kMaxExpansionFactor) {
   5410         // Avoid integer overflow of the current expansion factor.
   5411         ok_to_expand_ = false;
   5412         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
   5413       } else {
   5414         int new_factor = saved_expansion_factor_ * factor;
   5415         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
   5416         compiler->set_current_expansion_factor(new_factor);
   5417       }
   5418     }
   5419   }
   5420 
   5421   ~RegExpExpansionLimiter() {
   5422     compiler_->set_current_expansion_factor(saved_expansion_factor_);
   5423   }
   5424 
   5425   bool ok_to_expand() { return ok_to_expand_; }
   5426 
   5427  private:
   5428   RegExpCompiler* compiler_;
   5429   int saved_expansion_factor_;
   5430   bool ok_to_expand_;
   5431 
   5432   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
   5433 };
   5434 
   5435 
   5436 RegExpNode* RegExpQuantifier::ToNode(int min,
   5437                                      int max,
   5438                                      bool is_greedy,
   5439                                      RegExpTree* body,
   5440                                      RegExpCompiler* compiler,
   5441                                      RegExpNode* on_success,
   5442                                      bool not_at_start) {
   5443   // x{f, t} becomes this:
   5444   //
   5445   //             (r++)<-.
   5446   //               |     `
   5447   //               |     (x)
   5448   //               v     ^
   5449   //      (r=0)-->(?)---/ [if r < t]
   5450   //               |
   5451   //   [if r >= f] \----> ...
   5452   //
   5453 
   5454   // 15.10.2.5 RepeatMatcher algorithm.
   5455   // The parser has already eliminated the case where max is 0.  In the case
   5456   // where max_match is zero the parser has removed the quantifier if min was
   5457   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
   5458 
   5459   // If we know that we cannot match zero length then things are a little
   5460   // simpler since we don't need to make the special zero length match check
   5461   // from step 2.1.  If the min and max are small we can unroll a little in
   5462   // this case.
   5463   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
   5464   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
   5465   if (max == 0) return on_success;  // This can happen due to recursion.
   5466   bool body_can_be_empty = (body->min_match() == 0);
   5467   int body_start_reg = RegExpCompiler::kNoRegister;
   5468   Interval capture_registers = body->CaptureRegisters();
   5469   bool needs_capture_clearing = !capture_registers.is_empty();
   5470   Zone* zone = compiler->zone();
   5471 
   5472   if (body_can_be_empty) {
   5473     body_start_reg = compiler->AllocateRegister();
   5474   } else if (compiler->optimize() && !needs_capture_clearing) {
   5475     // Only unroll if there are no captures and the body can't be
   5476     // empty.
   5477     {
   5478       RegExpExpansionLimiter limiter(
   5479           compiler, min + ((max != min) ? 1 : 0));
   5480       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
   5481         int new_max = (max == kInfinity) ? max : max - min;
   5482         // Recurse once to get the loop or optional matches after the fixed
   5483         // ones.
   5484         RegExpNode* answer = ToNode(
   5485             0, new_max, is_greedy, body, compiler, on_success, true);
   5486         // Unroll the forced matches from 0 to min.  This can cause chains of
   5487         // TextNodes (which the parser does not generate).  These should be
   5488         // combined if it turns out they hinder good code generation.
   5489         for (int i = 0; i < min; i++) {
   5490           answer = body->ToNode(compiler, answer);
   5491         }
   5492         return answer;
   5493       }
   5494     }
   5495     if (max <= kMaxUnrolledMaxMatches && min == 0) {
   5496       DCHECK_LT(0, max);  // Due to the 'if' above.
   5497       RegExpExpansionLimiter limiter(compiler, max);
   5498       if (limiter.ok_to_expand()) {
   5499         // Unroll the optional matches up to max.
   5500         RegExpNode* answer = on_success;
   5501         for (int i = 0; i < max; i++) {
   5502           ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
   5503           if (is_greedy) {
   5504             alternation->AddAlternative(
   5505                 GuardedAlternative(body->ToNode(compiler, answer)));
   5506             alternation->AddAlternative(GuardedAlternative(on_success));
   5507           } else {
   5508             alternation->AddAlternative(GuardedAlternative(on_success));
   5509             alternation->AddAlternative(
   5510                 GuardedAlternative(body->ToNode(compiler, answer)));
   5511           }
   5512           answer = alternation;
   5513           if (not_at_start && !compiler->read_backward()) {
   5514             alternation->set_not_at_start();
   5515           }
   5516         }
   5517         return answer;
   5518       }
   5519     }
   5520   }
   5521   bool has_min = min > 0;
   5522   bool has_max = max < RegExpTree::kInfinity;
   5523   bool needs_counter = has_min || has_max;
   5524   int reg_ctr = needs_counter
   5525       ? compiler->AllocateRegister()
   5526       : RegExpCompiler::kNoRegister;
   5527   LoopChoiceNode* center = new (zone)
   5528       LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
   5529   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
   5530   RegExpNode* loop_return = needs_counter
   5531       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
   5532       : static_cast<RegExpNode*>(center);
   5533   if (body_can_be_empty) {
   5534     // If the body can be empty we need to check if it was and then
   5535     // backtrack.
   5536     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
   5537                                               reg_ctr,
   5538                                               min,
   5539                                               loop_return);
   5540   }
   5541   RegExpNode* body_node = body->ToNode(compiler, loop_return);
   5542   if (body_can_be_empty) {
   5543     // If the body can be empty we need to store the start position
   5544     // so we can bail out if it was empty.
   5545     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
   5546   }
   5547   if (needs_capture_clearing) {
   5548     // Before entering the body of this loop we need to clear captures.
   5549     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
   5550   }
   5551   GuardedAlternative body_alt(body_node);
   5552   if (has_max) {
   5553     Guard* body_guard =
   5554         new(zone) Guard(reg_ctr, Guard::LT, max);
   5555     body_alt.AddGuard(body_guard, zone);
   5556   }
   5557   GuardedAlternative rest_alt(on_success);
   5558   if (has_min) {
   5559     Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
   5560     rest_alt.AddGuard(rest_guard, zone);
   5561   }
   5562   if (is_greedy) {
   5563     center->AddLoopAlternative(body_alt);
   5564     center->AddContinueAlternative(rest_alt);
   5565   } else {
   5566     center->AddContinueAlternative(rest_alt);
   5567     center->AddLoopAlternative(body_alt);
   5568   }
   5569   if (needs_counter) {
   5570     return ActionNode::SetRegister(reg_ctr, 0, center);
   5571   } else {
   5572     return center;
   5573   }
   5574 }
   5575 
   5576 namespace {
   5577 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
   5578 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
   5579 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
   5580                                           RegExpNode* on_success,
   5581                                           RegExpAssertion::AssertionType type,
   5582                                           JSRegExp::Flags flags) {
   5583   DCHECK(NeedsUnicodeCaseEquivalents(flags));
   5584   Zone* zone = compiler->zone();
   5585   ZoneList<CharacterRange>* word_range =
   5586       new (zone) ZoneList<CharacterRange>(2, zone);
   5587   CharacterRange::AddClassEscape('w', word_range, true, zone);
   5588   int stack_register = compiler->UnicodeLookaroundStackRegister();
   5589   int position_register = compiler->UnicodeLookaroundPositionRegister();
   5590   ChoiceNode* result = new (zone) ChoiceNode(2, zone);
   5591   // Add two choices. The (non-)boundary could start with a word or
   5592   // a non-word-character.
   5593   for (int i = 0; i < 2; i++) {
   5594     bool lookbehind_for_word = i == 0;
   5595     bool lookahead_for_word =
   5596         (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
   5597     // Look to the left.
   5598     RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
   5599                                          stack_register, position_register);
   5600     RegExpNode* backward = TextNode::CreateForCharacterRanges(
   5601         zone, word_range, true, lookbehind.on_match_success(), flags);
   5602     // Look to the right.
   5603     RegExpLookaround::Builder lookahead(lookahead_for_word,
   5604                                         lookbehind.ForMatch(backward),
   5605                                         stack_register, position_register);
   5606     RegExpNode* forward = TextNode::CreateForCharacterRanges(
   5607         zone, word_range, false, lookahead.on_match_success(), flags);
   5608     result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
   5609   }
   5610   return result;
   5611 }
   5612 }  // anonymous namespace
   5613 
   5614 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
   5615                                     RegExpNode* on_success) {
   5616   NodeInfo info;
   5617   Zone* zone = compiler->zone();
   5618 
   5619   switch (assertion_type()) {
   5620     case START_OF_LINE:
   5621       return AssertionNode::AfterNewline(on_success);
   5622     case START_OF_INPUT:
   5623       return AssertionNode::AtStart(on_success);
   5624     case BOUNDARY:
   5625       return NeedsUnicodeCaseEquivalents(flags_)
   5626                  ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
   5627                                                  flags_)
   5628                  : AssertionNode::AtBoundary(on_success);
   5629     case NON_BOUNDARY:
   5630       return NeedsUnicodeCaseEquivalents(flags_)
   5631                  ? BoundaryAssertionAsLookaround(compiler, on_success,
   5632                                                  NON_BOUNDARY, flags_)
   5633                  : AssertionNode::AtNonBoundary(on_success);
   5634     case END_OF_INPUT:
   5635       return AssertionNode::AtEnd(on_success);
   5636     case END_OF_LINE: {
   5637       // Compile $ in multiline regexps as an alternation with a positive
   5638       // lookahead in one side and an end-of-input on the other side.
   5639       // We need two registers for the lookahead.
   5640       int stack_pointer_register = compiler->AllocateRegister();
   5641       int position_register = compiler->AllocateRegister();
   5642       // The ChoiceNode to distinguish between a newline and end-of-input.
   5643       ChoiceNode* result = new(zone) ChoiceNode(2, zone);
   5644       // Create a newline atom.
   5645       ZoneList<CharacterRange>* newline_ranges =
   5646           new(zone) ZoneList<CharacterRange>(3, zone);
   5647       CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
   5648       JSRegExp::Flags default_flags = JSRegExp::Flags();
   5649       RegExpCharacterClass* newline_atom =
   5650           new (zone) RegExpCharacterClass('n', default_flags);
   5651       TextNode* newline_matcher = new (zone) TextNode(
   5652           newline_atom, false, ActionNode::PositiveSubmatchSuccess(
   5653                                    stack_pointer_register, position_register,
   5654                                    0,   // No captures inside.
   5655                                    -1,  // Ignored if no captures.
   5656                                    on_success));
   5657       // Create an end-of-input matcher.
   5658       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
   5659           stack_pointer_register,
   5660           position_register,
   5661           newline_matcher);
   5662       // Add the two alternatives to the ChoiceNode.
   5663       GuardedAlternative eol_alternative(end_of_line);
   5664       result->AddAlternative(eol_alternative);
   5665       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
   5666       result->AddAlternative(end_alternative);
   5667       return result;
   5668     }
   5669     default:
   5670       UNREACHABLE();
   5671   }
   5672   return on_success;
   5673 }
   5674 
   5675 
   5676 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
   5677                                         RegExpNode* on_success) {
   5678   return new (compiler->zone())
   5679       BackReferenceNode(RegExpCapture::StartRegister(index()),
   5680                         RegExpCapture::EndRegister(index()), flags_,
   5681                         compiler->read_backward(), on_success);
   5682 }
   5683 
   5684 
   5685 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
   5686                                 RegExpNode* on_success) {
   5687   return on_success;
   5688 }
   5689 
   5690 
   5691 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
   5692                                    int stack_pointer_register,
   5693                                    int position_register,
   5694                                    int capture_register_count,
   5695                                    int capture_register_start)
   5696     : is_positive_(is_positive),
   5697       on_success_(on_success),
   5698       stack_pointer_register_(stack_pointer_register),
   5699       position_register_(position_register) {
   5700   if (is_positive_) {
   5701     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
   5702         stack_pointer_register, position_register, capture_register_count,
   5703         capture_register_start, on_success_);
   5704   } else {
   5705     Zone* zone = on_success_->zone();
   5706     on_match_success_ = new (zone) NegativeSubmatchSuccess(
   5707         stack_pointer_register, position_register, capture_register_count,
   5708         capture_register_start, zone);
   5709   }
   5710 }
   5711 
   5712 
   5713 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
   5714   if (is_positive_) {
   5715     return ActionNode::BeginSubmatch(stack_pointer_register_,
   5716                                      position_register_, match);
   5717   } else {
   5718     Zone* zone = on_success_->zone();
   5719     // We use a ChoiceNode to represent the negative lookaround. The first
   5720     // alternative is the negative match. On success, the end node backtracks.
   5721     // On failure, the second alternative is tried and leads to success.
   5722     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
   5723     // first exit when calculating quick checks.
   5724     ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
   5725         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
   5726     return ActionNode::BeginSubmatch(stack_pointer_register_,
   5727                                      position_register_, choice_node);
   5728   }
   5729 }
   5730 
   5731 
   5732 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
   5733                                      RegExpNode* on_success) {
   5734   int stack_pointer_register = compiler->AllocateRegister();
   5735   int position_register = compiler->AllocateRegister();
   5736 
   5737   const int registers_per_capture = 2;
   5738   const int register_of_first_capture = 2;
   5739   int register_count = capture_count_ * registers_per_capture;
   5740   int register_start =
   5741     register_of_first_capture + capture_from_ * registers_per_capture;
   5742 
   5743   RegExpNode* result;
   5744   bool was_reading_backward = compiler->read_backward();
   5745   compiler->set_read_backward(type() == LOOKBEHIND);
   5746   Builder builder(is_positive(), on_success, stack_pointer_register,
   5747                   position_register, register_count, register_start);
   5748   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
   5749   result = builder.ForMatch(match);
   5750   compiler->set_read_backward(was_reading_backward);
   5751   return result;
   5752 }
   5753 
   5754 
   5755 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
   5756                                   RegExpNode* on_success) {
   5757   return ToNode(body(), index(), compiler, on_success);
   5758 }
   5759 
   5760 
   5761 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
   5762                                   int index,
   5763                                   RegExpCompiler* compiler,
   5764                                   RegExpNode* on_success) {
   5765   DCHECK_NOT_NULL(body);
   5766   int start_reg = RegExpCapture::StartRegister(index);
   5767   int end_reg = RegExpCapture::EndRegister(index);
   5768   if (compiler->read_backward()) std::swap(start_reg, end_reg);
   5769   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
   5770   RegExpNode* body_node = body->ToNode(compiler, store_end);
   5771   return ActionNode::StorePosition(start_reg, true, body_node);
   5772 }
   5773 
   5774 
   5775 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
   5776                                       RegExpNode* on_success) {
   5777   ZoneList<RegExpTree*>* children = nodes();
   5778   RegExpNode* current = on_success;
   5779   if (compiler->read_backward()) {
   5780     for (int i = 0; i < children->length(); i++) {
   5781       current = children->at(i)->ToNode(compiler, current);
   5782     }
   5783   } else {
   5784     for (int i = children->length() - 1; i >= 0; i--) {
   5785       current = children->at(i)->ToNode(compiler, current);
   5786     }
   5787   }
   5788   return current;
   5789 }
   5790 
   5791 
   5792 static void AddClass(const int* elmv,
   5793                      int elmc,
   5794                      ZoneList<CharacterRange>* ranges,
   5795                      Zone* zone) {
   5796   elmc--;
   5797   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
   5798   for (int i = 0; i < elmc; i += 2) {
   5799     DCHECK(elmv[i] < elmv[i + 1]);
   5800     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
   5801   }
   5802 }
   5803 
   5804 
   5805 static void AddClassNegated(const int *elmv,
   5806                             int elmc,
   5807                             ZoneList<CharacterRange>* ranges,
   5808                             Zone* zone) {
   5809   elmc--;
   5810   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
   5811   DCHECK_NE(0x0000, elmv[0]);
   5812   DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
   5813   uc16 last = 0x0000;
   5814   for (int i = 0; i < elmc; i += 2) {
   5815     DCHECK(last <= elmv[i] - 1);
   5816     DCHECK(elmv[i] < elmv[i + 1]);
   5817     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
   5818     last = elmv[i + 1];
   5819   }
   5820   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
   5821 }
   5822 
   5823 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
   5824                                     bool add_unicode_case_equivalents,
   5825                                     Zone* zone) {
   5826   if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
   5827     // See #sec-runtime-semantics-wordcharacters-abstract-operation
   5828     // In case of unicode and ignore_case, we need to create the closure over
   5829     // case equivalent characters before negating.
   5830     ZoneList<CharacterRange>* new_ranges =
   5831         new (zone) ZoneList<CharacterRange>(2, zone);
   5832     AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
   5833     AddUnicodeCaseEquivalents(new_ranges, zone);
   5834     if (type == 'W') {
   5835       ZoneList<CharacterRange>* negated =
   5836           new (zone) ZoneList<CharacterRange>(2, zone);
   5837       CharacterRange::Negate(new_ranges, negated, zone);
   5838       new_ranges = negated;
   5839     }
   5840     ranges->AddAll(*new_ranges, zone);
   5841     return;
   5842   }
   5843   AddClassEscape(type, ranges, zone);
   5844 }
   5845 
   5846 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
   5847                                     Zone* zone) {
   5848   switch (type) {
   5849     case 's':
   5850       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
   5851       break;
   5852     case 'S':
   5853       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
   5854       break;
   5855     case 'w':
   5856       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
   5857       break;
   5858     case 'W':
   5859       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
   5860       break;
   5861     case 'd':
   5862       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
   5863       break;
   5864     case 'D':
   5865       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
   5866       break;
   5867     case '.':
   5868       AddClassNegated(kLineTerminatorRanges,
   5869                       kLineTerminatorRangeCount,
   5870                       ranges,
   5871                       zone);
   5872       break;
   5873     // This is not a character range as defined by the spec but a
   5874     // convenient shorthand for a character class that matches any
   5875     // character.
   5876     case '*':
   5877       ranges->Add(CharacterRange::Everything(), zone);
   5878       break;
   5879     // This is the set of characters matched by the $ and ^ symbols
   5880     // in multiline mode.
   5881     case 'n':
   5882       AddClass(kLineTerminatorRanges,
   5883                kLineTerminatorRangeCount,
   5884                ranges,
   5885                zone);
   5886       break;
   5887     default:
   5888       UNREACHABLE();
   5889   }
   5890 }
   5891 
   5892 
   5893 Vector<const int> CharacterRange::GetWordBounds() {
   5894   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
   5895 }
   5896 
   5897 // static
   5898 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
   5899                                         ZoneList<CharacterRange>* ranges,
   5900                                         bool is_one_byte) {
   5901   CharacterRange::Canonicalize(ranges);
   5902   int range_count = ranges->length();
   5903   for (int i = 0; i < range_count; i++) {
   5904     CharacterRange range = ranges->at(i);
   5905     uc32 bottom = range.from();
   5906     if (bottom > String::kMaxUtf16CodeUnit) continue;
   5907     uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
   5908     // Nothing to be done for surrogates.
   5909     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
   5910     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
   5911       if (bottom > String::kMaxOneByteCharCode) continue;
   5912       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
   5913     }
   5914     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   5915     if (top == bottom) {
   5916       // If this is a singleton we just expand the one character.
   5917       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
   5918       for (int i = 0; i < length; i++) {
   5919         uc32 chr = chars[i];
   5920         if (chr != bottom) {
   5921           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
   5922         }
   5923       }
   5924     } else {
   5925       // If this is a range we expand the characters block by block, expanding
   5926       // contiguous subranges (blocks) one at a time.  The approach is as
   5927       // follows.  For a given start character we look up the remainder of the
   5928       // block that contains it (represented by the end point), for instance we
   5929       // find 'z' if the character is 'c'.  A block is characterized by the
   5930       // property that all characters uncanonicalize in the same way, except
   5931       // that each entry in the result is incremented by the distance from the
   5932       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
   5933       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
   5934       // we've found the end point we look up its uncanonicalization and
   5935       // produce a range for each element.  For instance for [c-f] we look up
   5936       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
   5937       // it is not already contained in the input, so [c-f] will be skipped but
   5938       // [C-F] will be added.  If this range is not completely contained in a
   5939       // block we do this for all the blocks covered by the range (handling
   5940       // characters that is not in a block as a "singleton block").
   5941       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   5942       int pos = bottom;
   5943       while (pos <= top) {
   5944         int length =
   5945             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
   5946         uc32 block_end;
   5947         if (length == 0) {
   5948           block_end = pos;
   5949         } else {
   5950           DCHECK_EQ(1, length);
   5951           block_end = equivalents[0];
   5952         }
   5953         int end = (block_end > top) ? top : block_end;
   5954         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
   5955                                                          equivalents);
   5956         for (int i = 0; i < length; i++) {
   5957           uc32 c = equivalents[i];
   5958           uc32 range_from = c - (block_end - pos);
   5959           uc32 range_to = c - (block_end - end);
   5960           if (!(bottom <= range_from && range_to <= top)) {
   5961             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
   5962           }
   5963         }
   5964         pos = end + 1;
   5965       }
   5966     }
   5967   }
   5968 }
   5969 
   5970 
   5971 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
   5972   DCHECK_NOT_NULL(ranges);
   5973   int n = ranges->length();
   5974   if (n <= 1) return true;
   5975   int max = ranges->at(0).to();
   5976   for (int i = 1; i < n; i++) {
   5977     CharacterRange next_range = ranges->at(i);
   5978     if (next_range.from() <= max + 1) return false;
   5979     max = next_range.to();
   5980   }
   5981   return true;
   5982 }
   5983 
   5984 
   5985 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
   5986   if (ranges_ == nullptr) {
   5987     ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
   5988     CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
   5989   }
   5990   return ranges_;
   5991 }
   5992 
   5993 
   5994 // Move a number of elements in a zonelist to another position
   5995 // in the same list. Handles overlapping source and target areas.
   5996 static void MoveRanges(ZoneList<CharacterRange>* list,
   5997                        int from,
   5998                        int to,
   5999                        int count) {
   6000   // Ranges are potentially overlapping.
   6001   if (from < to) {
   6002     for (int i = count - 1; i >= 0; i--) {
   6003       list->at(to + i) = list->at(from + i);
   6004     }
   6005   } else {
   6006     for (int i = 0; i < count; i++) {
   6007       list->at(to + i) = list->at(from + i);
   6008     }
   6009   }
   6010 }
   6011 
   6012 
   6013 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
   6014                                       int count,
   6015                                       CharacterRange insert) {
   6016   // Inserts a range into list[0..count[, which must be sorted
   6017   // by from value and non-overlapping and non-adjacent, using at most
   6018   // list[0..count] for the result. Returns the number of resulting
   6019   // canonicalized ranges. Inserting a range may collapse existing ranges into
   6020   // fewer ranges, so the return value can be anything in the range 1..count+1.
   6021   uc32 from = insert.from();
   6022   uc32 to = insert.to();
   6023   int start_pos = 0;
   6024   int end_pos = count;
   6025   for (int i = count - 1; i >= 0; i--) {
   6026     CharacterRange current = list->at(i);
   6027     if (current.from() > to + 1) {
   6028       end_pos = i;
   6029     } else if (current.to() + 1 < from) {
   6030       start_pos = i + 1;
   6031       break;
   6032     }
   6033   }
   6034 
   6035   // Inserted range overlaps, or is adjacent to, ranges at positions
   6036   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
   6037   // not affected by the insertion.
   6038   // If start_pos == end_pos, the range must be inserted before start_pos.
   6039   // if start_pos < end_pos, the entire range from start_pos to end_pos
   6040   // must be merged with the insert range.
   6041 
   6042   if (start_pos == end_pos) {
   6043     // Insert between existing ranges at position start_pos.
   6044     if (start_pos < count) {
   6045       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
   6046     }
   6047     list->at(start_pos) = insert;
   6048     return count + 1;
   6049   }
   6050   if (start_pos + 1 == end_pos) {
   6051     // Replace single existing range at position start_pos.
   6052     CharacterRange to_replace = list->at(start_pos);
   6053     int new_from = Min(to_replace.from(), from);
   6054     int new_to = Max(to_replace.to(), to);
   6055     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
   6056     return count;
   6057   }
   6058   // Replace a number of existing ranges from start_pos to end_pos - 1.
   6059   // Move the remaining ranges down.
   6060 
   6061   int new_from = Min(list->at(start_pos).from(), from);
   6062   int new_to = Max(list->at(end_pos - 1).to(), to);
   6063   if (end_pos < count) {
   6064     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
   6065   }
   6066   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
   6067   return count - (end_pos - start_pos) + 1;
   6068 }
   6069 
   6070 
   6071 void CharacterSet::Canonicalize() {
   6072   // Special/default classes are always considered canonical. The result
   6073   // of calling ranges() will be sorted.
   6074   if (ranges_ == nullptr) return;
   6075   CharacterRange::Canonicalize(ranges_);
   6076 }
   6077 
   6078 
   6079 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
   6080   if (character_ranges->length() <= 1) return;
   6081   // Check whether ranges are already canonical (increasing, non-overlapping,
   6082   // non-adjacent).
   6083   int n = character_ranges->length();
   6084   int max = character_ranges->at(0).to();
   6085   int i = 1;
   6086   while (i < n) {
   6087     CharacterRange current = character_ranges->at(i);
   6088     if (current.from() <= max + 1) {
   6089       break;
   6090     }
   6091     max = current.to();
   6092     i++;
   6093   }
   6094   // Canonical until the i'th range. If that's all of them, we are done.
   6095   if (i == n) return;
   6096 
   6097   // The ranges at index i and forward are not canonicalized. Make them so by
   6098   // doing the equivalent of insertion sort (inserting each into the previous
   6099   // list, in order).
   6100   // Notice that inserting a range can reduce the number of ranges in the
   6101   // result due to combining of adjacent and overlapping ranges.
   6102   int read = i;  // Range to insert.
   6103   int num_canonical = i;  // Length of canonicalized part of list.
   6104   do {
   6105     num_canonical = InsertRangeInCanonicalList(character_ranges,
   6106                                                num_canonical,
   6107                                                character_ranges->at(read));
   6108     read++;
   6109   } while (read < n);
   6110   character_ranges->Rewind(num_canonical);
   6111 
   6112   DCHECK(CharacterRange::IsCanonical(character_ranges));
   6113 }
   6114 
   6115 
   6116 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
   6117                             ZoneList<CharacterRange>* negated_ranges,
   6118                             Zone* zone) {
   6119   DCHECK(CharacterRange::IsCanonical(ranges));
   6120   DCHECK_EQ(0, negated_ranges->length());
   6121   int range_count = ranges->length();
   6122   uc32 from = 0;
   6123   int i = 0;
   6124   if (range_count > 0 && ranges->at(0).from() == 0) {
   6125     from = ranges->at(0).to() + 1;
   6126     i = 1;
   6127   }
   6128   while (i < range_count) {
   6129     CharacterRange range = ranges->at(i);
   6130     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
   6131     from = range.to() + 1;
   6132     i++;
   6133   }
   6134   if (from < String::kMaxCodePoint) {
   6135     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
   6136                         zone);
   6137   }
   6138 }
   6139 
   6140 
   6141 // -------------------------------------------------------------------
   6142 // Splay tree
   6143 
   6144 
   6145 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
   6146   if (Get(value))
   6147     return this;
   6148   if (successors(zone) != nullptr) {
   6149     for (int i = 0; i < successors(zone)->length(); i++) {
   6150       OutSet* successor = successors(zone)->at(i);
   6151       if (successor->Get(value))
   6152         return successor;
   6153     }
   6154   } else {
   6155     successors_ = new(zone) ZoneList<OutSet*>(2, zone);
   6156   }
   6157   OutSet* result = new(zone) OutSet(first_, remaining_);
   6158   result->Set(value, zone);
   6159   successors(zone)->Add(result, zone);
   6160   return result;
   6161 }
   6162 
   6163 
   6164 void OutSet::Set(unsigned value, Zone *zone) {
   6165   if (value < kFirstLimit) {
   6166     first_ |= (1 << value);
   6167   } else {
   6168     if (remaining_ == nullptr)
   6169       remaining_ = new(zone) ZoneList<unsigned>(1, zone);
   6170     if (remaining_->is_empty() || !remaining_->Contains(value))
   6171       remaining_->Add(value, zone);
   6172   }
   6173 }
   6174 
   6175 
   6176 bool OutSet::Get(unsigned value) const {
   6177   if (value < kFirstLimit) {
   6178     return (first_ & (1 << value)) != 0;
   6179   } else if (remaining_ == nullptr) {
   6180     return false;
   6181   } else {
   6182     return remaining_->Contains(value);
   6183   }
   6184 }
   6185 
   6186 
   6187 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
   6188 
   6189 
   6190 void DispatchTable::AddRange(CharacterRange full_range, int value,
   6191                              Zone* zone) {
   6192   CharacterRange current = full_range;
   6193   if (tree()->is_empty()) {
   6194     // If this is the first range we just insert into the table.
   6195     ZoneSplayTree<Config>::Locator loc;
   6196     bool inserted = tree()->Insert(current.from(), &loc);
   6197     DCHECK(inserted);
   6198     USE(inserted);
   6199     loc.set_value(Entry(current.from(), current.to(),
   6200                         empty()->Extend(value, zone)));
   6201     return;
   6202   }
   6203   // First see if there is a range to the left of this one that
   6204   // overlaps.
   6205   ZoneSplayTree<Config>::Locator loc;
   6206   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
   6207     Entry* entry = &loc.value();
   6208     // If we've found a range that overlaps with this one, and it
   6209     // starts strictly to the left of this one, we have to fix it
   6210     // because the following code only handles ranges that start on
   6211     // or after the start point of the range we're adding.
   6212     if (entry->from() < current.from() && entry->to() >= current.from()) {
   6213       // Snap the overlapping range in half around the start point of
   6214       // the range we're adding.
   6215       CharacterRange left =
   6216           CharacterRange::Range(entry->from(), current.from() - 1);
   6217       CharacterRange right = CharacterRange::Range(current.from(), entry->to());
   6218       // The left part of the overlapping range doesn't overlap.
   6219       // Truncate the whole entry to be just the left part.
   6220       entry->set_to(left.to());
   6221       // The right part is the one that overlaps.  We add this part
   6222       // to the map and let the next step deal with merging it with
   6223       // the range we're adding.
   6224       ZoneSplayTree<Config>::Locator loc;
   6225       bool inserted = tree()->Insert(right.from(), &loc);
   6226       DCHECK(inserted);
   6227       USE(inserted);
   6228       loc.set_value(Entry(right.from(),
   6229                           right.to(),
   6230                           entry->out_set()));
   6231     }
   6232   }
   6233   while (current.is_valid()) {
   6234     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
   6235         (loc.value().from() <= current.to()) &&
   6236         (loc.value().to() >= current.from())) {
   6237       Entry* entry = &loc.value();
   6238       // We have overlap.  If there is space between the start point of
   6239       // the range we're adding and where the overlapping range starts
   6240       // then we have to add a range covering just that space.
   6241       if (current.from() < entry->from()) {
   6242         ZoneSplayTree<Config>::Locator ins;
   6243         bool inserted = tree()->Insert(current.from(), &ins);
   6244         DCHECK(inserted);
   6245         USE(inserted);
   6246         ins.set_value(Entry(current.from(),
   6247                             entry->from() - 1,
   6248                             empty()->Extend(value, zone)));
   6249         current.set_from(entry->from());
   6250       }
   6251       DCHECK_EQ(current.from(), entry->from());
   6252       // If the overlapping range extends beyond the one we want to add
   6253       // we have to snap the right part off and add it separately.
   6254       if (entry->to() > current.to()) {
   6255         ZoneSplayTree<Config>::Locator ins;
   6256         bool inserted = tree()->Insert(current.to() + 1, &ins);
   6257         DCHECK(inserted);
   6258         USE(inserted);
   6259         ins.set_value(Entry(current.to() + 1,
   6260                             entry->to(),
   6261                             entry->out_set()));
   6262         entry->set_to(current.to());
   6263       }
   6264       DCHECK(entry->to() <= current.to());
   6265       // The overlapping range is now completely contained by the range
   6266       // we're adding so we can just update it and move the start point
   6267       // of the range we're adding just past it.
   6268       entry->AddValue(value, zone);
   6269       DCHECK(entry->to() + 1 > current.from());
   6270       current.set_from(entry->to() + 1);
   6271     } else {
   6272       // There is no overlap so we can just add the range
   6273       ZoneSplayTree<Config>::Locator ins;
   6274       bool inserted = tree()->Insert(current.from(), &ins);
   6275       DCHECK(inserted);
   6276       USE(inserted);
   6277       ins.set_value(Entry(current.from(),
   6278                           current.to(),
   6279                           empty()->Extend(value, zone)));
   6280       break;
   6281     }
   6282   }
   6283 }
   6284 
   6285 
   6286 OutSet* DispatchTable::Get(uc32 value) {
   6287   ZoneSplayTree<Config>::Locator loc;
   6288   if (!tree()->FindGreatestLessThan(value, &loc))
   6289     return empty();
   6290   Entry* entry = &loc.value();
   6291   if (value <= entry->to())
   6292     return entry->out_set();
   6293   else
   6294     return empty();
   6295 }
   6296 
   6297 
   6298 // -------------------------------------------------------------------
   6299 // Analysis
   6300 
   6301 
   6302 void Analysis::EnsureAnalyzed(RegExpNode* that) {
   6303   StackLimitCheck check(isolate());
   6304   if (check.HasOverflowed()) {
   6305     fail("Stack overflow");
   6306     return;
   6307   }
   6308   if (that->info()->been_analyzed || that->info()->being_analyzed)
   6309     return;
   6310   that->info()->being_analyzed = true;
   6311   that->Accept(this);
   6312   that->info()->being_analyzed = false;
   6313   that->info()->been_analyzed = true;
   6314 }
   6315 
   6316 
   6317 void Analysis::VisitEnd(EndNode* that) {
   6318   // nothing to do
   6319 }
   6320 
   6321 
   6322 void TextNode::CalculateOffsets() {
   6323   int element_count = elements()->length();
   6324   // Set up the offsets of the elements relative to the start.  This is a fixed
   6325   // quantity since a TextNode can only contain fixed-width things.
   6326   int cp_offset = 0;
   6327   for (int i = 0; i < element_count; i++) {
   6328     TextElement& elm = elements()->at(i);
   6329     elm.set_cp_offset(cp_offset);
   6330     cp_offset += elm.length();
   6331   }
   6332 }
   6333 
   6334 
   6335 void Analysis::VisitText(TextNode* that) {
   6336   that->MakeCaseIndependent(isolate(), is_one_byte_);
   6337   EnsureAnalyzed(that->on_success());
   6338   if (!has_failed()) {
   6339     that->CalculateOffsets();
   6340   }
   6341 }
   6342 
   6343 
   6344 void Analysis::VisitAction(ActionNode* that) {
   6345   RegExpNode* target = that->on_success();
   6346   EnsureAnalyzed(target);
   6347   if (!has_failed()) {
   6348     // If the next node is interested in what it follows then this node
   6349     // has to be interested too so it can pass the information on.
   6350     that->info()->AddFromFollowing(target->info());
   6351   }
   6352 }
   6353 
   6354 
   6355 void Analysis::VisitChoice(ChoiceNode* that) {
   6356   NodeInfo* info = that->info();
   6357   for (int i = 0; i < that->alternatives()->length(); i++) {
   6358     RegExpNode* node = that->alternatives()->at(i).node();
   6359     EnsureAnalyzed(node);
   6360     if (has_failed()) return;
   6361     // Anything the following nodes need to know has to be known by
   6362     // this node also, so it can pass it on.
   6363     info->AddFromFollowing(node->info());
   6364   }
   6365 }
   6366 
   6367 
   6368 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
   6369   NodeInfo* info = that->info();
   6370   for (int i = 0; i < that->alternatives()->length(); i++) {
   6371     RegExpNode* node = that->alternatives()->at(i).node();
   6372     if (node != that->loop_node()) {
   6373       EnsureAnalyzed(node);
   6374       if (has_failed()) return;
   6375       info->AddFromFollowing(node->info());
   6376     }
   6377   }
   6378   // Check the loop last since it may need the value of this node
   6379   // to get a correct result.
   6380   EnsureAnalyzed(that->loop_node());
   6381   if (!has_failed()) {
   6382     info->AddFromFollowing(that->loop_node()->info());
   6383   }
   6384 }
   6385 
   6386 
   6387 void Analysis::VisitBackReference(BackReferenceNode* that) {
   6388   EnsureAnalyzed(that->on_success());
   6389 }
   6390 
   6391 
   6392 void Analysis::VisitAssertion(AssertionNode* that) {
   6393   EnsureAnalyzed(that->on_success());
   6394 }
   6395 
   6396 
   6397 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   6398                                      BoyerMooreLookahead* bm,
   6399                                      bool not_at_start) {
   6400   // Working out the set of characters that a backreference can match is too
   6401   // hard, so we just say that any character can match.
   6402   bm->SetRest(offset);
   6403   SaveBMInfo(bm, not_at_start, offset);
   6404 }
   6405 
   6406 
   6407 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
   6408               RegExpMacroAssembler::kTableSize);
   6409 
   6410 
   6411 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
   6412                               BoyerMooreLookahead* bm, bool not_at_start) {
   6413   ZoneList<GuardedAlternative>* alts = alternatives();
   6414   budget = (budget - 1) / alts->length();
   6415   for (int i = 0; i < alts->length(); i++) {
   6416     GuardedAlternative& alt = alts->at(i);
   6417     if (alt.guards() != nullptr && alt.guards()->length() != 0) {
   6418       bm->SetRest(offset);  // Give up trying to fill in info.
   6419       SaveBMInfo(bm, not_at_start, offset);
   6420       return;
   6421     }
   6422     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
   6423   }
   6424   SaveBMInfo(bm, not_at_start, offset);
   6425 }
   6426 
   6427 
   6428 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
   6429                             BoyerMooreLookahead* bm, bool not_at_start) {
   6430   if (initial_offset >= bm->length()) return;
   6431   int offset = initial_offset;
   6432   int max_char = bm->max_char();
   6433   for (int i = 0; i < elements()->length(); i++) {
   6434     if (offset >= bm->length()) {
   6435       if (initial_offset == 0) set_bm_info(not_at_start, bm);
   6436       return;
   6437     }
   6438     TextElement text = elements()->at(i);
   6439     if (text.text_type() == TextElement::ATOM) {
   6440       RegExpAtom* atom = text.atom();
   6441       for (int j = 0; j < atom->length(); j++, offset++) {
   6442         if (offset >= bm->length()) {
   6443           if (initial_offset == 0) set_bm_info(not_at_start, bm);
   6444           return;
   6445         }
   6446         uc16 character = atom->data()[j];
   6447         if (IgnoreCase(atom->flags())) {
   6448           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   6449           int length = GetCaseIndependentLetters(
   6450               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
   6451               chars);
   6452           for (int j = 0; j < length; j++) {
   6453             bm->Set(offset, chars[j]);
   6454           }
   6455         } else {
   6456           if (character <= max_char) bm->Set(offset, character);
   6457         }
   6458       }
   6459     } else {
   6460       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
   6461       RegExpCharacterClass* char_class = text.char_class();
   6462       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
   6463       if (char_class->is_negated()) {
   6464         bm->SetAll(offset);
   6465       } else {
   6466         for (int k = 0; k < ranges->length(); k++) {
   6467           CharacterRange& range = ranges->at(k);
   6468           if (range.from() > max_char) continue;
   6469           int to = Min(max_char, static_cast<int>(range.to()));
   6470           bm->SetInterval(offset, Interval(range.from(), to));
   6471         }
   6472       }
   6473       offset++;
   6474     }
   6475   }
   6476   if (offset >= bm->length()) {
   6477     if (initial_offset == 0) set_bm_info(not_at_start, bm);
   6478     return;
   6479   }
   6480   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
   6481                              true);  // Not at start after a text node.
   6482   if (initial_offset == 0) set_bm_info(not_at_start, bm);
   6483 }
   6484 
   6485 
   6486 // -------------------------------------------------------------------
   6487 // Dispatch table construction
   6488 
   6489 
   6490 void DispatchTableConstructor::VisitEnd(EndNode* that) {
   6491   AddRange(CharacterRange::Everything());
   6492 }
   6493 
   6494 
   6495 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
   6496   node->set_being_calculated(true);
   6497   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
   6498   for (int i = 0; i < alternatives->length(); i++) {
   6499     set_choice_index(i);
   6500     alternatives->at(i).node()->Accept(this);
   6501   }
   6502   node->set_being_calculated(false);
   6503 }
   6504 
   6505 
   6506 class AddDispatchRange {
   6507  public:
   6508   explicit AddDispatchRange(DispatchTableConstructor* constructor)
   6509     : constructor_(constructor) { }
   6510   void Call(uc32 from, DispatchTable::Entry entry);
   6511  private:
   6512   DispatchTableConstructor* constructor_;
   6513 };
   6514 
   6515 
   6516 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
   6517   constructor_->AddRange(CharacterRange::Range(from, entry.to()));
   6518 }
   6519 
   6520 
   6521 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
   6522   if (node->being_calculated())
   6523     return;
   6524   DispatchTable* table = node->GetTable(ignore_case_);
   6525   AddDispatchRange adder(this);
   6526   table->ForEach(&adder);
   6527 }
   6528 
   6529 
   6530 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
   6531   // TODO(160): Find the node that we refer back to and propagate its start
   6532   // set back to here.  For now we just accept anything.
   6533   AddRange(CharacterRange::Everything());
   6534 }
   6535 
   6536 
   6537 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
   6538   RegExpNode* target = that->on_success();
   6539   target->Accept(this);
   6540 }
   6541 
   6542 
   6543 static int CompareRangeByFrom(const CharacterRange* a,
   6544                               const CharacterRange* b) {
   6545   return Compare<uc16>(a->from(), b->from());
   6546 }
   6547 
   6548 
   6549 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
   6550   ranges->Sort(CompareRangeByFrom);
   6551   uc16 last = 0;
   6552   for (int i = 0; i < ranges->length(); i++) {
   6553     CharacterRange range = ranges->at(i);
   6554     if (last < range.from())
   6555       AddRange(CharacterRange::Range(last, range.from() - 1));
   6556     if (range.to() >= last) {
   6557       if (range.to() == String::kMaxCodePoint) {
   6558         return;
   6559       } else {
   6560         last = range.to() + 1;
   6561       }
   6562     }
   6563   }
   6564   AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
   6565 }
   6566 
   6567 
   6568 void DispatchTableConstructor::VisitText(TextNode* that) {
   6569   TextElement elm = that->elements()->at(0);
   6570   switch (elm.text_type()) {
   6571     case TextElement::ATOM: {
   6572       uc16 c = elm.atom()->data()[0];
   6573       AddRange(CharacterRange::Range(c, c));
   6574       break;
   6575     }
   6576     case TextElement::CHAR_CLASS: {
   6577       RegExpCharacterClass* tree = elm.char_class();
   6578       ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
   6579       if (tree->is_negated()) {
   6580         AddInverse(ranges);
   6581       } else {
   6582         for (int i = 0; i < ranges->length(); i++)
   6583           AddRange(ranges->at(i));
   6584       }
   6585       break;
   6586     }
   6587     default: {
   6588       UNIMPLEMENTED();
   6589     }
   6590   }
   6591 }
   6592 
   6593 
   6594 void DispatchTableConstructor::VisitAction(ActionNode* that) {
   6595   RegExpNode* target = that->on_success();
   6596   target->Accept(this);
   6597 }
   6598 
   6599 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
   6600                                               RegExpNode* on_success,
   6601                                               JSRegExp::Flags flags) {
   6602   // If the regexp matching starts within a surrogate pair, step back
   6603   // to the lead surrogate and start matching from there.
   6604   DCHECK(!compiler->read_backward());
   6605   Zone* zone = compiler->zone();
   6606   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
   6607       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
   6608   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
   6609       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
   6610 
   6611   ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
   6612 
   6613   int stack_register = compiler->UnicodeLookaroundStackRegister();
   6614   int position_register = compiler->UnicodeLookaroundPositionRegister();
   6615   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
   6616       zone, lead_surrogates, true, on_success, flags);
   6617   RegExpLookaround::Builder builder(true, step_back, stack_register,
   6618                                     position_register);
   6619   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
   6620       zone, trail_surrogates, false, builder.on_match_success(), flags);
   6621 
   6622   optional_step_back->AddAlternative(
   6623       GuardedAlternative(builder.ForMatch(match_trail)));
   6624   optional_step_back->AddAlternative(GuardedAlternative(on_success));
   6625 
   6626   return optional_step_back;
   6627 }
   6628 
   6629 
   6630 RegExpEngine::CompilationResult RegExpEngine::Compile(
   6631     Isolate* isolate, Zone* zone, RegExpCompileData* data,
   6632     JSRegExp::Flags flags, Handle<String> pattern,
   6633     Handle<String> sample_subject, bool is_one_byte) {
   6634   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
   6635     return IrregexpRegExpTooBig(isolate);
   6636   }
   6637   bool is_sticky = IsSticky(flags);
   6638   bool is_global = IsGlobal(flags);
   6639   bool is_unicode = IsUnicode(flags);
   6640   RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
   6641 
   6642   if (compiler.optimize())
   6643     compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
   6644 
   6645   // Sample some characters from the middle of the string.
   6646   static const int kSampleSize = 128;
   6647 
   6648   sample_subject = String::Flatten(isolate, sample_subject);
   6649   int chars_sampled = 0;
   6650   int half_way = (sample_subject->length() - kSampleSize) / 2;
   6651   for (int i = Max(0, half_way);
   6652        i < sample_subject->length() && chars_sampled < kSampleSize;
   6653        i++, chars_sampled++) {
   6654     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
   6655   }
   6656 
   6657   // Wrap the body of the regexp in capture #0.
   6658   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
   6659                                                     0,
   6660                                                     &compiler,
   6661                                                     compiler.accept());
   6662   RegExpNode* node = captured_body;
   6663   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
   6664   bool is_start_anchored = data->tree->IsAnchoredAtStart();
   6665   int max_length = data->tree->max_match();
   6666   if (!is_start_anchored && !is_sticky) {
   6667     // Add a .*? at the beginning, outside the body capture, unless
   6668     // this expression is anchored at the beginning or sticky.
   6669     JSRegExp::Flags default_flags = JSRegExp::Flags();
   6670     RegExpNode* loop_node = RegExpQuantifier::ToNode(
   6671         0, RegExpTree::kInfinity, false,
   6672         new (zone) RegExpCharacterClass('*', default_flags), &compiler,
   6673         captured_body, data->contains_anchor);
   6674 
   6675     if (data->contains_anchor) {
   6676       // Unroll loop once, to take care of the case that might start
   6677       // at the start of input.
   6678       ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
   6679       first_step_node->AddAlternative(GuardedAlternative(captured_body));
   6680       first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
   6681           new (zone) RegExpCharacterClass('*', default_flags), false,
   6682           loop_node)));
   6683       node = first_step_node;
   6684     } else {
   6685       node = loop_node;
   6686     }
   6687   }
   6688   if (is_one_byte) {
   6689     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
   6690     // Do it again to propagate the new nodes to places where they were not
   6691     // put because they had not been calculated yet.
   6692     if (node != nullptr) {
   6693       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
   6694     }
   6695   } else if (is_unicode && (is_global || is_sticky)) {
   6696     node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
   6697   }
   6698 
   6699   if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
   6700   data->node = node;
   6701   Analysis analysis(isolate, is_one_byte);
   6702   analysis.EnsureAnalyzed(node);
   6703   if (analysis.has_failed()) {
   6704     const char* error_message = analysis.error_message();
   6705     return CompilationResult(isolate, error_message);
   6706   }
   6707 
   6708   // Create the correct assembler for the architecture.
   6709 #ifndef V8_INTERPRETED_REGEXP
   6710   // Native regexp implementation.
   6711 
   6712   NativeRegExpMacroAssembler::Mode mode =
   6713       is_one_byte ? NativeRegExpMacroAssembler::LATIN1
   6714                   : NativeRegExpMacroAssembler::UC16;
   6715 
   6716 #if V8_TARGET_ARCH_IA32
   6717   RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
   6718                                            (data->capture_count + 1) * 2);
   6719 #elif V8_TARGET_ARCH_X64
   6720   RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
   6721                                           (data->capture_count + 1) * 2);
   6722 #elif V8_TARGET_ARCH_ARM
   6723   RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
   6724                                           (data->capture_count + 1) * 2);
   6725 #elif V8_TARGET_ARCH_ARM64
   6726   RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
   6727                                             (data->capture_count + 1) * 2);
   6728 #elif V8_TARGET_ARCH_S390
   6729   RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
   6730                                            (data->capture_count + 1) * 2);
   6731 #elif V8_TARGET_ARCH_PPC
   6732   RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
   6733                                           (data->capture_count + 1) * 2);
   6734 #elif V8_TARGET_ARCH_MIPS
   6735   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
   6736                                            (data->capture_count + 1) * 2);
   6737 #elif V8_TARGET_ARCH_MIPS64
   6738   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
   6739                                            (data->capture_count + 1) * 2);
   6740 #else
   6741 #error "Unsupported architecture"
   6742 #endif
   6743 
   6744 #else  // V8_INTERPRETED_REGEXP
   6745   // Interpreted regexp implementation.
   6746   EmbeddedVector<byte, 1024> codes;
   6747   RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
   6748 #endif  // V8_INTERPRETED_REGEXP
   6749 
   6750   macro_assembler.set_slow_safe(TooMuchRegExpCode(isolate, pattern));
   6751 
   6752   // Inserted here, instead of in Assembler, because it depends on information
   6753   // in the AST that isn't replicated in the Node structure.
   6754   static const int kMaxBacksearchLimit = 1024;
   6755   if (is_end_anchored && !is_start_anchored && !is_sticky &&
   6756       max_length < kMaxBacksearchLimit) {
   6757     macro_assembler.SetCurrentPositionFromEnd(max_length);
   6758   }
   6759 
   6760   if (is_global) {
   6761     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
   6762     if (data->tree->min_match() > 0) {
   6763       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
   6764     } else if (is_unicode) {
   6765       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
   6766     }
   6767     macro_assembler.set_global_mode(mode);
   6768   }
   6769 
   6770   return compiler.Assemble(isolate, &macro_assembler, node, data->capture_count,
   6771                            pattern);
   6772 }
   6773 
   6774 bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
   6775   Heap* heap = isolate->heap();
   6776   bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
   6777   if (heap->isolate()->total_regexp_code_generated() >
   6778           RegExpImpl::kRegExpCompiledLimit &&
   6779       heap->CommittedMemoryExecutable() >
   6780           RegExpImpl::kRegExpExecutableMemoryLimit) {
   6781     too_much = true;
   6782   }
   6783   return too_much;
   6784 }
   6785 
   6786 
   6787 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
   6788                                    Object* key_pattern,
   6789                                    FixedArray** last_match_cache,
   6790                                    ResultsCacheType type) {
   6791   FixedArray* cache;
   6792   if (!key_string->IsInternalizedString()) return Smi::kZero;
   6793   if (type == STRING_SPLIT_SUBSTRINGS) {
   6794     DCHECK(key_pattern->IsString());
   6795     if (!key_pattern->IsInternalizedString()) return Smi::kZero;
   6796     cache = heap->string_split_cache();
   6797   } else {
   6798     DCHECK(type == REGEXP_MULTIPLE_INDICES);
   6799     DCHECK(key_pattern->IsFixedArray());
   6800     cache = heap->regexp_multiple_cache();
   6801   }
   6802 
   6803   uint32_t hash = key_string->Hash();
   6804   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
   6805                     ~(kArrayEntriesPerCacheEntry - 1));
   6806   if (cache->get(index + kStringOffset) != key_string ||
   6807       cache->get(index + kPatternOffset) != key_pattern) {
   6808     index =
   6809         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
   6810     if (cache->get(index + kStringOffset) != key_string ||
   6811         cache->get(index + kPatternOffset) != key_pattern) {
   6812       return Smi::kZero;
   6813     }
   6814   }
   6815 
   6816   *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
   6817   return cache->get(index + kArrayOffset);
   6818 }
   6819 
   6820 
   6821 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
   6822                                Handle<Object> key_pattern,
   6823                                Handle<FixedArray> value_array,
   6824                                Handle<FixedArray> last_match_cache,
   6825                                ResultsCacheType type) {
   6826   Factory* factory = isolate->factory();
   6827   Handle<FixedArray> cache;
   6828   if (!key_string->IsInternalizedString()) return;
   6829   if (type == STRING_SPLIT_SUBSTRINGS) {
   6830     DCHECK(key_pattern->IsString());
   6831     if (!key_pattern->IsInternalizedString()) return;
   6832     cache = factory->string_split_cache();
   6833   } else {
   6834     DCHECK(type == REGEXP_MULTIPLE_INDICES);
   6835     DCHECK(key_pattern->IsFixedArray());
   6836     cache = factory->regexp_multiple_cache();
   6837   }
   6838 
   6839   uint32_t hash = key_string->Hash();
   6840   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
   6841                     ~(kArrayEntriesPerCacheEntry - 1));
   6842   if (cache->get(index + kStringOffset) == Smi::kZero) {
   6843     cache->set(index + kStringOffset, *key_string);
   6844     cache->set(index + kPatternOffset, *key_pattern);
   6845     cache->set(index + kArrayOffset, *value_array);
   6846     cache->set(index + kLastMatchOffset, *last_match_cache);
   6847   } else {
   6848     uint32_t index2 =
   6849         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
   6850     if (cache->get(index2 + kStringOffset) == Smi::kZero) {
   6851       cache->set(index2 + kStringOffset, *key_string);
   6852       cache->set(index2 + kPatternOffset, *key_pattern);
   6853       cache->set(index2 + kArrayOffset, *value_array);
   6854       cache->set(index2 + kLastMatchOffset, *last_match_cache);
   6855     } else {
   6856       cache->set(index2 + kStringOffset, Smi::kZero);
   6857       cache->set(index2 + kPatternOffset, Smi::kZero);
   6858       cache->set(index2 + kArrayOffset, Smi::kZero);
   6859       cache->set(index2 + kLastMatchOffset, Smi::kZero);
   6860       cache->set(index + kStringOffset, *key_string);
   6861       cache->set(index + kPatternOffset, *key_pattern);
   6862       cache->set(index + kArrayOffset, *value_array);
   6863       cache->set(index + kLastMatchOffset, *last_match_cache);
   6864     }
   6865   }
   6866   // If the array is a reasonably short list of substrings, convert it into a
   6867   // list of internalized strings.
   6868   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
   6869     for (int i = 0; i < value_array->length(); i++) {
   6870       Handle<String> str(String::cast(value_array->get(i)), isolate);
   6871       Handle<String> internalized_str = factory->InternalizeString(str);
   6872       value_array->set(i, *internalized_str);
   6873     }
   6874   }
   6875   // Convert backing store to a copy-on-write array.
   6876   value_array->set_map_no_write_barrier(
   6877       ReadOnlyRoots(isolate).fixed_cow_array_map());
   6878 }
   6879 
   6880 
   6881 void RegExpResultsCache::Clear(FixedArray* cache) {
   6882   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
   6883     cache->set(i, Smi::kZero);
   6884   }
   6885 }
   6886 
   6887 }  // namespace internal
   6888 }  // namespace v8
   6889