Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #include "v8.h"
     29 
     30 #include "ast.h"
     31 #include "compiler.h"
     32 #include "execution.h"
     33 #include "factory.h"
     34 #include "jsregexp.h"
     35 #include "platform.h"
     36 #include "string-search.h"
     37 #include "runtime.h"
     38 #include "compilation-cache.h"
     39 #include "string-stream.h"
     40 #include "parser.h"
     41 #include "regexp-macro-assembler.h"
     42 #include "regexp-macro-assembler-tracer.h"
     43 #include "regexp-macro-assembler-irregexp.h"
     44 #include "regexp-stack.h"
     45 
     46 #ifndef V8_INTERPRETED_REGEXP
     47 #if V8_TARGET_ARCH_IA32
     48 #include "ia32/regexp-macro-assembler-ia32.h"
     49 #elif V8_TARGET_ARCH_X64
     50 #include "x64/regexp-macro-assembler-x64.h"
     51 #elif V8_TARGET_ARCH_ARM
     52 #include "arm/regexp-macro-assembler-arm.h"
     53 #elif V8_TARGET_ARCH_MIPS
     54 #include "mips/regexp-macro-assembler-mips.h"
     55 #else
     56 #error Unsupported target architecture.
     57 #endif
     58 #endif
     59 
     60 #include "interpreter-irregexp.h"
     61 
     62 
     63 namespace v8 {
     64 namespace internal {
     65 
     66 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
     67                                                Handle<String> pattern,
     68                                                Handle<String> flags,
     69                                                bool* has_pending_exception) {
     70   // Call the construct code with 2 arguments.
     71   Handle<Object> argv[] = { pattern, flags };
     72   return Execution::New(constructor, ARRAY_SIZE(argv), argv,
     73                         has_pending_exception);
     74 }
     75 
     76 
     77 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
     78   int flags = JSRegExp::NONE;
     79   for (int i = 0; i < str->length(); i++) {
     80     switch (str->Get(i)) {
     81       case 'i':
     82         flags |= JSRegExp::IGNORE_CASE;
     83         break;
     84       case 'g':
     85         flags |= JSRegExp::GLOBAL;
     86         break;
     87       case 'm':
     88         flags |= JSRegExp::MULTILINE;
     89         break;
     90     }
     91   }
     92   return JSRegExp::Flags(flags);
     93 }
     94 
     95 
     96 static inline void ThrowRegExpException(Handle<JSRegExp> re,
     97                                         Handle<String> pattern,
     98                                         Handle<String> error_text,
     99                                         const char* message) {
    100   Isolate* isolate = re->GetIsolate();
    101   Factory* factory = isolate->factory();
    102   Handle<FixedArray> elements = factory->NewFixedArray(2);
    103   elements->set(0, *pattern);
    104   elements->set(1, *error_text);
    105   Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
    106   Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
    107   isolate->Throw(*regexp_err);
    108 }
    109 
    110 
    111 // Generic RegExp methods. Dispatches to implementation specific methods.
    112 
    113 
    114 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
    115                                    Handle<String> pattern,
    116                                    Handle<String> flag_str) {
    117   Isolate* isolate = re->GetIsolate();
    118   JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
    119   CompilationCache* compilation_cache = isolate->compilation_cache();
    120   Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
    121   bool in_cache = !cached.is_null();
    122   LOG(isolate, RegExpCompileEvent(re, in_cache));
    123 
    124   Handle<Object> result;
    125   if (in_cache) {
    126     re->set_data(*cached);
    127     return re;
    128   }
    129   pattern = FlattenGetString(pattern);
    130   ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
    131   PostponeInterruptsScope postpone(isolate);
    132   RegExpCompileData parse_result;
    133   FlatStringReader reader(isolate, pattern);
    134   if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
    135                                  &parse_result)) {
    136     // Throw an exception if we fail to parse the pattern.
    137     ThrowRegExpException(re,
    138                          pattern,
    139                          parse_result.error,
    140                          "malformed_regexp");
    141     return Handle<Object>::null();
    142   }
    143 
    144   if (parse_result.simple && !flags.is_ignore_case()) {
    145     // Parse-tree is a single atom that is equal to the pattern.
    146     AtomCompile(re, pattern, flags, pattern);
    147   } else if (parse_result.tree->IsAtom() &&
    148       !flags.is_ignore_case() &&
    149       parse_result.capture_count == 0) {
    150     RegExpAtom* atom = parse_result.tree->AsAtom();
    151     Vector<const uc16> atom_pattern = atom->data();
    152     Handle<String> atom_string =
    153         isolate->factory()->NewStringFromTwoByte(atom_pattern);
    154     AtomCompile(re, pattern, flags, atom_string);
    155   } else {
    156     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
    157   }
    158   ASSERT(re->data()->IsFixedArray());
    159   // Compilation succeeded so the data is set on the regexp
    160   // and we can store it in the cache.
    161   Handle<FixedArray> data(FixedArray::cast(re->data()));
    162   compilation_cache->PutRegExp(pattern, flags, data);
    163 
    164   return re;
    165 }
    166 
    167 
    168 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
    169                                 Handle<String> subject,
    170                                 int index,
    171                                 Handle<JSArray> last_match_info) {
    172   switch (regexp->TypeTag()) {
    173     case JSRegExp::ATOM:
    174       return AtomExec(regexp, subject, index, last_match_info);
    175     case JSRegExp::IRREGEXP: {
    176       Handle<Object> result =
    177           IrregexpExec(regexp, subject, index, last_match_info);
    178       ASSERT(!result.is_null() ||
    179              regexp->GetIsolate()->has_pending_exception());
    180       return result;
    181     }
    182     default:
    183       UNREACHABLE();
    184       return Handle<Object>::null();
    185   }
    186 }
    187 
    188 
    189 // RegExp Atom implementation: Simple string search using indexOf.
    190 
    191 
    192 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
    193                              Handle<String> pattern,
    194                              JSRegExp::Flags flags,
    195                              Handle<String> match_pattern) {
    196   re->GetIsolate()->factory()->SetRegExpAtomData(re,
    197                                                  JSRegExp::ATOM,
    198                                                  pattern,
    199                                                  flags,
    200                                                  match_pattern);
    201 }
    202 
    203 
    204 static void SetAtomLastCapture(FixedArray* array,
    205                                String* subject,
    206                                int from,
    207                                int to) {
    208   NoHandleAllocation no_handles;
    209   RegExpImpl::SetLastCaptureCount(array, 2);
    210   RegExpImpl::SetLastSubject(array, subject);
    211   RegExpImpl::SetLastInput(array, subject);
    212   RegExpImpl::SetCapture(array, 0, from);
    213   RegExpImpl::SetCapture(array, 1, to);
    214 }
    215 
    216 
    217 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
    218                                     Handle<String> subject,
    219                                     int index,
    220                                     Handle<JSArray> last_match_info) {
    221   Isolate* isolate = re->GetIsolate();
    222 
    223   ASSERT(0 <= index);
    224   ASSERT(index <= subject->length());
    225 
    226   if (!subject->IsFlat()) FlattenString(subject);
    227   AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
    228 
    229   String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
    230   int needle_len = needle->length();
    231   ASSERT(needle->IsFlat());
    232 
    233   if (needle_len != 0) {
    234     if (index + needle_len > subject->length()) {
    235       return isolate->factory()->null_value();
    236     }
    237 
    238     String::FlatContent needle_content = needle->GetFlatContent();
    239     String::FlatContent subject_content = subject->GetFlatContent();
    240     ASSERT(needle_content.IsFlat());
    241     ASSERT(subject_content.IsFlat());
    242     // dispatch on type of strings
    243     index = (needle_content.IsAscii()
    244              ? (subject_content.IsAscii()
    245                 ? SearchString(isolate,
    246                                subject_content.ToAsciiVector(),
    247                                needle_content.ToAsciiVector(),
    248                                index)
    249                 : SearchString(isolate,
    250                                subject_content.ToUC16Vector(),
    251                                needle_content.ToAsciiVector(),
    252                                index))
    253              : (subject_content.IsAscii()
    254                 ? SearchString(isolate,
    255                                subject_content.ToAsciiVector(),
    256                                needle_content.ToUC16Vector(),
    257                                index)
    258                 : SearchString(isolate,
    259                                subject_content.ToUC16Vector(),
    260                                needle_content.ToUC16Vector(),
    261                                index)));
    262     if (index == -1) return isolate->factory()->null_value();
    263   }
    264   ASSERT(last_match_info->HasFastElements());
    265 
    266   {
    267     NoHandleAllocation no_handles;
    268     FixedArray* array = FixedArray::cast(last_match_info->elements());
    269     SetAtomLastCapture(array, *subject, index, index + needle_len);
    270   }
    271   return last_match_info;
    272 }
    273 
    274 
    275 // Irregexp implementation.
    276 
    277 // Ensures that the regexp object contains a compiled version of the
    278 // source for either ASCII or non-ASCII strings.
    279 // If the compiled version doesn't already exist, it is compiled
    280 // from the source pattern.
    281 // If compilation fails, an exception is thrown and this function
    282 // returns false.
    283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
    284   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
    285 #ifdef V8_INTERPRETED_REGEXP
    286   if (compiled_code->IsByteArray()) return true;
    287 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
    288   if (compiled_code->IsCode()) return true;
    289 #endif
    290   // We could potentially have marked this as flushable, but have kept
    291   // a saved version if we did not flush it yet.
    292   Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
    293   if (saved_code->IsCode()) {
    294     // Reinstate the code in the original place.
    295     re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
    296     ASSERT(compiled_code->IsSmi());
    297     return true;
    298   }
    299   return CompileIrregexp(re, is_ascii);
    300 }
    301 
    302 
    303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
    304                                             bool is_ascii,
    305                                             Handle<String> error_message,
    306                                             Isolate* isolate) {
    307   Factory* factory = isolate->factory();
    308   Handle<FixedArray> elements = factory->NewFixedArray(2);
    309   elements->set(0, re->Pattern());
    310   elements->set(1, *error_message);
    311   Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
    312   Handle<Object> regexp_err =
    313       factory->NewSyntaxError("malformed_regexp", array);
    314   isolate->Throw(*regexp_err);
    315   return false;
    316 }
    317 
    318 
    319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
    320   // Compile the RegExp.
    321   Isolate* isolate = re->GetIsolate();
    322   ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
    323   PostponeInterruptsScope postpone(isolate);
    324   // If we had a compilation error the last time this is saved at the
    325   // saved code index.
    326   Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
    327   // When arriving here entry can only be a smi, either representing an
    328   // uncompiled regexp, a previous compilation error, or code that has
    329   // been flushed.
    330   ASSERT(entry->IsSmi());
    331   int entry_value = Smi::cast(entry)->value();
    332   ASSERT(entry_value == JSRegExp::kUninitializedValue ||
    333          entry_value == JSRegExp::kCompilationErrorValue ||
    334          (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
    335 
    336   if (entry_value == JSRegExp::kCompilationErrorValue) {
    337     // A previous compilation failed and threw an error which we store in
    338     // the saved code index (we store the error message, not the actual
    339     // error). Recreate the error object and throw it.
    340     Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
    341     ASSERT(error_string->IsString());
    342     Handle<String> error_message(String::cast(error_string));
    343     CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
    344     return false;
    345   }
    346 
    347   JSRegExp::Flags flags = re->GetFlags();
    348 
    349   Handle<String> pattern(re->Pattern());
    350   if (!pattern->IsFlat()) FlattenString(pattern);
    351   RegExpCompileData compile_data;
    352   FlatStringReader reader(isolate, pattern);
    353   if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
    354                                  &compile_data)) {
    355     // Throw an exception if we fail to parse the pattern.
    356     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
    357     ThrowRegExpException(re,
    358                          pattern,
    359                          compile_data.error,
    360                          "malformed_regexp");
    361     return false;
    362   }
    363   RegExpEngine::CompilationResult result =
    364       RegExpEngine::Compile(&compile_data,
    365                             flags.is_ignore_case(),
    366                             flags.is_multiline(),
    367                             pattern,
    368                             is_ascii);
    369   if (result.error_message != NULL) {
    370     // Unable to compile regexp.
    371     Handle<String> error_message =
    372         isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
    373     CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
    374     return false;
    375   }
    376 
    377   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
    378   data->set(JSRegExp::code_index(is_ascii), result.code);
    379   int register_max = IrregexpMaxRegisterCount(*data);
    380   if (result.num_registers > register_max) {
    381     SetIrregexpMaxRegisterCount(*data, result.num_registers);
    382   }
    383 
    384   return true;
    385 }
    386 
    387 
    388 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
    389   return Smi::cast(
    390       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    391 }
    392 
    393 
    394 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
    395   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
    396 }
    397 
    398 
    399 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
    400   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
    401 }
    402 
    403 
    404 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
    405   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    406 }
    407 
    408 
    409 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
    410   return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
    411 }
    412 
    413 
    414 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
    415   return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
    416 }
    417 
    418 
    419 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
    420                                     Handle<String> pattern,
    421                                     JSRegExp::Flags flags,
    422                                     int capture_count) {
    423   // Initialize compiled code entries to null.
    424   re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
    425                                                      JSRegExp::IRREGEXP,
    426                                                      pattern,
    427                                                      flags,
    428                                                      capture_count);
    429 }
    430 
    431 
    432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
    433                                 Handle<String> subject) {
    434   if (!subject->IsFlat()) FlattenString(subject);
    435 
    436   // Check the asciiness of the underlying storage.
    437   bool is_ascii = subject->IsAsciiRepresentationUnderneath();
    438   if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
    439 
    440 #ifdef V8_INTERPRETED_REGEXP
    441   // Byte-code regexp needs space allocated for all its registers.
    442   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
    443 #else  // V8_INTERPRETED_REGEXP
    444   // Native regexp only needs room to output captures. Registers are handled
    445   // internally.
    446   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
    447 #endif  // V8_INTERPRETED_REGEXP
    448 }
    449 
    450 
    451 RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
    452     Handle<JSRegExp> regexp,
    453     Handle<String> subject,
    454     int index,
    455     Vector<int> output) {
    456   Isolate* isolate = regexp->GetIsolate();
    457 
    458   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
    459 
    460   ASSERT(index >= 0);
    461   ASSERT(index <= subject->length());
    462   ASSERT(subject->IsFlat());
    463 
    464   bool is_ascii = subject->IsAsciiRepresentationUnderneath();
    465 
    466 #ifndef V8_INTERPRETED_REGEXP
    467   ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
    468   do {
    469     EnsureCompiledIrregexp(regexp, is_ascii);
    470     Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
    471     NativeRegExpMacroAssembler::Result res =
    472         NativeRegExpMacroAssembler::Match(code,
    473                                           subject,
    474                                           output.start(),
    475                                           output.length(),
    476                                           index,
    477                                           isolate);
    478     if (res != NativeRegExpMacroAssembler::RETRY) {
    479       ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
    480              isolate->has_pending_exception());
    481       STATIC_ASSERT(
    482           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
    483       STATIC_ASSERT(
    484           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
    485       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
    486                     == RE_EXCEPTION);
    487       return static_cast<IrregexpResult>(res);
    488     }
    489     // If result is RETRY, the string has changed representation, and we
    490     // must restart from scratch.
    491     // In this case, it means we must make sure we are prepared to handle
    492     // the, potentially, different subject (the string can switch between
    493     // being internal and external, and even between being ASCII and UC16,
    494     // but the characters are always the same).
    495     IrregexpPrepare(regexp, subject);
    496     is_ascii = subject->IsAsciiRepresentationUnderneath();
    497   } while (true);
    498   UNREACHABLE();
    499   return RE_EXCEPTION;
    500 #else  // V8_INTERPRETED_REGEXP
    501 
    502   ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
    503   // We must have done EnsureCompiledIrregexp, so we can get the number of
    504   // registers.
    505   int* register_vector = output.start();
    506   int number_of_capture_registers =
    507       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
    508   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
    509     register_vector[i] = -1;
    510   }
    511   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
    512 
    513   IrregexpResult result = IrregexpInterpreter::Match(isolate,
    514                                                      byte_codes,
    515                                                      subject,
    516                                                      register_vector,
    517                                                      index);
    518   if (result == RE_EXCEPTION) {
    519     ASSERT(!isolate->has_pending_exception());
    520     isolate->StackOverflow();
    521   }
    522   return result;
    523 #endif  // V8_INTERPRETED_REGEXP
    524 }
    525 
    526 
    527 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
    528                                         Handle<String> subject,
    529                                         int previous_index,
    530                                         Handle<JSArray> last_match_info) {
    531   Isolate* isolate = jsregexp->GetIsolate();
    532   ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
    533 
    534   // Prepare space for the return values.
    535 #ifdef V8_INTERPRETED_REGEXP
    536 #ifdef DEBUG
    537   if (FLAG_trace_regexp_bytecodes) {
    538     String* pattern = jsregexp->Pattern();
    539     PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
    540     PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
    541   }
    542 #endif
    543 #endif
    544   int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
    545   if (required_registers < 0) {
    546     // Compiling failed with an exception.
    547     ASSERT(isolate->has_pending_exception());
    548     return Handle<Object>::null();
    549   }
    550 
    551   OffsetsVector registers(required_registers, isolate);
    552 
    553   IrregexpResult res = RegExpImpl::IrregexpExecOnce(
    554       jsregexp, subject, previous_index, Vector<int>(registers.vector(),
    555                                                      registers.length()));
    556   if (res == RE_SUCCESS) {
    557     int capture_register_count =
    558         (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
    559     last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
    560     AssertNoAllocation no_gc;
    561     int* register_vector = registers.vector();
    562     FixedArray* array = FixedArray::cast(last_match_info->elements());
    563     for (int i = 0; i < capture_register_count; i += 2) {
    564       SetCapture(array, i, register_vector[i]);
    565       SetCapture(array, i + 1, register_vector[i + 1]);
    566     }
    567     SetLastCaptureCount(array, capture_register_count);
    568     SetLastSubject(array, *subject);
    569     SetLastInput(array, *subject);
    570     return last_match_info;
    571   }
    572   if (res == RE_EXCEPTION) {
    573     ASSERT(isolate->has_pending_exception());
    574     return Handle<Object>::null();
    575   }
    576   ASSERT(res == RE_FAILURE);
    577   return isolate->factory()->null_value();
    578 }
    579 
    580 
    581 // -------------------------------------------------------------------
    582 // Implementation of the Irregexp regular expression engine.
    583 //
    584 // The Irregexp regular expression engine is intended to be a complete
    585 // implementation of ECMAScript regular expressions.  It generates either
    586 // bytecodes or native code.
    587 
    588 //   The Irregexp regexp engine is structured in three steps.
    589 //   1) The parser generates an abstract syntax tree.  See ast.cc.
    590 //   2) From the AST a node network is created.  The nodes are all
    591 //      subclasses of RegExpNode.  The nodes represent states when
    592 //      executing a regular expression.  Several optimizations are
    593 //      performed on the node network.
    594 //   3) From the nodes we generate either byte codes or native code
    595 //      that can actually execute the regular expression (perform
    596 //      the search).  The code generation step is described in more
    597 //      detail below.
    598 
    599 // Code generation.
    600 //
    601 //   The nodes are divided into four main categories.
    602 //   * Choice nodes
    603 //        These represent places where the regular expression can
    604 //        match in more than one way.  For example on entry to an
    605 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
    606 //   * Action nodes
    607 //        These represent places where some action should be
    608 //        performed.  Examples include recording the current position
    609 //        in the input string to a register (in order to implement
    610 //        captures) or other actions on register for example in order
    611 //        to implement the counters needed for {} repetitions.
    612 //   * Matching nodes
    613 //        These attempt to match some element part of the input string.
    614 //        Examples of elements include character classes, plain strings
    615 //        or back references.
    616 //   * End nodes
    617 //        These are used to implement the actions required on finding
    618 //        a successful match or failing to find a match.
    619 //
    620 //   The code generated (whether as byte codes or native code) maintains
    621 //   some state as it runs.  This consists of the following elements:
    622 //
    623 //   * The capture registers.  Used for string captures.
    624 //   * Other registers.  Used for counters etc.
    625 //   * The current position.
    626 //   * The stack of backtracking information.  Used when a matching node
    627 //     fails to find a match and needs to try an alternative.
    628 //
    629 // Conceptual regular expression execution model:
    630 //
    631 //   There is a simple conceptual model of regular expression execution
    632 //   which will be presented first.  The actual code generated is a more
    633 //   efficient simulation of the simple conceptual model:
    634 //
    635 //   * Choice nodes are implemented as follows:
    636 //     For each choice except the last {
    637 //       push current position
    638 //       push backtrack code location
    639 //       <generate code to test for choice>
    640 //       backtrack code location:
    641 //       pop current position
    642 //     }
    643 //     <generate code to test for last choice>
    644 //
    645 //   * Actions nodes are generated as follows
    646 //     <push affected registers on backtrack stack>
    647 //     <generate code to perform action>
    648 //     push backtrack code location
    649 //     <generate code to test for following nodes>
    650 //     backtrack code location:
    651 //     <pop affected registers to restore their state>
    652 //     <pop backtrack location from stack and go to it>
    653 //
    654 //   * Matching nodes are generated as follows:
    655 //     if input string matches at current position
    656 //       update current position
    657 //       <generate code to test for following nodes>
    658 //     else
    659 //       <pop backtrack location from stack and go to it>
    660 //
    661 //   Thus it can be seen that the current position is saved and restored
    662 //   by the choice nodes, whereas the registers are saved and restored by
    663 //   by the action nodes that manipulate them.
    664 //
    665 //   The other interesting aspect of this model is that nodes are generated
    666 //   at the point where they are needed by a recursive call to Emit().  If
    667 //   the node has already been code generated then the Emit() call will
    668 //   generate a jump to the previously generated code instead.  In order to
    669 //   limit recursion it is possible for the Emit() function to put the node
    670 //   on a work list for later generation and instead generate a jump.  The
    671 //   destination of the jump is resolved later when the code is generated.
    672 //
    673 // Actual regular expression code generation.
    674 //
    675 //   Code generation is actually more complicated than the above.  In order
    676 //   to improve the efficiency of the generated code some optimizations are
    677 //   performed
    678 //
    679 //   * Choice nodes have 1-character lookahead.
    680 //     A choice node looks at the following character and eliminates some of
    681 //     the choices immediately based on that character.  This is not yet
    682 //     implemented.
    683 //   * Simple greedy loops store reduced backtracking information.
    684 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
    685 //     then need to backtrack to a point where it can match "foo".  The naive
    686 //     implementation of this would push each character position onto the
    687 //     backtracking stack, then pop them off one by one.  This would use space
    688 //     proportional to the length of the input string.  However since the "."
    689 //     can only match in one way and always has a constant length (in this case
    690 //     of 1) it suffices to store the current position on the top of the stack
    691 //     once.  Matching now becomes merely incrementing the current position and
    692 //     backtracking becomes decrementing the current position and checking the
    693 //     result against the stored current position.  This is faster and saves
    694 //     space.
    695 //   * The current state is virtualized.
    696 //     This is used to defer expensive operations until it is clear that they
    697 //     are needed and to generate code for a node more than once, allowing
    698 //     specialized an efficient versions of the code to be created. This is
    699 //     explained in the section below.
    700 //
    701 // Execution state virtualization.
    702 //
    703 //   Instead of emitting code, nodes that manipulate the state can record their
    704 //   manipulation in an object called the Trace.  The Trace object can record a
    705 //   current position offset, an optional backtrack code location on the top of
    706 //   the virtualized backtrack stack and some register changes.  When a node is
    707 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
    708 //   will emit code to bring the actual state into line with the virtual state.
    709 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
    710 //   registers).  Postponing work can save time when executing the regular
    711 //   expression since it may be found that the work never has to be done as a
    712 //   failure to match can occur.  In addition it is much faster to jump to a
    713 //   known backtrack code location than it is to pop an unknown backtrack
    714 //   location from the stack and jump there.
    715 //
    716 //   The virtual state found in the Trace affects code generation.  For example
    717 //   the virtual state contains the difference between the actual current
    718 //   position and the virtual current position, and matching code needs to use
    719 //   this offset to attempt a match in the correct location of the input
    720 //   string.  Therefore code generated for a non-trivial trace is specialized
    721 //   to that trace.  The code generator therefore has the ability to generate
    722 //   code for each node several times.  In order to limit the size of the
    723 //   generated code there is an arbitrary limit on how many specialized sets of
    724 //   code may be generated for a given node.  If the limit is reached, the
    725 //   trace is flushed and a generic version of the code for a node is emitted.
    726 //   This is subsequently used for that node.  The code emitted for non-generic
    727 //   trace is not recorded in the node and so it cannot currently be reused in
    728 //   the event that code generation is requested for an identical trace.
    729 
    730 
    731 void RegExpTree::AppendToText(RegExpText* text) {
    732   UNREACHABLE();
    733 }
    734 
    735 
    736 void RegExpAtom::AppendToText(RegExpText* text) {
    737   text->AddElement(TextElement::Atom(this));
    738 }
    739 
    740 
    741 void RegExpCharacterClass::AppendToText(RegExpText* text) {
    742   text->AddElement(TextElement::CharClass(this));
    743 }
    744 
    745 
    746 void RegExpText::AppendToText(RegExpText* text) {
    747   for (int i = 0; i < elements()->length(); i++)
    748     text->AddElement(elements()->at(i));
    749 }
    750 
    751 
    752 TextElement TextElement::Atom(RegExpAtom* atom) {
    753   TextElement result = TextElement(ATOM);
    754   result.data.u_atom = atom;
    755   return result;
    756 }
    757 
    758 
    759 TextElement TextElement::CharClass(
    760       RegExpCharacterClass* char_class) {
    761   TextElement result = TextElement(CHAR_CLASS);
    762   result.data.u_char_class = char_class;
    763   return result;
    764 }
    765 
    766 
    767 int TextElement::length() {
    768   if (type == ATOM) {
    769     return data.u_atom->length();
    770   } else {
    771     ASSERT(type == CHAR_CLASS);
    772     return 1;
    773   }
    774 }
    775 
    776 
    777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
    778   if (table_ == NULL) {
    779     table_ = new DispatchTable();
    780     DispatchTableConstructor cons(table_, ignore_case);
    781     cons.BuildTable(this);
    782   }
    783   return table_;
    784 }
    785 
    786 
    787 class RegExpCompiler {
    788  public:
    789   RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
    790 
    791   int AllocateRegister() {
    792     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
    793       reg_exp_too_big_ = true;
    794       return next_register_;
    795     }
    796     return next_register_++;
    797   }
    798 
    799   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
    800                                            RegExpNode* start,
    801                                            int capture_count,
    802                                            Handle<String> pattern);
    803 
    804   inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
    805 
    806   static const int kImplementationOffset = 0;
    807   static const int kNumberOfRegistersOffset = 0;
    808   static const int kCodeOffset = 1;
    809 
    810   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
    811   EndNode* accept() { return accept_; }
    812 
    813   static const int kMaxRecursion = 100;
    814   inline int recursion_depth() { return recursion_depth_; }
    815   inline void IncrementRecursionDepth() { recursion_depth_++; }
    816   inline void DecrementRecursionDepth() { recursion_depth_--; }
    817 
    818   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
    819 
    820   inline bool ignore_case() { return ignore_case_; }
    821   inline bool ascii() { return ascii_; }
    822 
    823   int current_expansion_factor() { return current_expansion_factor_; }
    824   void set_current_expansion_factor(int value) {
    825     current_expansion_factor_ = value;
    826   }
    827 
    828   static const int kNoRegister = -1;
    829 
    830  private:
    831   EndNode* accept_;
    832   int next_register_;
    833   List<RegExpNode*>* work_list_;
    834   int recursion_depth_;
    835   RegExpMacroAssembler* macro_assembler_;
    836   bool ignore_case_;
    837   bool ascii_;
    838   bool reg_exp_too_big_;
    839   int current_expansion_factor_;
    840 };
    841 
    842 
    843 class RecursionCheck {
    844  public:
    845   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
    846     compiler->IncrementRecursionDepth();
    847   }
    848   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
    849  private:
    850   RegExpCompiler* compiler_;
    851 };
    852 
    853 
    854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
    855   return RegExpEngine::CompilationResult("RegExp too big");
    856 }
    857 
    858 
    859 // Attempts to compile the regexp using an Irregexp code generator.  Returns
    860 // a fixed array or a null handle depending on whether it succeeded.
    861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
    862     : next_register_(2 * (capture_count + 1)),
    863       work_list_(NULL),
    864       recursion_depth_(0),
    865       ignore_case_(ignore_case),
    866       ascii_(ascii),
    867       reg_exp_too_big_(false),
    868       current_expansion_factor_(1) {
    869   accept_ = new EndNode(EndNode::ACCEPT);
    870   ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
    871 }
    872 
    873 
    874 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
    875     RegExpMacroAssembler* macro_assembler,
    876     RegExpNode* start,
    877     int capture_count,
    878     Handle<String> pattern) {
    879   Heap* heap = pattern->GetHeap();
    880 
    881   bool use_slow_safe_regexp_compiler = false;
    882   if (heap->total_regexp_code_generated() >
    883           RegExpImpl::kRegWxpCompiledLimit &&
    884       heap->isolate()->memory_allocator()->SizeExecutable() >
    885           RegExpImpl::kRegExpExecutableMemoryLimit) {
    886     use_slow_safe_regexp_compiler = true;
    887   }
    888 
    889   macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
    890 
    891 #ifdef DEBUG
    892   if (FLAG_trace_regexp_assembler)
    893     macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
    894   else
    895 #endif
    896     macro_assembler_ = macro_assembler;
    897 
    898   List <RegExpNode*> work_list(0);
    899   work_list_ = &work_list;
    900   Label fail;
    901   macro_assembler_->PushBacktrack(&fail);
    902   Trace new_trace;
    903   start->Emit(this, &new_trace);
    904   macro_assembler_->Bind(&fail);
    905   macro_assembler_->Fail();
    906   while (!work_list.is_empty()) {
    907     work_list.RemoveLast()->Emit(this, &new_trace);
    908   }
    909   if (reg_exp_too_big_) return IrregexpRegExpTooBig();
    910 
    911   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
    912   heap->IncreaseTotalRegexpCodeGenerated(code->Size());
    913   work_list_ = NULL;
    914 #ifdef DEBUG
    915   if (FLAG_print_code) {
    916     Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
    917   }
    918   if (FLAG_trace_regexp_assembler) {
    919     delete macro_assembler_;
    920   }
    921 #endif
    922   return RegExpEngine::CompilationResult(*code, next_register_);
    923 }
    924 
    925 
    926 bool Trace::DeferredAction::Mentions(int that) {
    927   if (type() == ActionNode::CLEAR_CAPTURES) {
    928     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
    929     return range.Contains(that);
    930   } else {
    931     return reg() == that;
    932   }
    933 }
    934 
    935 
    936 bool Trace::mentions_reg(int reg) {
    937   for (DeferredAction* action = actions_;
    938        action != NULL;
    939        action = action->next()) {
    940     if (action->Mentions(reg))
    941       return true;
    942   }
    943   return false;
    944 }
    945 
    946 
    947 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
    948   ASSERT_EQ(0, *cp_offset);
    949   for (DeferredAction* action = actions_;
    950        action != NULL;
    951        action = action->next()) {
    952     if (action->Mentions(reg)) {
    953       if (action->type() == ActionNode::STORE_POSITION) {
    954         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
    955         return true;
    956       } else {
    957         return false;
    958       }
    959     }
    960   }
    961   return false;
    962 }
    963 
    964 
    965 int Trace::FindAffectedRegisters(OutSet* affected_registers) {
    966   int max_register = RegExpCompiler::kNoRegister;
    967   for (DeferredAction* action = actions_;
    968        action != NULL;
    969        action = action->next()) {
    970     if (action->type() == ActionNode::CLEAR_CAPTURES) {
    971       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
    972       for (int i = range.from(); i <= range.to(); i++)
    973         affected_registers->Set(i);
    974       if (range.to() > max_register) max_register = range.to();
    975     } else {
    976       affected_registers->Set(action->reg());
    977       if (action->reg() > max_register) max_register = action->reg();
    978     }
    979   }
    980   return max_register;
    981 }
    982 
    983 
    984 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
    985                                      int max_register,
    986                                      OutSet& registers_to_pop,
    987                                      OutSet& registers_to_clear) {
    988   for (int reg = max_register; reg >= 0; reg--) {
    989     if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
    990     else if (registers_to_clear.Get(reg)) {
    991       int clear_to = reg;
    992       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
    993         reg--;
    994       }
    995       assembler->ClearRegisters(reg, clear_to);
    996     }
    997   }
    998 }
    999 
   1000 
   1001 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
   1002                                    int max_register,
   1003                                    OutSet& affected_registers,
   1004                                    OutSet* registers_to_pop,
   1005                                    OutSet* registers_to_clear) {
   1006   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
   1007   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
   1008 
   1009   // Count pushes performed to force a stack limit check occasionally.
   1010   int pushes = 0;
   1011 
   1012   for (int reg = 0; reg <= max_register; reg++) {
   1013     if (!affected_registers.Get(reg)) {
   1014       continue;
   1015     }
   1016 
   1017     // The chronologically first deferred action in the trace
   1018     // is used to infer the action needed to restore a register
   1019     // to its previous state (or not, if it's safe to ignore it).
   1020     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
   1021     DeferredActionUndoType undo_action = IGNORE;
   1022 
   1023     int value = 0;
   1024     bool absolute = false;
   1025     bool clear = false;
   1026     int store_position = -1;
   1027     // This is a little tricky because we are scanning the actions in reverse
   1028     // historical order (newest first).
   1029     for (DeferredAction* action = actions_;
   1030          action != NULL;
   1031          action = action->next()) {
   1032       if (action->Mentions(reg)) {
   1033         switch (action->type()) {
   1034           case ActionNode::SET_REGISTER: {
   1035             Trace::DeferredSetRegister* psr =
   1036                 static_cast<Trace::DeferredSetRegister*>(action);
   1037             if (!absolute) {
   1038               value += psr->value();
   1039               absolute = true;
   1040             }
   1041             // SET_REGISTER is currently only used for newly introduced loop
   1042             // counters. They can have a significant previous value if they
   1043             // occour in a loop. TODO(lrn): Propagate this information, so
   1044             // we can set undo_action to IGNORE if we know there is no value to
   1045             // restore.
   1046             undo_action = RESTORE;
   1047             ASSERT_EQ(store_position, -1);
   1048             ASSERT(!clear);
   1049             break;
   1050           }
   1051           case ActionNode::INCREMENT_REGISTER:
   1052             if (!absolute) {
   1053               value++;
   1054             }
   1055             ASSERT_EQ(store_position, -1);
   1056             ASSERT(!clear);
   1057             undo_action = RESTORE;
   1058             break;
   1059           case ActionNode::STORE_POSITION: {
   1060             Trace::DeferredCapture* pc =
   1061                 static_cast<Trace::DeferredCapture*>(action);
   1062             if (!clear && store_position == -1) {
   1063               store_position = pc->cp_offset();
   1064             }
   1065 
   1066             // For captures we know that stores and clears alternate.
   1067             // Other register, are never cleared, and if the occur
   1068             // inside a loop, they might be assigned more than once.
   1069             if (reg <= 1) {
   1070               // Registers zero and one, aka "capture zero", is
   1071               // always set correctly if we succeed. There is no
   1072               // need to undo a setting on backtrack, because we
   1073               // will set it again or fail.
   1074               undo_action = IGNORE;
   1075             } else {
   1076               undo_action = pc->is_capture() ? CLEAR : RESTORE;
   1077             }
   1078             ASSERT(!absolute);
   1079             ASSERT_EQ(value, 0);
   1080             break;
   1081           }
   1082           case ActionNode::CLEAR_CAPTURES: {
   1083             // Since we're scanning in reverse order, if we've already
   1084             // set the position we have to ignore historically earlier
   1085             // clearing operations.
   1086             if (store_position == -1) {
   1087               clear = true;
   1088             }
   1089             undo_action = RESTORE;
   1090             ASSERT(!absolute);
   1091             ASSERT_EQ(value, 0);
   1092             break;
   1093           }
   1094           default:
   1095             UNREACHABLE();
   1096             break;
   1097         }
   1098       }
   1099     }
   1100     // Prepare for the undo-action (e.g., push if it's going to be popped).
   1101     if (undo_action == RESTORE) {
   1102       pushes++;
   1103       RegExpMacroAssembler::StackCheckFlag stack_check =
   1104           RegExpMacroAssembler::kNoStackLimitCheck;
   1105       if (pushes == push_limit) {
   1106         stack_check = RegExpMacroAssembler::kCheckStackLimit;
   1107         pushes = 0;
   1108       }
   1109 
   1110       assembler->PushRegister(reg, stack_check);
   1111       registers_to_pop->Set(reg);
   1112     } else if (undo_action == CLEAR) {
   1113       registers_to_clear->Set(reg);
   1114     }
   1115     // Perform the chronologically last action (or accumulated increment)
   1116     // for the register.
   1117     if (store_position != -1) {
   1118       assembler->WriteCurrentPositionToRegister(reg, store_position);
   1119     } else if (clear) {
   1120       assembler->ClearRegisters(reg, reg);
   1121     } else if (absolute) {
   1122       assembler->SetRegister(reg, value);
   1123     } else if (value != 0) {
   1124       assembler->AdvanceRegister(reg, value);
   1125     }
   1126   }
   1127 }
   1128 
   1129 
   1130 // This is called as we come into a loop choice node and some other tricky
   1131 // nodes.  It normalizes the state of the code generator to ensure we can
   1132 // generate generic code.
   1133 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
   1134   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1135 
   1136   ASSERT(!is_trivial());
   1137 
   1138   if (actions_ == NULL && backtrack() == NULL) {
   1139     // Here we just have some deferred cp advances to fix and we are back to
   1140     // a normal situation.  We may also have to forget some information gained
   1141     // through a quick check that was already performed.
   1142     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
   1143     // Create a new trivial state and generate the node with that.
   1144     Trace new_state;
   1145     successor->Emit(compiler, &new_state);
   1146     return;
   1147   }
   1148 
   1149   // Generate deferred actions here along with code to undo them again.
   1150   OutSet affected_registers;
   1151 
   1152   if (backtrack() != NULL) {
   1153     // Here we have a concrete backtrack location.  These are set up by choice
   1154     // nodes and so they indicate that we have a deferred save of the current
   1155     // position which we may need to emit here.
   1156     assembler->PushCurrentPosition();
   1157   }
   1158 
   1159   int max_register = FindAffectedRegisters(&affected_registers);
   1160   OutSet registers_to_pop;
   1161   OutSet registers_to_clear;
   1162   PerformDeferredActions(assembler,
   1163                          max_register,
   1164                          affected_registers,
   1165                          &registers_to_pop,
   1166                          &registers_to_clear);
   1167   if (cp_offset_ != 0) {
   1168     assembler->AdvanceCurrentPosition(cp_offset_);
   1169   }
   1170 
   1171   // Create a new trivial state and generate the node with that.
   1172   Label undo;
   1173   assembler->PushBacktrack(&undo);
   1174   Trace new_state;
   1175   successor->Emit(compiler, &new_state);
   1176 
   1177   // On backtrack we need to restore state.
   1178   assembler->Bind(&undo);
   1179   RestoreAffectedRegisters(assembler,
   1180                            max_register,
   1181                            registers_to_pop,
   1182                            registers_to_clear);
   1183   if (backtrack() == NULL) {
   1184     assembler->Backtrack();
   1185   } else {
   1186     assembler->PopCurrentPosition();
   1187     assembler->GoTo(backtrack());
   1188   }
   1189 }
   1190 
   1191 
   1192 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
   1193   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1194 
   1195   // Omit flushing the trace. We discard the entire stack frame anyway.
   1196 
   1197   if (!label()->is_bound()) {
   1198     // We are completely independent of the trace, since we ignore it,
   1199     // so this code can be used as the generic version.
   1200     assembler->Bind(label());
   1201   }
   1202 
   1203   // Throw away everything on the backtrack stack since the start
   1204   // of the negative submatch and restore the character position.
   1205   assembler->ReadCurrentPositionFromRegister(current_position_register_);
   1206   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
   1207   if (clear_capture_count_ > 0) {
   1208     // Clear any captures that might have been performed during the success
   1209     // of the body of the negative look-ahead.
   1210     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
   1211     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
   1212   }
   1213   // Now that we have unwound the stack we find at the top of the stack the
   1214   // backtrack that the BeginSubmatch node got.
   1215   assembler->Backtrack();
   1216 }
   1217 
   1218 
   1219 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   1220   if (!trace->is_trivial()) {
   1221     trace->Flush(compiler, this);
   1222     return;
   1223   }
   1224   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1225   if (!label()->is_bound()) {
   1226     assembler->Bind(label());
   1227   }
   1228   switch (action_) {
   1229     case ACCEPT:
   1230       assembler->Succeed();
   1231       return;
   1232     case BACKTRACK:
   1233       assembler->GoTo(trace->backtrack());
   1234       return;
   1235     case NEGATIVE_SUBMATCH_SUCCESS:
   1236       // This case is handled in a different virtual method.
   1237       UNREACHABLE();
   1238   }
   1239   UNIMPLEMENTED();
   1240 }
   1241 
   1242 
   1243 void GuardedAlternative::AddGuard(Guard* guard) {
   1244   if (guards_ == NULL)
   1245     guards_ = new ZoneList<Guard*>(1);
   1246   guards_->Add(guard);
   1247 }
   1248 
   1249 
   1250 ActionNode* ActionNode::SetRegister(int reg,
   1251                                     int val,
   1252                                     RegExpNode* on_success) {
   1253   ActionNode* result = new ActionNode(SET_REGISTER, on_success);
   1254   result->data_.u_store_register.reg = reg;
   1255   result->data_.u_store_register.value = val;
   1256   return result;
   1257 }
   1258 
   1259 
   1260 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
   1261   ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
   1262   result->data_.u_increment_register.reg = reg;
   1263   return result;
   1264 }
   1265 
   1266 
   1267 ActionNode* ActionNode::StorePosition(int reg,
   1268                                       bool is_capture,
   1269                                       RegExpNode* on_success) {
   1270   ActionNode* result = new ActionNode(STORE_POSITION, on_success);
   1271   result->data_.u_position_register.reg = reg;
   1272   result->data_.u_position_register.is_capture = is_capture;
   1273   return result;
   1274 }
   1275 
   1276 
   1277 ActionNode* ActionNode::ClearCaptures(Interval range,
   1278                                       RegExpNode* on_success) {
   1279   ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
   1280   result->data_.u_clear_captures.range_from = range.from();
   1281   result->data_.u_clear_captures.range_to = range.to();
   1282   return result;
   1283 }
   1284 
   1285 
   1286 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
   1287                                       int position_reg,
   1288                                       RegExpNode* on_success) {
   1289   ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
   1290   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1291   result->data_.u_submatch.current_position_register = position_reg;
   1292   return result;
   1293 }
   1294 
   1295 
   1296 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
   1297                                                 int position_reg,
   1298                                                 int clear_register_count,
   1299                                                 int clear_register_from,
   1300                                                 RegExpNode* on_success) {
   1301   ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
   1302   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1303   result->data_.u_submatch.current_position_register = position_reg;
   1304   result->data_.u_submatch.clear_register_count = clear_register_count;
   1305   result->data_.u_submatch.clear_register_from = clear_register_from;
   1306   return result;
   1307 }
   1308 
   1309 
   1310 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
   1311                                         int repetition_register,
   1312                                         int repetition_limit,
   1313                                         RegExpNode* on_success) {
   1314   ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
   1315   result->data_.u_empty_match_check.start_register = start_register;
   1316   result->data_.u_empty_match_check.repetition_register = repetition_register;
   1317   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
   1318   return result;
   1319 }
   1320 
   1321 
   1322 #define DEFINE_ACCEPT(Type)                                          \
   1323   void Type##Node::Accept(NodeVisitor* visitor) {                    \
   1324     visitor->Visit##Type(this);                                      \
   1325   }
   1326 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
   1327 #undef DEFINE_ACCEPT
   1328 
   1329 
   1330 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
   1331   visitor->VisitLoopChoice(this);
   1332 }
   1333 
   1334 
   1335 // -------------------------------------------------------------------
   1336 // Emit code.
   1337 
   1338 
   1339 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
   1340                                Guard* guard,
   1341                                Trace* trace) {
   1342   switch (guard->op()) {
   1343     case Guard::LT:
   1344       ASSERT(!trace->mentions_reg(guard->reg()));
   1345       macro_assembler->IfRegisterGE(guard->reg(),
   1346                                     guard->value(),
   1347                                     trace->backtrack());
   1348       break;
   1349     case Guard::GEQ:
   1350       ASSERT(!trace->mentions_reg(guard->reg()));
   1351       macro_assembler->IfRegisterLT(guard->reg(),
   1352                                     guard->value(),
   1353                                     trace->backtrack());
   1354       break;
   1355   }
   1356 }
   1357 
   1358 
   1359 // Returns the number of characters in the equivalence class, omitting those
   1360 // that cannot occur in the source string because it is ASCII.
   1361 static int GetCaseIndependentLetters(Isolate* isolate,
   1362                                      uc16 character,
   1363                                      bool ascii_subject,
   1364                                      unibrow::uchar* letters) {
   1365   int length =
   1366       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
   1367   // Unibrow returns 0 or 1 for characters where case independence is
   1368   // trivial.
   1369   if (length == 0) {
   1370     letters[0] = character;
   1371     length = 1;
   1372   }
   1373   if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
   1374     return length;
   1375   }
   1376   // The standard requires that non-ASCII characters cannot have ASCII
   1377   // character codes in their equivalence class.
   1378   return 0;
   1379 }
   1380 
   1381 
   1382 static inline bool EmitSimpleCharacter(Isolate* isolate,
   1383                                        RegExpCompiler* compiler,
   1384                                        uc16 c,
   1385                                        Label* on_failure,
   1386                                        int cp_offset,
   1387                                        bool check,
   1388                                        bool preloaded) {
   1389   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1390   bool bound_checked = false;
   1391   if (!preloaded) {
   1392     assembler->LoadCurrentCharacter(
   1393         cp_offset,
   1394         on_failure,
   1395         check);
   1396     bound_checked = true;
   1397   }
   1398   assembler->CheckNotCharacter(c, on_failure);
   1399   return bound_checked;
   1400 }
   1401 
   1402 
   1403 // Only emits non-letters (things that don't have case).  Only used for case
   1404 // independent matches.
   1405 static inline bool EmitAtomNonLetter(Isolate* isolate,
   1406                                      RegExpCompiler* compiler,
   1407                                      uc16 c,
   1408                                      Label* on_failure,
   1409                                      int cp_offset,
   1410                                      bool check,
   1411                                      bool preloaded) {
   1412   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1413   bool ascii = compiler->ascii();
   1414   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1415   int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
   1416   if (length < 1) {
   1417     // This can't match.  Must be an ASCII subject and a non-ASCII character.
   1418     // We do not need to do anything since the ASCII pass already handled this.
   1419     return false;  // Bounds not checked.
   1420   }
   1421   bool checked = false;
   1422   // We handle the length > 1 case in a later pass.
   1423   if (length == 1) {
   1424     if (ascii && c > String::kMaxAsciiCharCodeU) {
   1425       // Can't match - see above.
   1426       return false;  // Bounds not checked.
   1427     }
   1428     if (!preloaded) {
   1429       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1430       checked = check;
   1431     }
   1432     macro_assembler->CheckNotCharacter(c, on_failure);
   1433   }
   1434   return checked;
   1435 }
   1436 
   1437 
   1438 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
   1439                                       bool ascii,
   1440                                       uc16 c1,
   1441                                       uc16 c2,
   1442                                       Label* on_failure) {
   1443   uc16 char_mask;
   1444   if (ascii) {
   1445     char_mask = String::kMaxAsciiCharCode;
   1446   } else {
   1447     char_mask = String::kMaxUtf16CodeUnit;
   1448   }
   1449   uc16 exor = c1 ^ c2;
   1450   // Check whether exor has only one bit set.
   1451   if (((exor - 1) & exor) == 0) {
   1452     // If c1 and c2 differ only by one bit.
   1453     // Ecma262UnCanonicalize always gives the highest number last.
   1454     ASSERT(c2 > c1);
   1455     uc16 mask = char_mask ^ exor;
   1456     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
   1457     return true;
   1458   }
   1459   ASSERT(c2 > c1);
   1460   uc16 diff = c2 - c1;
   1461   if (((diff - 1) & diff) == 0 && c1 >= diff) {
   1462     // If the characters differ by 2^n but don't differ by one bit then
   1463     // subtract the difference from the found character, then do the or
   1464     // trick.  We avoid the theoretical case where negative numbers are
   1465     // involved in order to simplify code generation.
   1466     uc16 mask = char_mask ^ diff;
   1467     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
   1468                                                     diff,
   1469                                                     mask,
   1470                                                     on_failure);
   1471     return true;
   1472   }
   1473   return false;
   1474 }
   1475 
   1476 
   1477 typedef bool EmitCharacterFunction(Isolate* isolate,
   1478                                    RegExpCompiler* compiler,
   1479                                    uc16 c,
   1480                                    Label* on_failure,
   1481                                    int cp_offset,
   1482                                    bool check,
   1483                                    bool preloaded);
   1484 
   1485 // Only emits letters (things that have case).  Only used for case independent
   1486 // matches.
   1487 static inline bool EmitAtomLetter(Isolate* isolate,
   1488                                   RegExpCompiler* compiler,
   1489                                   uc16 c,
   1490                                   Label* on_failure,
   1491                                   int cp_offset,
   1492                                   bool check,
   1493                                   bool preloaded) {
   1494   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1495   bool ascii = compiler->ascii();
   1496   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1497   int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
   1498   if (length <= 1) return false;
   1499   // We may not need to check against the end of the input string
   1500   // if this character lies before a character that matched.
   1501   if (!preloaded) {
   1502     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1503   }
   1504   Label ok;
   1505   ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
   1506   switch (length) {
   1507     case 2: {
   1508       if (ShortCutEmitCharacterPair(macro_assembler,
   1509                                     ascii,
   1510                                     chars[0],
   1511                                     chars[1],
   1512                                     on_failure)) {
   1513       } else {
   1514         macro_assembler->CheckCharacter(chars[0], &ok);
   1515         macro_assembler->CheckNotCharacter(chars[1], on_failure);
   1516         macro_assembler->Bind(&ok);
   1517       }
   1518       break;
   1519     }
   1520     case 4:
   1521       macro_assembler->CheckCharacter(chars[3], &ok);
   1522       // Fall through!
   1523     case 3:
   1524       macro_assembler->CheckCharacter(chars[0], &ok);
   1525       macro_assembler->CheckCharacter(chars[1], &ok);
   1526       macro_assembler->CheckNotCharacter(chars[2], on_failure);
   1527       macro_assembler->Bind(&ok);
   1528       break;
   1529     default:
   1530       UNREACHABLE();
   1531       break;
   1532   }
   1533   return true;
   1534 }
   1535 
   1536 
   1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
   1538                           RegExpCharacterClass* cc,
   1539                           bool ascii,
   1540                           Label* on_failure,
   1541                           int cp_offset,
   1542                           bool check_offset,
   1543                           bool preloaded) {
   1544   ZoneList<CharacterRange>* ranges = cc->ranges();
   1545   int max_char;
   1546   if (ascii) {
   1547     max_char = String::kMaxAsciiCharCode;
   1548   } else {
   1549     max_char = String::kMaxUtf16CodeUnit;
   1550   }
   1551 
   1552   Label success;
   1553 
   1554   Label* char_is_in_class =
   1555       cc->is_negated() ? on_failure : &success;
   1556 
   1557   int range_count = ranges->length();
   1558 
   1559   int last_valid_range = range_count - 1;
   1560   while (last_valid_range >= 0) {
   1561     CharacterRange& range = ranges->at(last_valid_range);
   1562     if (range.from() <= max_char) {
   1563       break;
   1564     }
   1565     last_valid_range--;
   1566   }
   1567 
   1568   if (last_valid_range < 0) {
   1569     if (!cc->is_negated()) {
   1570       // TODO(plesner): We can remove this when the node level does our
   1571       // ASCII optimizations for us.
   1572       macro_assembler->GoTo(on_failure);
   1573     }
   1574     if (check_offset) {
   1575       macro_assembler->CheckPosition(cp_offset, on_failure);
   1576     }
   1577     return;
   1578   }
   1579 
   1580   if (last_valid_range == 0 &&
   1581       !cc->is_negated() &&
   1582       ranges->at(0).IsEverything(max_char)) {
   1583     // This is a common case hit by non-anchored expressions.
   1584     if (check_offset) {
   1585       macro_assembler->CheckPosition(cp_offset, on_failure);
   1586     }
   1587     return;
   1588   }
   1589 
   1590   if (!preloaded) {
   1591     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
   1592   }
   1593 
   1594   if (cc->is_standard() &&
   1595         macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
   1596                                                     on_failure)) {
   1597       return;
   1598   }
   1599 
   1600   for (int i = 0; i < last_valid_range; i++) {
   1601     CharacterRange& range = ranges->at(i);
   1602     Label next_range;
   1603     uc16 from = range.from();
   1604     uc16 to = range.to();
   1605     if (from > max_char) {
   1606       continue;
   1607     }
   1608     if (to > max_char) to = max_char;
   1609     if (to == from) {
   1610       macro_assembler->CheckCharacter(to, char_is_in_class);
   1611     } else {
   1612       if (from != 0) {
   1613         macro_assembler->CheckCharacterLT(from, &next_range);
   1614       }
   1615       if (to != max_char) {
   1616         macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
   1617       } else {
   1618         macro_assembler->GoTo(char_is_in_class);
   1619       }
   1620     }
   1621     macro_assembler->Bind(&next_range);
   1622   }
   1623 
   1624   CharacterRange& range = ranges->at(last_valid_range);
   1625   uc16 from = range.from();
   1626   uc16 to = range.to();
   1627 
   1628   if (to > max_char) to = max_char;
   1629   ASSERT(to >= from);
   1630 
   1631   if (to == from) {
   1632     if (cc->is_negated()) {
   1633       macro_assembler->CheckCharacter(to, on_failure);
   1634     } else {
   1635       macro_assembler->CheckNotCharacter(to, on_failure);
   1636     }
   1637   } else {
   1638     if (from != 0) {
   1639       if (cc->is_negated()) {
   1640         macro_assembler->CheckCharacterLT(from, &success);
   1641       } else {
   1642         macro_assembler->CheckCharacterLT(from, on_failure);
   1643       }
   1644     }
   1645     if (to != String::kMaxUtf16CodeUnit) {
   1646       if (cc->is_negated()) {
   1647         macro_assembler->CheckCharacterLT(to + 1, on_failure);
   1648       } else {
   1649         macro_assembler->CheckCharacterGT(to, on_failure);
   1650       }
   1651     } else {
   1652       if (cc->is_negated()) {
   1653         macro_assembler->GoTo(on_failure);
   1654       }
   1655     }
   1656   }
   1657   macro_assembler->Bind(&success);
   1658 }
   1659 
   1660 
   1661 RegExpNode::~RegExpNode() {
   1662 }
   1663 
   1664 
   1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
   1666                                                   Trace* trace) {
   1667   // If we are generating a greedy loop then don't stop and don't reuse code.
   1668   if (trace->stop_node() != NULL) {
   1669     return CONTINUE;
   1670   }
   1671 
   1672   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1673   if (trace->is_trivial()) {
   1674     if (label_.is_bound()) {
   1675       // We are being asked to generate a generic version, but that's already
   1676       // been done so just go to it.
   1677       macro_assembler->GoTo(&label_);
   1678       return DONE;
   1679     }
   1680     if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
   1681       // To avoid too deep recursion we push the node to the work queue and just
   1682       // generate a goto here.
   1683       compiler->AddWork(this);
   1684       macro_assembler->GoTo(&label_);
   1685       return DONE;
   1686     }
   1687     // Generate generic version of the node and bind the label for later use.
   1688     macro_assembler->Bind(&label_);
   1689     return CONTINUE;
   1690   }
   1691 
   1692   // We are being asked to make a non-generic version.  Keep track of how many
   1693   // non-generic versions we generate so as not to overdo it.
   1694   trace_count_++;
   1695   if (FLAG_regexp_optimization &&
   1696       trace_count_ < kMaxCopiesCodeGenerated &&
   1697       compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
   1698     return CONTINUE;
   1699   }
   1700 
   1701   // If we get here code has been generated for this node too many times or
   1702   // recursion is too deep.  Time to switch to a generic version.  The code for
   1703   // generic versions above can handle deep recursion properly.
   1704   trace->Flush(compiler, this);
   1705   return DONE;
   1706 }
   1707 
   1708 
   1709 int ActionNode::EatsAtLeast(int still_to_find,
   1710                             int recursion_depth,
   1711                             bool not_at_start) {
   1712   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1713   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
   1714   return on_success()->EatsAtLeast(still_to_find,
   1715                                    recursion_depth + 1,
   1716                                    not_at_start);
   1717 }
   1718 
   1719 
   1720 int AssertionNode::EatsAtLeast(int still_to_find,
   1721                                int recursion_depth,
   1722                                bool not_at_start) {
   1723   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1724   // If we know we are not at the start and we are asked "how many characters
   1725   // will you match if you succeed?" then we can answer anything since false
   1726   // implies false.  So lets just return the max answer (still_to_find) since
   1727   // that won't prevent us from preloading a lot of characters for the other
   1728   // branches in the node graph.
   1729   if (type() == AT_START && not_at_start) return still_to_find;
   1730   return on_success()->EatsAtLeast(still_to_find,
   1731                                    recursion_depth + 1,
   1732                                    not_at_start);
   1733 }
   1734 
   1735 
   1736 int BackReferenceNode::EatsAtLeast(int still_to_find,
   1737                                    int recursion_depth,
   1738                                    bool not_at_start) {
   1739   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1740   return on_success()->EatsAtLeast(still_to_find,
   1741                                    recursion_depth + 1,
   1742                                    not_at_start);
   1743 }
   1744 
   1745 
   1746 int TextNode::EatsAtLeast(int still_to_find,
   1747                           int recursion_depth,
   1748                           bool not_at_start) {
   1749   int answer = Length();
   1750   if (answer >= still_to_find) return answer;
   1751   if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
   1752   // We are not at start after this node so we set the last argument to 'true'.
   1753   return answer + on_success()->EatsAtLeast(still_to_find - answer,
   1754                                             recursion_depth + 1,
   1755                                             true);
   1756 }
   1757 
   1758 
   1759 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
   1760                                              int recursion_depth,
   1761                                              bool not_at_start) {
   1762   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1763   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   1764   // afterwards.
   1765   RegExpNode* node = alternatives_->at(1).node();
   1766   return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
   1767 }
   1768 
   1769 
   1770 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
   1771     QuickCheckDetails* details,
   1772     RegExpCompiler* compiler,
   1773     int filled_in,
   1774     bool not_at_start) {
   1775   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   1776   // afterwards.
   1777   RegExpNode* node = alternatives_->at(1).node();
   1778   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
   1779 }
   1780 
   1781 
   1782 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
   1783                                   int recursion_depth,
   1784                                   RegExpNode* ignore_this_node,
   1785                                   bool not_at_start) {
   1786   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1787   int min = 100;
   1788   int choice_count = alternatives_->length();
   1789   for (int i = 0; i < choice_count; i++) {
   1790     RegExpNode* node = alternatives_->at(i).node();
   1791     if (node == ignore_this_node) continue;
   1792     int node_eats_at_least = node->EatsAtLeast(still_to_find,
   1793                                                recursion_depth + 1,
   1794                                                not_at_start);
   1795     if (node_eats_at_least < min) min = node_eats_at_least;
   1796   }
   1797   return min;
   1798 }
   1799 
   1800 
   1801 int LoopChoiceNode::EatsAtLeast(int still_to_find,
   1802                                 int recursion_depth,
   1803                                 bool not_at_start) {
   1804   return EatsAtLeastHelper(still_to_find,
   1805                            recursion_depth,
   1806                            loop_node_,
   1807                            not_at_start);
   1808 }
   1809 
   1810 
   1811 int ChoiceNode::EatsAtLeast(int still_to_find,
   1812                             int recursion_depth,
   1813                             bool not_at_start) {
   1814   return EatsAtLeastHelper(still_to_find,
   1815                            recursion_depth,
   1816                            NULL,
   1817                            not_at_start);
   1818 }
   1819 
   1820 
   1821 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
   1822 static inline uint32_t SmearBitsRight(uint32_t v) {
   1823   v |= v >> 1;
   1824   v |= v >> 2;
   1825   v |= v >> 4;
   1826   v |= v >> 8;
   1827   v |= v >> 16;
   1828   return v;
   1829 }
   1830 
   1831 
   1832 bool QuickCheckDetails::Rationalize(bool asc) {
   1833   bool found_useful_op = false;
   1834   uint32_t char_mask;
   1835   if (asc) {
   1836     char_mask = String::kMaxAsciiCharCode;
   1837   } else {
   1838     char_mask = String::kMaxUtf16CodeUnit;
   1839   }
   1840   mask_ = 0;
   1841   value_ = 0;
   1842   int char_shift = 0;
   1843   for (int i = 0; i < characters_; i++) {
   1844     Position* pos = &positions_[i];
   1845     if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
   1846       found_useful_op = true;
   1847     }
   1848     mask_ |= (pos->mask & char_mask) << char_shift;
   1849     value_ |= (pos->value & char_mask) << char_shift;
   1850     char_shift += asc ? 8 : 16;
   1851   }
   1852   return found_useful_op;
   1853 }
   1854 
   1855 
   1856 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
   1857                                 Trace* trace,
   1858                                 bool preload_has_checked_bounds,
   1859                                 Label* on_possible_success,
   1860                                 QuickCheckDetails* details,
   1861                                 bool fall_through_on_failure) {
   1862   if (details->characters() == 0) return false;
   1863   GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
   1864   if (details->cannot_match()) return false;
   1865   if (!details->Rationalize(compiler->ascii())) return false;
   1866   ASSERT(details->characters() == 1 ||
   1867          compiler->macro_assembler()->CanReadUnaligned());
   1868   uint32_t mask = details->mask();
   1869   uint32_t value = details->value();
   1870 
   1871   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1872 
   1873   if (trace->characters_preloaded() != details->characters()) {
   1874     assembler->LoadCurrentCharacter(trace->cp_offset(),
   1875                                     trace->backtrack(),
   1876                                     !preload_has_checked_bounds,
   1877                                     details->characters());
   1878   }
   1879 
   1880 
   1881   bool need_mask = true;
   1882 
   1883   if (details->characters() == 1) {
   1884     // If number of characters preloaded is 1 then we used a byte or 16 bit
   1885     // load so the value is already masked down.
   1886     uint32_t char_mask;
   1887     if (compiler->ascii()) {
   1888       char_mask = String::kMaxAsciiCharCode;
   1889     } else {
   1890       char_mask = String::kMaxUtf16CodeUnit;
   1891     }
   1892     if ((mask & char_mask) == char_mask) need_mask = false;
   1893     mask &= char_mask;
   1894   } else {
   1895     // For 2-character preloads in ASCII mode or 1-character preloads in
   1896     // TWO_BYTE mode we also use a 16 bit load with zero extend.
   1897     if (details->characters() == 2 && compiler->ascii()) {
   1898       if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
   1899     } else if (details->characters() == 1 && !compiler->ascii()) {
   1900       if ((mask & 0xffff) == 0xffff) need_mask = false;
   1901     } else {
   1902       if (mask == 0xffffffff) need_mask = false;
   1903     }
   1904   }
   1905 
   1906   if (fall_through_on_failure) {
   1907     if (need_mask) {
   1908       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
   1909     } else {
   1910       assembler->CheckCharacter(value, on_possible_success);
   1911     }
   1912   } else {
   1913     if (need_mask) {
   1914       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
   1915     } else {
   1916       assembler->CheckNotCharacter(value, trace->backtrack());
   1917     }
   1918   }
   1919   return true;
   1920 }
   1921 
   1922 
   1923 // Here is the meat of GetQuickCheckDetails (see also the comment on the
   1924 // super-class in the .h file).
   1925 //
   1926 // We iterate along the text object, building up for each character a
   1927 // mask and value that can be used to test for a quick failure to match.
   1928 // The masks and values for the positions will be combined into a single
   1929 // machine word for the current character width in order to be used in
   1930 // generating a quick check.
   1931 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1932                                     RegExpCompiler* compiler,
   1933                                     int characters_filled_in,
   1934                                     bool not_at_start) {
   1935   Isolate* isolate = Isolate::Current();
   1936   ASSERT(characters_filled_in < details->characters());
   1937   int characters = details->characters();
   1938   int char_mask;
   1939   if (compiler->ascii()) {
   1940     char_mask = String::kMaxAsciiCharCode;
   1941   } else {
   1942     char_mask = String::kMaxUtf16CodeUnit;
   1943   }
   1944   for (int k = 0; k < elms_->length(); k++) {
   1945     TextElement elm = elms_->at(k);
   1946     if (elm.type == TextElement::ATOM) {
   1947       Vector<const uc16> quarks = elm.data.u_atom->data();
   1948       for (int i = 0; i < characters && i < quarks.length(); i++) {
   1949         QuickCheckDetails::Position* pos =
   1950             details->positions(characters_filled_in);
   1951         uc16 c = quarks[i];
   1952         if (c > char_mask) {
   1953           // If we expect a non-ASCII character from an ASCII string,
   1954           // there is no way we can match. Not even case independent
   1955           // matching can turn an ASCII character into non-ASCII or
   1956           // vice versa.
   1957           details->set_cannot_match();
   1958           pos->determines_perfectly = false;
   1959           return;
   1960         }
   1961         if (compiler->ignore_case()) {
   1962           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1963           int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
   1964                                                  chars);
   1965           ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
   1966           if (length == 1) {
   1967             // This letter has no case equivalents, so it's nice and simple
   1968             // and the mask-compare will determine definitely whether we have
   1969             // a match at this character position.
   1970             pos->mask = char_mask;
   1971             pos->value = c;
   1972             pos->determines_perfectly = true;
   1973           } else {
   1974             uint32_t common_bits = char_mask;
   1975             uint32_t bits = chars[0];
   1976             for (int j = 1; j < length; j++) {
   1977               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
   1978               common_bits ^= differing_bits;
   1979               bits &= common_bits;
   1980             }
   1981             // If length is 2 and common bits has only one zero in it then
   1982             // our mask and compare instruction will determine definitely
   1983             // whether we have a match at this character position.  Otherwise
   1984             // it can only be an approximate check.
   1985             uint32_t one_zero = (common_bits | ~char_mask);
   1986             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
   1987               pos->determines_perfectly = true;
   1988             }
   1989             pos->mask = common_bits;
   1990             pos->value = bits;
   1991           }
   1992         } else {
   1993           // Don't ignore case.  Nice simple case where the mask-compare will
   1994           // determine definitely whether we have a match at this character
   1995           // position.
   1996           pos->mask = char_mask;
   1997           pos->value = c;
   1998           pos->determines_perfectly = true;
   1999         }
   2000         characters_filled_in++;
   2001         ASSERT(characters_filled_in <= details->characters());
   2002         if (characters_filled_in == details->characters()) {
   2003           return;
   2004         }
   2005       }
   2006     } else {
   2007       QuickCheckDetails::Position* pos =
   2008           details->positions(characters_filled_in);
   2009       RegExpCharacterClass* tree = elm.data.u_char_class;
   2010       ZoneList<CharacterRange>* ranges = tree->ranges();
   2011       if (tree->is_negated()) {
   2012         // A quick check uses multi-character mask and compare.  There is no
   2013         // useful way to incorporate a negative char class into this scheme
   2014         // so we just conservatively create a mask and value that will always
   2015         // succeed.
   2016         pos->mask = 0;
   2017         pos->value = 0;
   2018       } else {
   2019         int first_range = 0;
   2020         while (ranges->at(first_range).from() > char_mask) {
   2021           first_range++;
   2022           if (first_range == ranges->length()) {
   2023             details->set_cannot_match();
   2024             pos->determines_perfectly = false;
   2025             return;
   2026           }
   2027         }
   2028         CharacterRange range = ranges->at(first_range);
   2029         uc16 from = range.from();
   2030         uc16 to = range.to();
   2031         if (to > char_mask) {
   2032           to = char_mask;
   2033         }
   2034         uint32_t differing_bits = (from ^ to);
   2035         // A mask and compare is only perfect if the differing bits form a
   2036         // number like 00011111 with one single block of trailing 1s.
   2037         if ((differing_bits & (differing_bits + 1)) == 0 &&
   2038              from + differing_bits == to) {
   2039           pos->determines_perfectly = true;
   2040         }
   2041         uint32_t common_bits = ~SmearBitsRight(differing_bits);
   2042         uint32_t bits = (from & common_bits);
   2043         for (int i = first_range + 1; i < ranges->length(); i++) {
   2044           CharacterRange range = ranges->at(i);
   2045           uc16 from = range.from();
   2046           uc16 to = range.to();
   2047           if (from > char_mask) continue;
   2048           if (to > char_mask) to = char_mask;
   2049           // Here we are combining more ranges into the mask and compare
   2050           // value.  With each new range the mask becomes more sparse and
   2051           // so the chances of a false positive rise.  A character class
   2052           // with multiple ranges is assumed never to be equivalent to a
   2053           // mask and compare operation.
   2054           pos->determines_perfectly = false;
   2055           uint32_t new_common_bits = (from ^ to);
   2056           new_common_bits = ~SmearBitsRight(new_common_bits);
   2057           common_bits &= new_common_bits;
   2058           bits &= new_common_bits;
   2059           uint32_t differing_bits = (from & common_bits) ^ bits;
   2060           common_bits ^= differing_bits;
   2061           bits &= common_bits;
   2062         }
   2063         pos->mask = common_bits;
   2064         pos->value = bits;
   2065       }
   2066       characters_filled_in++;
   2067       ASSERT(characters_filled_in <= details->characters());
   2068       if (characters_filled_in == details->characters()) {
   2069         return;
   2070       }
   2071     }
   2072   }
   2073   ASSERT(characters_filled_in != details->characters());
   2074   on_success()-> GetQuickCheckDetails(details,
   2075                                       compiler,
   2076                                       characters_filled_in,
   2077                                       true);
   2078 }
   2079 
   2080 
   2081 void QuickCheckDetails::Clear() {
   2082   for (int i = 0; i < characters_; i++) {
   2083     positions_[i].mask = 0;
   2084     positions_[i].value = 0;
   2085     positions_[i].determines_perfectly = false;
   2086   }
   2087   characters_ = 0;
   2088 }
   2089 
   2090 
   2091 void QuickCheckDetails::Advance(int by, bool ascii) {
   2092   ASSERT(by >= 0);
   2093   if (by >= characters_) {
   2094     Clear();
   2095     return;
   2096   }
   2097   for (int i = 0; i < characters_ - by; i++) {
   2098     positions_[i] = positions_[by + i];
   2099   }
   2100   for (int i = characters_ - by; i < characters_; i++) {
   2101     positions_[i].mask = 0;
   2102     positions_[i].value = 0;
   2103     positions_[i].determines_perfectly = false;
   2104   }
   2105   characters_ -= by;
   2106   // We could change mask_ and value_ here but we would never advance unless
   2107   // they had already been used in a check and they won't be used again because
   2108   // it would gain us nothing.  So there's no point.
   2109 }
   2110 
   2111 
   2112 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
   2113   ASSERT(characters_ == other->characters_);
   2114   if (other->cannot_match_) {
   2115     return;
   2116   }
   2117   if (cannot_match_) {
   2118     *this = *other;
   2119     return;
   2120   }
   2121   for (int i = from_index; i < characters_; i++) {
   2122     QuickCheckDetails::Position* pos = positions(i);
   2123     QuickCheckDetails::Position* other_pos = other->positions(i);
   2124     if (pos->mask != other_pos->mask ||
   2125         pos->value != other_pos->value ||
   2126         !other_pos->determines_perfectly) {
   2127       // Our mask-compare operation will be approximate unless we have the
   2128       // exact same operation on both sides of the alternation.
   2129       pos->determines_perfectly = false;
   2130     }
   2131     pos->mask &= other_pos->mask;
   2132     pos->value &= pos->mask;
   2133     other_pos->value &= pos->mask;
   2134     uc16 differing_bits = (pos->value ^ other_pos->value);
   2135     pos->mask &= ~differing_bits;
   2136     pos->value &= pos->mask;
   2137   }
   2138 }
   2139 
   2140 
   2141 class VisitMarker {
   2142  public:
   2143   explicit VisitMarker(NodeInfo* info) : info_(info) {
   2144     ASSERT(!info->visited);
   2145     info->visited = true;
   2146   }
   2147   ~VisitMarker() {
   2148     info_->visited = false;
   2149   }
   2150  private:
   2151   NodeInfo* info_;
   2152 };
   2153 
   2154 
   2155 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2156                                           RegExpCompiler* compiler,
   2157                                           int characters_filled_in,
   2158                                           bool not_at_start) {
   2159   if (body_can_be_zero_length_ || info()->visited) return;
   2160   VisitMarker marker(info());
   2161   return ChoiceNode::GetQuickCheckDetails(details,
   2162                                           compiler,
   2163                                           characters_filled_in,
   2164                                           not_at_start);
   2165 }
   2166 
   2167 
   2168 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2169                                       RegExpCompiler* compiler,
   2170                                       int characters_filled_in,
   2171                                       bool not_at_start) {
   2172   not_at_start = (not_at_start || not_at_start_);
   2173   int choice_count = alternatives_->length();
   2174   ASSERT(choice_count > 0);
   2175   alternatives_->at(0).node()->GetQuickCheckDetails(details,
   2176                                                     compiler,
   2177                                                     characters_filled_in,
   2178                                                     not_at_start);
   2179   for (int i = 1; i < choice_count; i++) {
   2180     QuickCheckDetails new_details(details->characters());
   2181     RegExpNode* node = alternatives_->at(i).node();
   2182     node->GetQuickCheckDetails(&new_details, compiler,
   2183                                characters_filled_in,
   2184                                not_at_start);
   2185     // Here we merge the quick match details of the two branches.
   2186     details->Merge(&new_details, characters_filled_in);
   2187   }
   2188 }
   2189 
   2190 
   2191 // Check for [0-9A-Z_a-z].
   2192 static void EmitWordCheck(RegExpMacroAssembler* assembler,
   2193                           Label* word,
   2194                           Label* non_word,
   2195                           bool fall_through_on_word) {
   2196   if (assembler->CheckSpecialCharacterClass(
   2197           fall_through_on_word ? 'w' : 'W',
   2198           fall_through_on_word ? non_word : word)) {
   2199     // Optimized implementation available.
   2200     return;
   2201   }
   2202   assembler->CheckCharacterGT('z', non_word);
   2203   assembler->CheckCharacterLT('0', non_word);
   2204   assembler->CheckCharacterGT('a' - 1, word);
   2205   assembler->CheckCharacterLT('9' + 1, word);
   2206   assembler->CheckCharacterLT('A', non_word);
   2207   assembler->CheckCharacterLT('Z' + 1, word);
   2208   if (fall_through_on_word) {
   2209     assembler->CheckNotCharacter('_', non_word);
   2210   } else {
   2211     assembler->CheckCharacter('_', word);
   2212   }
   2213 }
   2214 
   2215 
   2216 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
   2217 // that matches newline or the start of input).
   2218 static void EmitHat(RegExpCompiler* compiler,
   2219                     RegExpNode* on_success,
   2220                     Trace* trace) {
   2221   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2222   // We will be loading the previous character into the current character
   2223   // register.
   2224   Trace new_trace(*trace);
   2225   new_trace.InvalidateCurrentCharacter();
   2226 
   2227   Label ok;
   2228   if (new_trace.cp_offset() == 0) {
   2229     // The start of input counts as a newline in this context, so skip to
   2230     // ok if we are at the start.
   2231     assembler->CheckAtStart(&ok);
   2232   }
   2233   // We already checked that we are not at the start of input so it must be
   2234   // OK to load the previous character.
   2235   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
   2236                                   new_trace.backtrack(),
   2237                                   false);
   2238   if (!assembler->CheckSpecialCharacterClass('n',
   2239                                              new_trace.backtrack())) {
   2240     // Newline means \n, \r, 0x2028 or 0x2029.
   2241     if (!compiler->ascii()) {
   2242       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
   2243     }
   2244     assembler->CheckCharacter('\n', &ok);
   2245     assembler->CheckNotCharacter('\r', new_trace.backtrack());
   2246   }
   2247   assembler->Bind(&ok);
   2248   on_success->Emit(compiler, &new_trace);
   2249 }
   2250 
   2251 
   2252 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
   2253 // when we know whether the next character must be a word character or not.
   2254 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
   2255                                   RegExpCompiler* compiler,
   2256                                   RegExpNode* on_success,
   2257                                   Trace* trace) {
   2258   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2259   Label done;
   2260 
   2261   Trace new_trace(*trace);
   2262 
   2263   bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
   2264   Label* on_word = expect_word_character ? &done : new_trace.backtrack();
   2265   Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
   2266 
   2267   // Check whether previous character was a word character.
   2268   switch (trace->at_start()) {
   2269     case Trace::TRUE:
   2270       if (expect_word_character) {
   2271         assembler->GoTo(on_non_word);
   2272       }
   2273       break;
   2274     case Trace::UNKNOWN:
   2275       ASSERT_EQ(0, trace->cp_offset());
   2276       assembler->CheckAtStart(on_non_word);
   2277       // Fall through.
   2278     case Trace::FALSE:
   2279       int prev_char_offset = trace->cp_offset() - 1;
   2280       assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
   2281       EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
   2282       // We may or may not have loaded the previous character.
   2283       new_trace.InvalidateCurrentCharacter();
   2284   }
   2285 
   2286   assembler->Bind(&done);
   2287 
   2288   on_success->Emit(compiler, &new_trace);
   2289 }
   2290 
   2291 
   2292 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
   2293 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
   2294                               RegExpCompiler* compiler,
   2295                               RegExpNode* on_success,
   2296                               Trace* trace) {
   2297   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2298   Label before_non_word;
   2299   Label before_word;
   2300   if (trace->characters_preloaded() != 1) {
   2301     assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
   2302   }
   2303   // Fall through on non-word.
   2304   EmitWordCheck(assembler, &before_word, &before_non_word, false);
   2305 
   2306   // We will be loading the previous character into the current character
   2307   // register.
   2308   Trace new_trace(*trace);
   2309   new_trace.InvalidateCurrentCharacter();
   2310 
   2311   Label ok;
   2312   Label* boundary;
   2313   Label* not_boundary;
   2314   if (type == AssertionNode::AT_BOUNDARY) {
   2315     boundary = &ok;
   2316     not_boundary = new_trace.backtrack();
   2317   } else {
   2318     not_boundary = &ok;
   2319     boundary = new_trace.backtrack();
   2320   }
   2321 
   2322   // Next character is not a word character.
   2323   assembler->Bind(&before_non_word);
   2324   if (new_trace.cp_offset() == 0) {
   2325     // The start of input counts as a non-word character, so the question is
   2326     // decided if we are at the start.
   2327     assembler->CheckAtStart(not_boundary);
   2328   }
   2329   // We already checked that we are not at the start of input so it must be
   2330   // OK to load the previous character.
   2331   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2332                                   &ok,  // Unused dummy label in this call.
   2333                                   false);
   2334   // Fall through on non-word.
   2335   EmitWordCheck(assembler, boundary, not_boundary, false);
   2336   assembler->GoTo(not_boundary);
   2337 
   2338   // Next character is a word character.
   2339   assembler->Bind(&before_word);
   2340   if (new_trace.cp_offset() == 0) {
   2341     // The start of input counts as a non-word character, so the question is
   2342     // decided if we are at the start.
   2343     assembler->CheckAtStart(boundary);
   2344   }
   2345   // We already checked that we are not at the start of input so it must be
   2346   // OK to load the previous character.
   2347   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2348                                   &ok,  // Unused dummy label in this call.
   2349                                   false);
   2350   bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
   2351   EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
   2352 
   2353   assembler->Bind(&ok);
   2354 
   2355   on_success->Emit(compiler, &new_trace);
   2356 }
   2357 
   2358 
   2359 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2360                                          RegExpCompiler* compiler,
   2361                                          int filled_in,
   2362                                          bool not_at_start) {
   2363   if (type_ == AT_START && not_at_start) {
   2364     details->set_cannot_match();
   2365     return;
   2366   }
   2367   return on_success()->GetQuickCheckDetails(details,
   2368                                             compiler,
   2369                                             filled_in,
   2370                                             not_at_start);
   2371 }
   2372 
   2373 
   2374 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2375   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2376   switch (type_) {
   2377     case AT_END: {
   2378       Label ok;
   2379       assembler->CheckPosition(trace->cp_offset(), &ok);
   2380       assembler->GoTo(trace->backtrack());
   2381       assembler->Bind(&ok);
   2382       break;
   2383     }
   2384     case AT_START: {
   2385       if (trace->at_start() == Trace::FALSE) {
   2386         assembler->GoTo(trace->backtrack());
   2387         return;
   2388       }
   2389       if (trace->at_start() == Trace::UNKNOWN) {
   2390         assembler->CheckNotAtStart(trace->backtrack());
   2391         Trace at_start_trace = *trace;
   2392         at_start_trace.set_at_start(true);
   2393         on_success()->Emit(compiler, &at_start_trace);
   2394         return;
   2395       }
   2396     }
   2397     break;
   2398     case AFTER_NEWLINE:
   2399       EmitHat(compiler, on_success(), trace);
   2400       return;
   2401     case AT_BOUNDARY:
   2402     case AT_NON_BOUNDARY: {
   2403       EmitBoundaryCheck(type_, compiler, on_success(), trace);
   2404       return;
   2405     }
   2406     case AFTER_WORD_CHARACTER:
   2407     case AFTER_NONWORD_CHARACTER: {
   2408       EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
   2409     }
   2410   }
   2411   on_success()->Emit(compiler, trace);
   2412 }
   2413 
   2414 
   2415 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
   2416   if (quick_check == NULL) return false;
   2417   if (offset >= quick_check->characters()) return false;
   2418   return quick_check->positions(offset)->determines_perfectly;
   2419 }
   2420 
   2421 
   2422 static void UpdateBoundsCheck(int index, int* checked_up_to) {
   2423   if (index > *checked_up_to) {
   2424     *checked_up_to = index;
   2425   }
   2426 }
   2427 
   2428 
   2429 // We call this repeatedly to generate code for each pass over the text node.
   2430 // The passes are in increasing order of difficulty because we hope one
   2431 // of the first passes will fail in which case we are saved the work of the
   2432 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
   2433 // we will check the '%' in the first pass, the case independent 'a' in the
   2434 // second pass and the character class in the last pass.
   2435 //
   2436 // The passes are done from right to left, so for example to test for /bar/
   2437 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
   2438 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
   2439 // bounds check most of the time.  In the example we only need to check for
   2440 // end-of-input when loading the putative 'r'.
   2441 //
   2442 // A slight complication involves the fact that the first character may already
   2443 // be fetched into a register by the previous node.  In this case we want to
   2444 // do the test for that character first.  We do this in separate passes.  The
   2445 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
   2446 // pass has been performed then subsequent passes will have true in
   2447 // first_element_checked to indicate that that character does not need to be
   2448 // checked again.
   2449 //
   2450 // In addition to all this we are passed a Trace, which can
   2451 // contain an AlternativeGeneration object.  In this AlternativeGeneration
   2452 // object we can see details of any quick check that was already passed in
   2453 // order to get to the code we are now generating.  The quick check can involve
   2454 // loading characters, which means we do not need to recheck the bounds
   2455 // up to the limit the quick check already checked.  In addition the quick
   2456 // check can have involved a mask and compare operation which may simplify
   2457 // or obviate the need for further checks at some character positions.
   2458 void TextNode::TextEmitPass(RegExpCompiler* compiler,
   2459                             TextEmitPassType pass,
   2460                             bool preloaded,
   2461                             Trace* trace,
   2462                             bool first_element_checked,
   2463                             int* checked_up_to) {
   2464   Isolate* isolate = Isolate::Current();
   2465   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2466   bool ascii = compiler->ascii();
   2467   Label* backtrack = trace->backtrack();
   2468   QuickCheckDetails* quick_check = trace->quick_check_performed();
   2469   int element_count = elms_->length();
   2470   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
   2471     TextElement elm = elms_->at(i);
   2472     int cp_offset = trace->cp_offset() + elm.cp_offset;
   2473     if (elm.type == TextElement::ATOM) {
   2474       Vector<const uc16> quarks = elm.data.u_atom->data();
   2475       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
   2476         if (first_element_checked && i == 0 && j == 0) continue;
   2477         if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
   2478         EmitCharacterFunction* emit_function = NULL;
   2479         switch (pass) {
   2480           case NON_ASCII_MATCH:
   2481             ASSERT(ascii);
   2482             if (quarks[j] > String::kMaxAsciiCharCode) {
   2483               assembler->GoTo(backtrack);
   2484               return;
   2485             }
   2486             break;
   2487           case NON_LETTER_CHARACTER_MATCH:
   2488             emit_function = &EmitAtomNonLetter;
   2489             break;
   2490           case SIMPLE_CHARACTER_MATCH:
   2491             emit_function = &EmitSimpleCharacter;
   2492             break;
   2493           case CASE_CHARACTER_MATCH:
   2494             emit_function = &EmitAtomLetter;
   2495             break;
   2496           default:
   2497             break;
   2498         }
   2499         if (emit_function != NULL) {
   2500           bool bound_checked = emit_function(isolate,
   2501                                              compiler,
   2502                                              quarks[j],
   2503                                              backtrack,
   2504                                              cp_offset + j,
   2505                                              *checked_up_to < cp_offset + j,
   2506                                              preloaded);
   2507           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
   2508         }
   2509       }
   2510     } else {
   2511       ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
   2512       if (pass == CHARACTER_CLASS_MATCH) {
   2513         if (first_element_checked && i == 0) continue;
   2514         if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
   2515         RegExpCharacterClass* cc = elm.data.u_char_class;
   2516         EmitCharClass(assembler,
   2517                       cc,
   2518                       ascii,
   2519                       backtrack,
   2520                       cp_offset,
   2521                       *checked_up_to < cp_offset,
   2522                       preloaded);
   2523         UpdateBoundsCheck(cp_offset, checked_up_to);
   2524       }
   2525     }
   2526   }
   2527 }
   2528 
   2529 
   2530 int TextNode::Length() {
   2531   TextElement elm = elms_->last();
   2532   ASSERT(elm.cp_offset >= 0);
   2533   if (elm.type == TextElement::ATOM) {
   2534     return elm.cp_offset + elm.data.u_atom->data().length();
   2535   } else {
   2536     return elm.cp_offset + 1;
   2537   }
   2538 }
   2539 
   2540 
   2541 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
   2542   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
   2543   if (ignore_case) {
   2544     return pass == SIMPLE_CHARACTER_MATCH;
   2545   } else {
   2546     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
   2547   }
   2548 }
   2549 
   2550 
   2551 // This generates the code to match a text node.  A text node can contain
   2552 // straight character sequences (possibly to be matched in a case-independent
   2553 // way) and character classes.  For efficiency we do not do this in a single
   2554 // pass from left to right.  Instead we pass over the text node several times,
   2555 // emitting code for some character positions every time.  See the comment on
   2556 // TextEmitPass for details.
   2557 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2558   LimitResult limit_result = LimitVersions(compiler, trace);
   2559   if (limit_result == DONE) return;
   2560   ASSERT(limit_result == CONTINUE);
   2561 
   2562   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
   2563     compiler->SetRegExpTooBig();
   2564     return;
   2565   }
   2566 
   2567   if (compiler->ascii()) {
   2568     int dummy = 0;
   2569     TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
   2570   }
   2571 
   2572   bool first_elt_done = false;
   2573   int bound_checked_to = trace->cp_offset() - 1;
   2574   bound_checked_to += trace->bound_checked_up_to();
   2575 
   2576   // If a character is preloaded into the current character register then
   2577   // check that now.
   2578   if (trace->characters_preloaded() == 1) {
   2579     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   2580       if (!SkipPass(pass, compiler->ignore_case())) {
   2581         TextEmitPass(compiler,
   2582                      static_cast<TextEmitPassType>(pass),
   2583                      true,
   2584                      trace,
   2585                      false,
   2586                      &bound_checked_to);
   2587       }
   2588     }
   2589     first_elt_done = true;
   2590   }
   2591 
   2592   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   2593     if (!SkipPass(pass, compiler->ignore_case())) {
   2594       TextEmitPass(compiler,
   2595                    static_cast<TextEmitPassType>(pass),
   2596                    false,
   2597                    trace,
   2598                    first_elt_done,
   2599                    &bound_checked_to);
   2600     }
   2601   }
   2602 
   2603   Trace successor_trace(*trace);
   2604   successor_trace.set_at_start(false);
   2605   successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
   2606   RecursionCheck rc(compiler);
   2607   on_success()->Emit(compiler, &successor_trace);
   2608 }
   2609 
   2610 
   2611 void Trace::InvalidateCurrentCharacter() {
   2612   characters_preloaded_ = 0;
   2613 }
   2614 
   2615 
   2616 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
   2617   ASSERT(by > 0);
   2618   // We don't have an instruction for shifting the current character register
   2619   // down or for using a shifted value for anything so lets just forget that
   2620   // we preloaded any characters into it.
   2621   characters_preloaded_ = 0;
   2622   // Adjust the offsets of the quick check performed information.  This
   2623   // information is used to find out what we already determined about the
   2624   // characters by means of mask and compare.
   2625   quick_check_performed_.Advance(by, compiler->ascii());
   2626   cp_offset_ += by;
   2627   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
   2628     compiler->SetRegExpTooBig();
   2629     cp_offset_ = 0;
   2630   }
   2631   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
   2632 }
   2633 
   2634 
   2635 void TextNode::MakeCaseIndependent(bool is_ascii) {
   2636   int element_count = elms_->length();
   2637   for (int i = 0; i < element_count; i++) {
   2638     TextElement elm = elms_->at(i);
   2639     if (elm.type == TextElement::CHAR_CLASS) {
   2640       RegExpCharacterClass* cc = elm.data.u_char_class;
   2641       // None of the standard character classes is different in the case
   2642       // independent case and it slows us down if we don't know that.
   2643       if (cc->is_standard()) continue;
   2644       ZoneList<CharacterRange>* ranges = cc->ranges();
   2645       int range_count = ranges->length();
   2646       for (int j = 0; j < range_count; j++) {
   2647         ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
   2648       }
   2649     }
   2650   }
   2651 }
   2652 
   2653 
   2654 int TextNode::GreedyLoopTextLength() {
   2655   TextElement elm = elms_->at(elms_->length() - 1);
   2656   if (elm.type == TextElement::CHAR_CLASS) {
   2657     return elm.cp_offset + 1;
   2658   } else {
   2659     return elm.cp_offset + elm.data.u_atom->data().length();
   2660   }
   2661 }
   2662 
   2663 
   2664 // Finds the fixed match length of a sequence of nodes that goes from
   2665 // this alternative and back to this choice node.  If there are variable
   2666 // length nodes or other complications in the way then return a sentinel
   2667 // value indicating that a greedy loop cannot be constructed.
   2668 int ChoiceNode::GreedyLoopTextLengthForAlternative(
   2669     GuardedAlternative* alternative) {
   2670   int length = 0;
   2671   RegExpNode* node = alternative->node();
   2672   // Later we will generate code for all these text nodes using recursion
   2673   // so we have to limit the max number.
   2674   int recursion_depth = 0;
   2675   while (node != this) {
   2676     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
   2677       return kNodeIsTooComplexForGreedyLoops;
   2678     }
   2679     int node_length = node->GreedyLoopTextLength();
   2680     if (node_length == kNodeIsTooComplexForGreedyLoops) {
   2681       return kNodeIsTooComplexForGreedyLoops;
   2682     }
   2683     length += node_length;
   2684     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
   2685     node = seq_node->on_success();
   2686   }
   2687   return length;
   2688 }
   2689 
   2690 
   2691 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
   2692   ASSERT_EQ(loop_node_, NULL);
   2693   AddAlternative(alt);
   2694   loop_node_ = alt.node();
   2695 }
   2696 
   2697 
   2698 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
   2699   ASSERT_EQ(continue_node_, NULL);
   2700   AddAlternative(alt);
   2701   continue_node_ = alt.node();
   2702 }
   2703 
   2704 
   2705 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2706   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2707   if (trace->stop_node() == this) {
   2708     int text_length =
   2709         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
   2710     ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
   2711     // Update the counter-based backtracking info on the stack.  This is an
   2712     // optimization for greedy loops (see below).
   2713     ASSERT(trace->cp_offset() == text_length);
   2714     macro_assembler->AdvanceCurrentPosition(text_length);
   2715     macro_assembler->GoTo(trace->loop_label());
   2716     return;
   2717   }
   2718   ASSERT(trace->stop_node() == NULL);
   2719   if (!trace->is_trivial()) {
   2720     trace->Flush(compiler, this);
   2721     return;
   2722   }
   2723   ChoiceNode::Emit(compiler, trace);
   2724 }
   2725 
   2726 
   2727 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
   2728                                            bool not_at_start) {
   2729   int preload_characters = EatsAtLeast(4, 0, not_at_start);
   2730   if (compiler->macro_assembler()->CanReadUnaligned()) {
   2731     bool ascii = compiler->ascii();
   2732     if (ascii) {
   2733       if (preload_characters > 4) preload_characters = 4;
   2734       // We can't preload 3 characters because there is no machine instruction
   2735       // to do that.  We can't just load 4 because we could be reading
   2736       // beyond the end of the string, which could cause a memory fault.
   2737       if (preload_characters == 3) preload_characters = 2;
   2738     } else {
   2739       if (preload_characters > 2) preload_characters = 2;
   2740     }
   2741   } else {
   2742     if (preload_characters > 1) preload_characters = 1;
   2743   }
   2744   return preload_characters;
   2745 }
   2746 
   2747 
   2748 // This class is used when generating the alternatives in a choice node.  It
   2749 // records the way the alternative is being code generated.
   2750 class AlternativeGeneration: public Malloced {
   2751  public:
   2752   AlternativeGeneration()
   2753       : possible_success(),
   2754         expects_preload(false),
   2755         after(),
   2756         quick_check_details() { }
   2757   Label possible_success;
   2758   bool expects_preload;
   2759   Label after;
   2760   QuickCheckDetails quick_check_details;
   2761 };
   2762 
   2763 
   2764 // Creates a list of AlternativeGenerations.  If the list has a reasonable
   2765 // size then it is on the stack, otherwise the excess is on the heap.
   2766 class AlternativeGenerationList {
   2767  public:
   2768   explicit AlternativeGenerationList(int count)
   2769       : alt_gens_(count) {
   2770     for (int i = 0; i < count && i < kAFew; i++) {
   2771       alt_gens_.Add(a_few_alt_gens_ + i);
   2772     }
   2773     for (int i = kAFew; i < count; i++) {
   2774       alt_gens_.Add(new AlternativeGeneration());
   2775     }
   2776   }
   2777   ~AlternativeGenerationList() {
   2778     for (int i = kAFew; i < alt_gens_.length(); i++) {
   2779       delete alt_gens_[i];
   2780       alt_gens_[i] = NULL;
   2781     }
   2782   }
   2783 
   2784   AlternativeGeneration* at(int i) {
   2785     return alt_gens_[i];
   2786   }
   2787 
   2788  private:
   2789   static const int kAFew = 10;
   2790   ZoneList<AlternativeGeneration*> alt_gens_;
   2791   AlternativeGeneration a_few_alt_gens_[kAFew];
   2792 };
   2793 
   2794 
   2795 /* Code generation for choice nodes.
   2796  *
   2797  * We generate quick checks that do a mask and compare to eliminate a
   2798  * choice.  If the quick check succeeds then it jumps to the continuation to
   2799  * do slow checks and check subsequent nodes.  If it fails (the common case)
   2800  * it falls through to the next choice.
   2801  *
   2802  * Here is the desired flow graph.  Nodes directly below each other imply
   2803  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
   2804  * 3 doesn't have a quick check so we have to call the slow check.
   2805  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
   2806  * regexp continuation is generated directly after the Sn node, up to the
   2807  * next GoTo if we decide to reuse some already generated code.  Some
   2808  * nodes expect preload_characters to be preloaded into the current
   2809  * character register.  R nodes do this preloading.  Vertices are marked
   2810  * F for failures and S for success (possible success in the case of quick
   2811  * nodes).  L, V, < and > are used as arrow heads.
   2812  *
   2813  * ----------> R
   2814  *             |
   2815  *             V
   2816  *            Q1 -----> S1
   2817  *             |   S   /
   2818  *            F|      /
   2819  *             |    F/
   2820  *             |    /
   2821  *             |   R
   2822  *             |  /
   2823  *             V L
   2824  *            Q2 -----> S2
   2825  *             |   S   /
   2826  *            F|      /
   2827  *             |    F/
   2828  *             |    /
   2829  *             |   R
   2830  *             |  /
   2831  *             V L
   2832  *            S3
   2833  *             |
   2834  *            F|
   2835  *             |
   2836  *             R
   2837  *             |
   2838  * backtrack   V
   2839  * <----------Q4
   2840  *   \    F    |
   2841  *    \        |S
   2842  *     \   F   V
   2843  *      \-----S4
   2844  *
   2845  * For greedy loops we reverse our expectation and expect to match rather
   2846  * than fail. Therefore we want the loop code to look like this (U is the
   2847  * unwind code that steps back in the greedy loop).  The following alternatives
   2848  * look the same as above.
   2849  *              _____
   2850  *             /     \
   2851  *             V     |
   2852  * ----------> S1    |
   2853  *            /|     |
   2854  *           / |S    |
   2855  *         F/  \_____/
   2856  *         /
   2857  *        |<-----------
   2858  *        |            \
   2859  *        V             \
   2860  *        Q2 ---> S2     \
   2861  *        |  S   /       |
   2862  *       F|     /        |
   2863  *        |   F/         |
   2864  *        |   /          |
   2865  *        |  R           |
   2866  *        | /            |
   2867  *   F    VL             |
   2868  * <------U              |
   2869  * back   |S             |
   2870  *        \______________/
   2871  */
   2872 
   2873 
   2874 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2875   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2876   int choice_count = alternatives_->length();
   2877 #ifdef DEBUG
   2878   for (int i = 0; i < choice_count - 1; i++) {
   2879     GuardedAlternative alternative = alternatives_->at(i);
   2880     ZoneList<Guard*>* guards = alternative.guards();
   2881     int guard_count = (guards == NULL) ? 0 : guards->length();
   2882     for (int j = 0; j < guard_count; j++) {
   2883       ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
   2884     }
   2885   }
   2886 #endif
   2887 
   2888   LimitResult limit_result = LimitVersions(compiler, trace);
   2889   if (limit_result == DONE) return;
   2890   ASSERT(limit_result == CONTINUE);
   2891 
   2892   int new_flush_budget = trace->flush_budget() / choice_count;
   2893   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
   2894     trace->Flush(compiler, this);
   2895     return;
   2896   }
   2897 
   2898   RecursionCheck rc(compiler);
   2899 
   2900   Trace* current_trace = trace;
   2901 
   2902   int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
   2903   bool greedy_loop = false;
   2904   Label greedy_loop_label;
   2905   Trace counter_backtrack_trace;
   2906   counter_backtrack_trace.set_backtrack(&greedy_loop_label);
   2907   if (not_at_start()) counter_backtrack_trace.set_at_start(false);
   2908 
   2909   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
   2910     // Here we have special handling for greedy loops containing only text nodes
   2911     // and other simple nodes.  These are handled by pushing the current
   2912     // position on the stack and then incrementing the current position each
   2913     // time around the switch.  On backtrack we decrement the current position
   2914     // and check it against the pushed value.  This avoids pushing backtrack
   2915     // information for each iteration of the loop, which could take up a lot of
   2916     // space.
   2917     greedy_loop = true;
   2918     ASSERT(trace->stop_node() == NULL);
   2919     macro_assembler->PushCurrentPosition();
   2920     current_trace = &counter_backtrack_trace;
   2921     Label greedy_match_failed;
   2922     Trace greedy_match_trace;
   2923     if (not_at_start()) greedy_match_trace.set_at_start(false);
   2924     greedy_match_trace.set_backtrack(&greedy_match_failed);
   2925     Label loop_label;
   2926     macro_assembler->Bind(&loop_label);
   2927     greedy_match_trace.set_stop_node(this);
   2928     greedy_match_trace.set_loop_label(&loop_label);
   2929     alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
   2930     macro_assembler->Bind(&greedy_match_failed);
   2931   }
   2932 
   2933   Label second_choice;  // For use in greedy matches.
   2934   macro_assembler->Bind(&second_choice);
   2935 
   2936   int first_normal_choice = greedy_loop ? 1 : 0;
   2937 
   2938   int preload_characters =
   2939       CalculatePreloadCharacters(compiler,
   2940                                  current_trace->at_start() == Trace::FALSE);
   2941   bool preload_is_current =
   2942       (current_trace->characters_preloaded() == preload_characters);
   2943   bool preload_has_checked_bounds = preload_is_current;
   2944 
   2945   AlternativeGenerationList alt_gens(choice_count);
   2946 
   2947   // For now we just call all choices one after the other.  The idea ultimately
   2948   // is to use the Dispatch table to try only the relevant ones.
   2949   for (int i = first_normal_choice; i < choice_count; i++) {
   2950     GuardedAlternative alternative = alternatives_->at(i);
   2951     AlternativeGeneration* alt_gen = alt_gens.at(i);
   2952     alt_gen->quick_check_details.set_characters(preload_characters);
   2953     ZoneList<Guard*>* guards = alternative.guards();
   2954     int guard_count = (guards == NULL) ? 0 : guards->length();
   2955     Trace new_trace(*current_trace);
   2956     new_trace.set_characters_preloaded(preload_is_current ?
   2957                                          preload_characters :
   2958                                          0);
   2959     if (preload_has_checked_bounds) {
   2960       new_trace.set_bound_checked_up_to(preload_characters);
   2961     }
   2962     new_trace.quick_check_performed()->Clear();
   2963     if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
   2964     alt_gen->expects_preload = preload_is_current;
   2965     bool generate_full_check_inline = false;
   2966     if (FLAG_regexp_optimization &&
   2967         try_to_emit_quick_check_for_alternative(i) &&
   2968         alternative.node()->EmitQuickCheck(compiler,
   2969                                            &new_trace,
   2970                                            preload_has_checked_bounds,
   2971                                            &alt_gen->possible_success,
   2972                                            &alt_gen->quick_check_details,
   2973                                            i < choice_count - 1)) {
   2974       // Quick check was generated for this choice.
   2975       preload_is_current = true;
   2976       preload_has_checked_bounds = true;
   2977       // On the last choice in the ChoiceNode we generated the quick
   2978       // check to fall through on possible success.  So now we need to
   2979       // generate the full check inline.
   2980       if (i == choice_count - 1) {
   2981         macro_assembler->Bind(&alt_gen->possible_success);
   2982         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   2983         new_trace.set_characters_preloaded(preload_characters);
   2984         new_trace.set_bound_checked_up_to(preload_characters);
   2985         generate_full_check_inline = true;
   2986       }
   2987     } else if (alt_gen->quick_check_details.cannot_match()) {
   2988       if (i == choice_count - 1 && !greedy_loop) {
   2989         macro_assembler->GoTo(trace->backtrack());
   2990       }
   2991       continue;
   2992     } else {
   2993       // No quick check was generated.  Put the full code here.
   2994       // If this is not the first choice then there could be slow checks from
   2995       // previous cases that go here when they fail.  There's no reason to
   2996       // insist that they preload characters since the slow check we are about
   2997       // to generate probably can't use it.
   2998       if (i != first_normal_choice) {
   2999         alt_gen->expects_preload = false;
   3000         new_trace.InvalidateCurrentCharacter();
   3001       }
   3002       if (i < choice_count - 1) {
   3003         new_trace.set_backtrack(&alt_gen->after);
   3004       }
   3005       generate_full_check_inline = true;
   3006     }
   3007     if (generate_full_check_inline) {
   3008       if (new_trace.actions() != NULL) {
   3009         new_trace.set_flush_budget(new_flush_budget);
   3010       }
   3011       for (int j = 0; j < guard_count; j++) {
   3012         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
   3013       }
   3014       alternative.node()->Emit(compiler, &new_trace);
   3015       preload_is_current = false;
   3016     }
   3017     macro_assembler->Bind(&alt_gen->after);
   3018   }
   3019   if (greedy_loop) {
   3020     macro_assembler->Bind(&greedy_loop_label);
   3021     // If we have unwound to the bottom then backtrack.
   3022     macro_assembler->CheckGreedyLoop(trace->backtrack());
   3023     // Otherwise try the second priority at an earlier position.
   3024     macro_assembler->AdvanceCurrentPosition(-text_length);
   3025     macro_assembler->GoTo(&second_choice);
   3026   }
   3027 
   3028   // At this point we need to generate slow checks for the alternatives where
   3029   // the quick check was inlined.  We can recognize these because the associated
   3030   // label was bound.
   3031   for (int i = first_normal_choice; i < choice_count - 1; i++) {
   3032     AlternativeGeneration* alt_gen = alt_gens.at(i);
   3033     Trace new_trace(*current_trace);
   3034     // If there are actions to be flushed we have to limit how many times
   3035     // they are flushed.  Take the budget of the parent trace and distribute
   3036     // it fairly amongst the children.
   3037     if (new_trace.actions() != NULL) {
   3038       new_trace.set_flush_budget(new_flush_budget);
   3039     }
   3040     EmitOutOfLineContinuation(compiler,
   3041                               &new_trace,
   3042                               alternatives_->at(i),
   3043                               alt_gen,
   3044                               preload_characters,
   3045                               alt_gens.at(i + 1)->expects_preload);
   3046   }
   3047 }
   3048 
   3049 
   3050 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
   3051                                            Trace* trace,
   3052                                            GuardedAlternative alternative,
   3053                                            AlternativeGeneration* alt_gen,
   3054                                            int preload_characters,
   3055                                            bool next_expects_preload) {
   3056   if (!alt_gen->possible_success.is_linked()) return;
   3057 
   3058   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   3059   macro_assembler->Bind(&alt_gen->possible_success);
   3060   Trace out_of_line_trace(*trace);
   3061   out_of_line_trace.set_characters_preloaded(preload_characters);
   3062   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   3063   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
   3064   ZoneList<Guard*>* guards = alternative.guards();
   3065   int guard_count = (guards == NULL) ? 0 : guards->length();
   3066   if (next_expects_preload) {
   3067     Label reload_current_char;
   3068     out_of_line_trace.set_backtrack(&reload_current_char);
   3069     for (int j = 0; j < guard_count; j++) {
   3070       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   3071     }
   3072     alternative.node()->Emit(compiler, &out_of_line_trace);
   3073     macro_assembler->Bind(&reload_current_char);
   3074     // Reload the current character, since the next quick check expects that.
   3075     // We don't need to check bounds here because we only get into this
   3076     // code through a quick check which already did the checked load.
   3077     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
   3078                                           NULL,
   3079                                           false,
   3080                                           preload_characters);
   3081     macro_assembler->GoTo(&(alt_gen->after));
   3082   } else {
   3083     out_of_line_trace.set_backtrack(&(alt_gen->after));
   3084     for (int j = 0; j < guard_count; j++) {
   3085       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   3086     }
   3087     alternative.node()->Emit(compiler, &out_of_line_trace);
   3088   }
   3089 }
   3090 
   3091 
   3092 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3093   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3094   LimitResult limit_result = LimitVersions(compiler, trace);
   3095   if (limit_result == DONE) return;
   3096   ASSERT(limit_result == CONTINUE);
   3097 
   3098   RecursionCheck rc(compiler);
   3099 
   3100   switch (type_) {
   3101     case STORE_POSITION: {
   3102       Trace::DeferredCapture
   3103           new_capture(data_.u_position_register.reg,
   3104                       data_.u_position_register.is_capture,
   3105                       trace);
   3106       Trace new_trace = *trace;
   3107       new_trace.add_action(&new_capture);
   3108       on_success()->Emit(compiler, &new_trace);
   3109       break;
   3110     }
   3111     case INCREMENT_REGISTER: {
   3112       Trace::DeferredIncrementRegister
   3113           new_increment(data_.u_increment_register.reg);
   3114       Trace new_trace = *trace;
   3115       new_trace.add_action(&new_increment);
   3116       on_success()->Emit(compiler, &new_trace);
   3117       break;
   3118     }
   3119     case SET_REGISTER: {
   3120       Trace::DeferredSetRegister
   3121           new_set(data_.u_store_register.reg, data_.u_store_register.value);
   3122       Trace new_trace = *trace;
   3123       new_trace.add_action(&new_set);
   3124       on_success()->Emit(compiler, &new_trace);
   3125       break;
   3126     }
   3127     case CLEAR_CAPTURES: {
   3128       Trace::DeferredClearCaptures
   3129         new_capture(Interval(data_.u_clear_captures.range_from,
   3130                              data_.u_clear_captures.range_to));
   3131       Trace new_trace = *trace;
   3132       new_trace.add_action(&new_capture);
   3133       on_success()->Emit(compiler, &new_trace);
   3134       break;
   3135     }
   3136     case BEGIN_SUBMATCH:
   3137       if (!trace->is_trivial()) {
   3138         trace->Flush(compiler, this);
   3139       } else {
   3140         assembler->WriteCurrentPositionToRegister(
   3141             data_.u_submatch.current_position_register, 0);
   3142         assembler->WriteStackPointerToRegister(
   3143             data_.u_submatch.stack_pointer_register);
   3144         on_success()->Emit(compiler, trace);
   3145       }
   3146       break;
   3147     case EMPTY_MATCH_CHECK: {
   3148       int start_pos_reg = data_.u_empty_match_check.start_register;
   3149       int stored_pos = 0;
   3150       int rep_reg = data_.u_empty_match_check.repetition_register;
   3151       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
   3152       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
   3153       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
   3154         // If we know we haven't advanced and there is no minimum we
   3155         // can just backtrack immediately.
   3156         assembler->GoTo(trace->backtrack());
   3157       } else if (know_dist && stored_pos < trace->cp_offset()) {
   3158         // If we know we've advanced we can generate the continuation
   3159         // immediately.
   3160         on_success()->Emit(compiler, trace);
   3161       } else if (!trace->is_trivial()) {
   3162         trace->Flush(compiler, this);
   3163       } else {
   3164         Label skip_empty_check;
   3165         // If we have a minimum number of repetitions we check the current
   3166         // number first and skip the empty check if it's not enough.
   3167         if (has_minimum) {
   3168           int limit = data_.u_empty_match_check.repetition_limit;
   3169           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
   3170         }
   3171         // If the match is empty we bail out, otherwise we fall through
   3172         // to the on-success continuation.
   3173         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
   3174                                    trace->backtrack());
   3175         assembler->Bind(&skip_empty_check);
   3176         on_success()->Emit(compiler, trace);
   3177       }
   3178       break;
   3179     }
   3180     case POSITIVE_SUBMATCH_SUCCESS: {
   3181       if (!trace->is_trivial()) {
   3182         trace->Flush(compiler, this);
   3183         return;
   3184       }
   3185       assembler->ReadCurrentPositionFromRegister(
   3186           data_.u_submatch.current_position_register);
   3187       assembler->ReadStackPointerFromRegister(
   3188           data_.u_submatch.stack_pointer_register);
   3189       int clear_register_count = data_.u_submatch.clear_register_count;
   3190       if (clear_register_count == 0) {
   3191         on_success()->Emit(compiler, trace);
   3192         return;
   3193       }
   3194       int clear_registers_from = data_.u_submatch.clear_register_from;
   3195       Label clear_registers_backtrack;
   3196       Trace new_trace = *trace;
   3197       new_trace.set_backtrack(&clear_registers_backtrack);
   3198       on_success()->Emit(compiler, &new_trace);
   3199 
   3200       assembler->Bind(&clear_registers_backtrack);
   3201       int clear_registers_to = clear_registers_from + clear_register_count - 1;
   3202       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
   3203 
   3204       ASSERT(trace->backtrack() == NULL);
   3205       assembler->Backtrack();
   3206       return;
   3207     }
   3208     default:
   3209       UNREACHABLE();
   3210   }
   3211 }
   3212 
   3213 
   3214 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3215   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3216   if (!trace->is_trivial()) {
   3217     trace->Flush(compiler, this);
   3218     return;
   3219   }
   3220 
   3221   LimitResult limit_result = LimitVersions(compiler, trace);
   3222   if (limit_result == DONE) return;
   3223   ASSERT(limit_result == CONTINUE);
   3224 
   3225   RecursionCheck rc(compiler);
   3226 
   3227   ASSERT_EQ(start_reg_ + 1, end_reg_);
   3228   if (compiler->ignore_case()) {
   3229     assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
   3230                                                trace->backtrack());
   3231   } else {
   3232     assembler->CheckNotBackReference(start_reg_, trace->backtrack());
   3233   }
   3234   on_success()->Emit(compiler, trace);
   3235 }
   3236 
   3237 
   3238 // -------------------------------------------------------------------
   3239 // Dot/dotty output
   3240 
   3241 
   3242 #ifdef DEBUG
   3243 
   3244 
   3245 class DotPrinter: public NodeVisitor {
   3246  public:
   3247   explicit DotPrinter(bool ignore_case)
   3248       : ignore_case_(ignore_case),
   3249         stream_(&alloc_) { }
   3250   void PrintNode(const char* label, RegExpNode* node);
   3251   void Visit(RegExpNode* node);
   3252   void PrintAttributes(RegExpNode* from);
   3253   StringStream* stream() { return &stream_; }
   3254   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
   3255 #define DECLARE_VISIT(Type)                                          \
   3256   virtual void Visit##Type(Type##Node* that);
   3257 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   3258 #undef DECLARE_VISIT
   3259  private:
   3260   bool ignore_case_;
   3261   HeapStringAllocator alloc_;
   3262   StringStream stream_;
   3263 };
   3264 
   3265 
   3266 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
   3267   stream()->Add("digraph G {\n  graph [label=\"");
   3268   for (int i = 0; label[i]; i++) {
   3269     switch (label[i]) {
   3270       case '\\':
   3271         stream()->Add("\\\\");
   3272         break;
   3273       case '"':
   3274         stream()->Add("\"");
   3275         break;
   3276       default:
   3277         stream()->Put(label[i]);
   3278         break;
   3279     }
   3280   }
   3281   stream()->Add("\"];\n");
   3282   Visit(node);
   3283   stream()->Add("}\n");
   3284   printf("%s", *(stream()->ToCString()));
   3285 }
   3286 
   3287 
   3288 void DotPrinter::Visit(RegExpNode* node) {
   3289   if (node->info()->visited) return;
   3290   node->info()->visited = true;
   3291   node->Accept(this);
   3292 }
   3293 
   3294 
   3295 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
   3296   stream()->Add("  n%p -> n%p [style=dotted];\n", from, on_failure);
   3297   Visit(on_failure);
   3298 }
   3299 
   3300 
   3301 class TableEntryBodyPrinter {
   3302  public:
   3303   TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
   3304       : stream_(stream), choice_(choice) { }
   3305   void Call(uc16 from, DispatchTable::Entry entry) {
   3306     OutSet* out_set = entry.out_set();
   3307     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   3308       if (out_set->Get(i)) {
   3309         stream()->Add("    n%p:s%io%i -> n%p;\n",
   3310                       choice(),
   3311                       from,
   3312                       i,
   3313                       choice()->alternatives()->at(i).node());
   3314       }
   3315     }
   3316   }
   3317  private:
   3318   StringStream* stream() { return stream_; }
   3319   ChoiceNode* choice() { return choice_; }
   3320   StringStream* stream_;
   3321   ChoiceNode* choice_;
   3322 };
   3323 
   3324 
   3325 class TableEntryHeaderPrinter {
   3326  public:
   3327   explicit TableEntryHeaderPrinter(StringStream* stream)
   3328       : first_(true), stream_(stream) { }
   3329   void Call(uc16 from, DispatchTable::Entry entry) {
   3330     if (first_) {
   3331       first_ = false;
   3332     } else {
   3333       stream()->Add("|");
   3334     }
   3335     stream()->Add("{\\%k-\\%k|{", from, entry.to());
   3336     OutSet* out_set = entry.out_set();
   3337     int priority = 0;
   3338     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   3339       if (out_set->Get(i)) {
   3340         if (priority > 0) stream()->Add("|");
   3341         stream()->Add("<s%io%i> %i", from, i, priority);
   3342         priority++;
   3343       }
   3344     }
   3345     stream()->Add("}}");
   3346   }
   3347 
   3348  private:
   3349   bool first_;
   3350   StringStream* stream() { return stream_; }
   3351   StringStream* stream_;
   3352 };
   3353 
   3354 
   3355 class AttributePrinter {
   3356  public:
   3357   explicit AttributePrinter(DotPrinter* out)
   3358       : out_(out), first_(true) { }
   3359   void PrintSeparator() {
   3360     if (first_) {
   3361       first_ = false;
   3362     } else {
   3363       out_->stream()->Add("|");
   3364     }
   3365   }
   3366   void PrintBit(const char* name, bool value) {
   3367     if (!value) return;
   3368     PrintSeparator();
   3369     out_->stream()->Add("{%s}", name);
   3370   }
   3371   void PrintPositive(const char* name, int value) {
   3372     if (value < 0) return;
   3373     PrintSeparator();
   3374     out_->stream()->Add("{%s|%x}", name, value);
   3375   }
   3376  private:
   3377   DotPrinter* out_;
   3378   bool first_;
   3379 };
   3380 
   3381 
   3382 void DotPrinter::PrintAttributes(RegExpNode* that) {
   3383   stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
   3384                 "margin=0.1, fontsize=10, label=\"{",
   3385                 that);
   3386   AttributePrinter printer(this);
   3387   NodeInfo* info = that->info();
   3388   printer.PrintBit("NI", info->follows_newline_interest);
   3389   printer.PrintBit("WI", info->follows_word_interest);
   3390   printer.PrintBit("SI", info->follows_start_interest);
   3391   Label* label = that->label();
   3392   if (label->is_bound())
   3393     printer.PrintPositive("@", label->pos());
   3394   stream()->Add("}\"];\n");
   3395   stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
   3396                 "arrowhead=none];\n", that, that);
   3397 }
   3398 
   3399 
   3400 static const bool kPrintDispatchTable = false;
   3401 void DotPrinter::VisitChoice(ChoiceNode* that) {
   3402   if (kPrintDispatchTable) {
   3403     stream()->Add("  n%p [shape=Mrecord, label=\"", that);
   3404     TableEntryHeaderPrinter header_printer(stream());
   3405     that->GetTable(ignore_case_)->ForEach(&header_printer);
   3406     stream()->Add("\"]\n", that);
   3407     PrintAttributes(that);
   3408     TableEntryBodyPrinter body_printer(stream(), that);
   3409     that->GetTable(ignore_case_)->ForEach(&body_printer);
   3410   } else {
   3411     stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
   3412     for (int i = 0; i < that->alternatives()->length(); i++) {
   3413       GuardedAlternative alt = that->alternatives()->at(i);
   3414       stream()->Add("  n%p -> n%p;\n", that, alt.node());
   3415     }
   3416   }
   3417   for (int i = 0; i < that->alternatives()->length(); i++) {
   3418     GuardedAlternative alt = that->alternatives()->at(i);
   3419     alt.node()->Accept(this);
   3420   }
   3421 }
   3422 
   3423 
   3424 void DotPrinter::VisitText(TextNode* that) {
   3425   stream()->Add("  n%p [label=\"", that);
   3426   for (int i = 0; i < that->elements()->length(); i++) {
   3427     if (i > 0) stream()->Add(" ");
   3428     TextElement elm = that->elements()->at(i);
   3429     switch (elm.type) {
   3430       case TextElement::ATOM: {
   3431         stream()->Add("'%w'", elm.data.u_atom->data());
   3432         break;
   3433       }
   3434       case TextElement::CHAR_CLASS: {
   3435         RegExpCharacterClass* node = elm.data.u_char_class;
   3436         stream()->Add("[");
   3437         if (node->is_negated())
   3438           stream()->Add("^");
   3439         for (int j = 0; j < node->ranges()->length(); j++) {
   3440           CharacterRange range = node->ranges()->at(j);
   3441           stream()->Add("%k-%k", range.from(), range.to());
   3442         }
   3443         stream()->Add("]");
   3444         break;
   3445       }
   3446       default:
   3447         UNREACHABLE();
   3448     }
   3449   }
   3450   stream()->Add("\", shape=box, peripheries=2];\n");
   3451   PrintAttributes(that);
   3452   stream()->Add("  n%p -> n%p;\n", that, that->on_success());
   3453   Visit(that->on_success());
   3454 }
   3455 
   3456 
   3457 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
   3458   stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
   3459                 that,
   3460                 that->start_register(),
   3461                 that->end_register());
   3462   PrintAttributes(that);
   3463   stream()->Add("  n%p -> n%p;\n", that, that->on_success());
   3464   Visit(that->on_success());
   3465 }
   3466 
   3467 
   3468 void DotPrinter::VisitEnd(EndNode* that) {
   3469   stream()->Add("  n%p [style=bold, shape=point];\n", that);
   3470   PrintAttributes(that);
   3471 }
   3472 
   3473 
   3474 void DotPrinter::VisitAssertion(AssertionNode* that) {
   3475   stream()->Add("  n%p [", that);
   3476   switch (that->type()) {
   3477     case AssertionNode::AT_END:
   3478       stream()->Add("label=\"$\", shape=septagon");
   3479       break;
   3480     case AssertionNode::AT_START:
   3481       stream()->Add("label=\"^\", shape=septagon");
   3482       break;
   3483     case AssertionNode::AT_BOUNDARY:
   3484       stream()->Add("label=\"\\b\", shape=septagon");
   3485       break;
   3486     case AssertionNode::AT_NON_BOUNDARY:
   3487       stream()->Add("label=\"\\B\", shape=septagon");
   3488       break;
   3489     case AssertionNode::AFTER_NEWLINE:
   3490       stream()->Add("label=\"(?<=\\n)\", shape=septagon");
   3491       break;
   3492     case AssertionNode::AFTER_WORD_CHARACTER:
   3493       stream()->Add("label=\"(?<=\\w)\", shape=septagon");
   3494       break;
   3495     case AssertionNode::AFTER_NONWORD_CHARACTER:
   3496       stream()->Add("label=\"(?<=\\W)\", shape=septagon");
   3497       break;
   3498   }
   3499   stream()->Add("];\n");
   3500   PrintAttributes(that);
   3501   RegExpNode* successor = that->on_success();
   3502   stream()->Add("  n%p -> n%p;\n", that, successor);
   3503   Visit(successor);
   3504 }
   3505 
   3506 
   3507 void DotPrinter::VisitAction(ActionNode* that) {
   3508   stream()->Add("  n%p [", that);
   3509   switch (that->type_) {
   3510     case ActionNode::SET_REGISTER:
   3511       stream()->Add("label=\"$%i:=%i\", shape=octagon",
   3512                     that->data_.u_store_register.reg,
   3513                     that->data_.u_store_register.value);
   3514       break;
   3515     case ActionNode::INCREMENT_REGISTER:
   3516       stream()->Add("label=\"$%i++\", shape=octagon",
   3517                     that->data_.u_increment_register.reg);
   3518       break;
   3519     case ActionNode::STORE_POSITION:
   3520       stream()->Add("label=\"$%i:=$pos\", shape=octagon",
   3521                     that->data_.u_position_register.reg);
   3522       break;
   3523     case ActionNode::BEGIN_SUBMATCH:
   3524       stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
   3525                     that->data_.u_submatch.current_position_register);
   3526       break;
   3527     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
   3528       stream()->Add("label=\"escape\", shape=septagon");
   3529       break;
   3530     case ActionNode::EMPTY_MATCH_CHECK:
   3531       stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
   3532                     that->data_.u_empty_match_check.start_register,
   3533                     that->data_.u_empty_match_check.repetition_register,
   3534                     that->data_.u_empty_match_check.repetition_limit);
   3535       break;
   3536     case ActionNode::CLEAR_CAPTURES: {
   3537       stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
   3538                     that->data_.u_clear_captures.range_from,
   3539                     that->data_.u_clear_captures.range_to);
   3540       break;
   3541     }
   3542   }
   3543   stream()->Add("];\n");
   3544   PrintAttributes(that);
   3545   RegExpNode* successor = that->on_success();
   3546   stream()->Add("  n%p -> n%p;\n", that, successor);
   3547   Visit(successor);
   3548 }
   3549 
   3550 
   3551 class DispatchTableDumper {
   3552  public:
   3553   explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
   3554   void Call(uc16 key, DispatchTable::Entry entry);
   3555   StringStream* stream() { return stream_; }
   3556  private:
   3557   StringStream* stream_;
   3558 };
   3559 
   3560 
   3561 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
   3562   stream()->Add("[%k-%k]: {", key, entry.to());
   3563   OutSet* set = entry.out_set();
   3564   bool first = true;
   3565   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   3566     if (set->Get(i)) {
   3567       if (first) {
   3568         first = false;
   3569       } else {
   3570         stream()->Add(", ");
   3571       }
   3572       stream()->Add("%i", i);
   3573     }
   3574   }
   3575   stream()->Add("}\n");
   3576 }
   3577 
   3578 
   3579 void DispatchTable::Dump() {
   3580   HeapStringAllocator alloc;
   3581   StringStream stream(&alloc);
   3582   DispatchTableDumper dumper(&stream);
   3583   tree()->ForEach(&dumper);
   3584   OS::PrintError("%s", *stream.ToCString());
   3585 }
   3586 
   3587 
   3588 void RegExpEngine::DotPrint(const char* label,
   3589                             RegExpNode* node,
   3590                             bool ignore_case) {
   3591   DotPrinter printer(ignore_case);
   3592   printer.PrintNode(label, node);
   3593 }
   3594 
   3595 
   3596 #endif  // DEBUG
   3597 
   3598 
   3599 // -------------------------------------------------------------------
   3600 // Tree to graph conversion
   3601 
   3602 static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0,
   3603     0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
   3604     0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF };
   3605 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
   3606 
   3607 static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
   3608 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
   3609 
   3610 static const uc16 kDigitRanges[] = { '0', '9' };
   3611 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
   3612 
   3613 static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D,
   3614     0x2028, 0x2029 };
   3615 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
   3616 
   3617 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
   3618                                RegExpNode* on_success) {
   3619   ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
   3620   elms->Add(TextElement::Atom(this));
   3621   return new TextNode(elms, on_success);
   3622 }
   3623 
   3624 
   3625 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
   3626                                RegExpNode* on_success) {
   3627   return new TextNode(elements(), on_success);
   3628 }
   3629 
   3630 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
   3631                                  const uc16* special_class,
   3632                                  int length) {
   3633   ASSERT(ranges->length() != 0);
   3634   ASSERT(length != 0);
   3635   ASSERT(special_class[0] != 0);
   3636   if (ranges->length() != (length >> 1) + 1) {
   3637     return false;
   3638   }
   3639   CharacterRange range = ranges->at(0);
   3640   if (range.from() != 0) {
   3641     return false;
   3642   }
   3643   for (int i = 0; i < length; i += 2) {
   3644     if (special_class[i] != (range.to() + 1)) {
   3645       return false;
   3646     }
   3647     range = ranges->at((i >> 1) + 1);
   3648     if (special_class[i+1] != range.from() - 1) {
   3649       return false;
   3650     }
   3651   }
   3652   if (range.to() != 0xffff) {
   3653     return false;
   3654   }
   3655   return true;
   3656 }
   3657 
   3658 
   3659 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
   3660                           const uc16* special_class,
   3661                           int length) {
   3662   if (ranges->length() * 2 != length) {
   3663     return false;
   3664   }
   3665   for (int i = 0; i < length; i += 2) {
   3666     CharacterRange range = ranges->at(i >> 1);
   3667     if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
   3668       return false;
   3669     }
   3670   }
   3671   return true;
   3672 }
   3673 
   3674 
   3675 bool RegExpCharacterClass::is_standard() {
   3676   // TODO(lrn): Remove need for this function, by not throwing away information
   3677   // along the way.
   3678   if (is_negated_) {
   3679     return false;
   3680   }
   3681   if (set_.is_standard()) {
   3682     return true;
   3683   }
   3684   if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
   3685     set_.set_standard_set_type('s');
   3686     return true;
   3687   }
   3688   if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
   3689     set_.set_standard_set_type('S');
   3690     return true;
   3691   }
   3692   if (CompareInverseRanges(set_.ranges(),
   3693                            kLineTerminatorRanges,
   3694                            kLineTerminatorRangeCount)) {
   3695     set_.set_standard_set_type('.');
   3696     return true;
   3697   }
   3698   if (CompareRanges(set_.ranges(),
   3699                     kLineTerminatorRanges,
   3700                     kLineTerminatorRangeCount)) {
   3701     set_.set_standard_set_type('n');
   3702     return true;
   3703   }
   3704   if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
   3705     set_.set_standard_set_type('w');
   3706     return true;
   3707   }
   3708   if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
   3709     set_.set_standard_set_type('W');
   3710     return true;
   3711   }
   3712   return false;
   3713 }
   3714 
   3715 
   3716 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
   3717                                          RegExpNode* on_success) {
   3718   return new TextNode(this, on_success);
   3719 }
   3720 
   3721 
   3722 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
   3723                                       RegExpNode* on_success) {
   3724   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   3725   int length = alternatives->length();
   3726   ChoiceNode* result = new ChoiceNode(length);
   3727   for (int i = 0; i < length; i++) {
   3728     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
   3729                                                                on_success));
   3730     result->AddAlternative(alternative);
   3731   }
   3732   return result;
   3733 }
   3734 
   3735 
   3736 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
   3737                                      RegExpNode* on_success) {
   3738   return ToNode(min(),
   3739                 max(),
   3740                 is_greedy(),
   3741                 body(),
   3742                 compiler,
   3743                 on_success);
   3744 }
   3745 
   3746 
   3747 // Scoped object to keep track of how much we unroll quantifier loops in the
   3748 // regexp graph generator.
   3749 class RegExpExpansionLimiter {
   3750  public:
   3751   static const int kMaxExpansionFactor = 6;
   3752   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
   3753       : compiler_(compiler),
   3754         saved_expansion_factor_(compiler->current_expansion_factor()),
   3755         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
   3756     ASSERT(factor > 0);
   3757     if (ok_to_expand_) {
   3758       if (factor > kMaxExpansionFactor) {
   3759         // Avoid integer overflow of the current expansion factor.
   3760         ok_to_expand_ = false;
   3761         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
   3762       } else {
   3763         int new_factor = saved_expansion_factor_ * factor;
   3764         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
   3765         compiler->set_current_expansion_factor(new_factor);
   3766       }
   3767     }
   3768   }
   3769 
   3770   ~RegExpExpansionLimiter() {
   3771     compiler_->set_current_expansion_factor(saved_expansion_factor_);
   3772   }
   3773 
   3774   bool ok_to_expand() { return ok_to_expand_; }
   3775 
   3776  private:
   3777   RegExpCompiler* compiler_;
   3778   int saved_expansion_factor_;
   3779   bool ok_to_expand_;
   3780 
   3781   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
   3782 };
   3783 
   3784 
   3785 RegExpNode* RegExpQuantifier::ToNode(int min,
   3786                                      int max,
   3787                                      bool is_greedy,
   3788                                      RegExpTree* body,
   3789                                      RegExpCompiler* compiler,
   3790                                      RegExpNode* on_success,
   3791                                      bool not_at_start) {
   3792   // x{f, t} becomes this:
   3793   //
   3794   //             (r++)<-.
   3795   //               |     `
   3796   //               |     (x)
   3797   //               v     ^
   3798   //      (r=0)-->(?)---/ [if r < t]
   3799   //               |
   3800   //   [if r >= f] \----> ...
   3801   //
   3802 
   3803   // 15.10.2.5 RepeatMatcher algorithm.
   3804   // The parser has already eliminated the case where max is 0.  In the case
   3805   // where max_match is zero the parser has removed the quantifier if min was
   3806   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
   3807 
   3808   // If we know that we cannot match zero length then things are a little
   3809   // simpler since we don't need to make the special zero length match check
   3810   // from step 2.1.  If the min and max are small we can unroll a little in
   3811   // this case.
   3812   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
   3813   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
   3814   if (max == 0) return on_success;  // This can happen due to recursion.
   3815   bool body_can_be_empty = (body->min_match() == 0);
   3816   int body_start_reg = RegExpCompiler::kNoRegister;
   3817   Interval capture_registers = body->CaptureRegisters();
   3818   bool needs_capture_clearing = !capture_registers.is_empty();
   3819   if (body_can_be_empty) {
   3820     body_start_reg = compiler->AllocateRegister();
   3821   } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
   3822     // Only unroll if there are no captures and the body can't be
   3823     // empty.
   3824     {
   3825       RegExpExpansionLimiter limiter(
   3826           compiler, min + ((max != min) ? 1 : 0));
   3827       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
   3828         int new_max = (max == kInfinity) ? max : max - min;
   3829         // Recurse once to get the loop or optional matches after the fixed
   3830         // ones.
   3831         RegExpNode* answer = ToNode(
   3832             0, new_max, is_greedy, body, compiler, on_success, true);
   3833         // Unroll the forced matches from 0 to min.  This can cause chains of
   3834         // TextNodes (which the parser does not generate).  These should be
   3835         // combined if it turns out they hinder good code generation.
   3836         for (int i = 0; i < min; i++) {
   3837           answer = body->ToNode(compiler, answer);
   3838         }
   3839         return answer;
   3840       }
   3841     }
   3842     if (max <= kMaxUnrolledMaxMatches && min == 0) {
   3843       ASSERT(max > 0);  // Due to the 'if' above.
   3844       RegExpExpansionLimiter limiter(compiler, max);
   3845       if (limiter.ok_to_expand()) {
   3846         // Unroll the optional matches up to max.
   3847         RegExpNode* answer = on_success;
   3848         for (int i = 0; i < max; i++) {
   3849           ChoiceNode* alternation = new ChoiceNode(2);
   3850           if (is_greedy) {
   3851             alternation->AddAlternative(
   3852                 GuardedAlternative(body->ToNode(compiler, answer)));
   3853             alternation->AddAlternative(GuardedAlternative(on_success));
   3854           } else {
   3855             alternation->AddAlternative(GuardedAlternative(on_success));
   3856             alternation->AddAlternative(
   3857                 GuardedAlternative(body->ToNode(compiler, answer)));
   3858           }
   3859           answer = alternation;
   3860           if (not_at_start) alternation->set_not_at_start();
   3861         }
   3862         return answer;
   3863       }
   3864     }
   3865   }
   3866   bool has_min = min > 0;
   3867   bool has_max = max < RegExpTree::kInfinity;
   3868   bool needs_counter = has_min || has_max;
   3869   int reg_ctr = needs_counter
   3870       ? compiler->AllocateRegister()
   3871       : RegExpCompiler::kNoRegister;
   3872   LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
   3873   if (not_at_start) center->set_not_at_start();
   3874   RegExpNode* loop_return = needs_counter
   3875       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
   3876       : static_cast<RegExpNode*>(center);
   3877   if (body_can_be_empty) {
   3878     // If the body can be empty we need to check if it was and then
   3879     // backtrack.
   3880     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
   3881                                               reg_ctr,
   3882                                               min,
   3883                                               loop_return);
   3884   }
   3885   RegExpNode* body_node = body->ToNode(compiler, loop_return);
   3886   if (body_can_be_empty) {
   3887     // If the body can be empty we need to store the start position
   3888     // so we can bail out if it was empty.
   3889     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
   3890   }
   3891   if (needs_capture_clearing) {
   3892     // Before entering the body of this loop we need to clear captures.
   3893     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
   3894   }
   3895   GuardedAlternative body_alt(body_node);
   3896   if (has_max) {
   3897     Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
   3898     body_alt.AddGuard(body_guard);
   3899   }
   3900   GuardedAlternative rest_alt(on_success);
   3901   if (has_min) {
   3902     Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
   3903     rest_alt.AddGuard(rest_guard);
   3904   }
   3905   if (is_greedy) {
   3906     center->AddLoopAlternative(body_alt);
   3907     center->AddContinueAlternative(rest_alt);
   3908   } else {
   3909     center->AddContinueAlternative(rest_alt);
   3910     center->AddLoopAlternative(body_alt);
   3911   }
   3912   if (needs_counter) {
   3913     return ActionNode::SetRegister(reg_ctr, 0, center);
   3914   } else {
   3915     return center;
   3916   }
   3917 }
   3918 
   3919 
   3920 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
   3921                                     RegExpNode* on_success) {
   3922   NodeInfo info;
   3923   switch (type()) {
   3924     case START_OF_LINE:
   3925       return AssertionNode::AfterNewline(on_success);
   3926     case START_OF_INPUT:
   3927       return AssertionNode::AtStart(on_success);
   3928     case BOUNDARY:
   3929       return AssertionNode::AtBoundary(on_success);
   3930     case NON_BOUNDARY:
   3931       return AssertionNode::AtNonBoundary(on_success);
   3932     case END_OF_INPUT:
   3933       return AssertionNode::AtEnd(on_success);
   3934     case END_OF_LINE: {
   3935       // Compile $ in multiline regexps as an alternation with a positive
   3936       // lookahead in one side and an end-of-input on the other side.
   3937       // We need two registers for the lookahead.
   3938       int stack_pointer_register = compiler->AllocateRegister();
   3939       int position_register = compiler->AllocateRegister();
   3940       // The ChoiceNode to distinguish between a newline and end-of-input.
   3941       ChoiceNode* result = new ChoiceNode(2);
   3942       // Create a newline atom.
   3943       ZoneList<CharacterRange>* newline_ranges =
   3944           new ZoneList<CharacterRange>(3);
   3945       CharacterRange::AddClassEscape('n', newline_ranges);
   3946       RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
   3947       TextNode* newline_matcher = new TextNode(
   3948          newline_atom,
   3949          ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
   3950                                              position_register,
   3951                                              0,  // No captures inside.
   3952                                              -1,  // Ignored if no captures.
   3953                                              on_success));
   3954       // Create an end-of-input matcher.
   3955       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
   3956           stack_pointer_register,
   3957           position_register,
   3958           newline_matcher);
   3959       // Add the two alternatives to the ChoiceNode.
   3960       GuardedAlternative eol_alternative(end_of_line);
   3961       result->AddAlternative(eol_alternative);
   3962       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
   3963       result->AddAlternative(end_alternative);
   3964       return result;
   3965     }
   3966     default:
   3967       UNREACHABLE();
   3968   }
   3969   return on_success;
   3970 }
   3971 
   3972 
   3973 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
   3974                                         RegExpNode* on_success) {
   3975   return new BackReferenceNode(RegExpCapture::StartRegister(index()),
   3976                                RegExpCapture::EndRegister(index()),
   3977                                on_success);
   3978 }
   3979 
   3980 
   3981 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
   3982                                 RegExpNode* on_success) {
   3983   return on_success;
   3984 }
   3985 
   3986 
   3987 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
   3988                                     RegExpNode* on_success) {
   3989   int stack_pointer_register = compiler->AllocateRegister();
   3990   int position_register = compiler->AllocateRegister();
   3991 
   3992   const int registers_per_capture = 2;
   3993   const int register_of_first_capture = 2;
   3994   int register_count = capture_count_ * registers_per_capture;
   3995   int register_start =
   3996     register_of_first_capture + capture_from_ * registers_per_capture;
   3997 
   3998   RegExpNode* success;
   3999   if (is_positive()) {
   4000     RegExpNode* node = ActionNode::BeginSubmatch(
   4001         stack_pointer_register,
   4002         position_register,
   4003         body()->ToNode(
   4004             compiler,
   4005             ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
   4006                                                 position_register,
   4007                                                 register_count,
   4008                                                 register_start,
   4009                                                 on_success)));
   4010     return node;
   4011   } else {
   4012     // We use a ChoiceNode for a negative lookahead because it has most of
   4013     // the characteristics we need.  It has the body of the lookahead as its
   4014     // first alternative and the expression after the lookahead of the second
   4015     // alternative.  If the first alternative succeeds then the
   4016     // NegativeSubmatchSuccess will unwind the stack including everything the
   4017     // choice node set up and backtrack.  If the first alternative fails then
   4018     // the second alternative is tried, which is exactly the desired result
   4019     // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
   4020     // ChoiceNode that knows to ignore the first exit when calculating quick
   4021     // checks.
   4022     GuardedAlternative body_alt(
   4023         body()->ToNode(
   4024             compiler,
   4025             success = new NegativeSubmatchSuccess(stack_pointer_register,
   4026                                                   position_register,
   4027                                                   register_count,
   4028                                                   register_start)));
   4029     ChoiceNode* choice_node =
   4030         new NegativeLookaheadChoiceNode(body_alt,
   4031                                         GuardedAlternative(on_success));
   4032     return ActionNode::BeginSubmatch(stack_pointer_register,
   4033                                      position_register,
   4034                                      choice_node);
   4035   }
   4036 }
   4037 
   4038 
   4039 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
   4040                                   RegExpNode* on_success) {
   4041   return ToNode(body(), index(), compiler, on_success);
   4042 }
   4043 
   4044 
   4045 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
   4046                                   int index,
   4047                                   RegExpCompiler* compiler,
   4048                                   RegExpNode* on_success) {
   4049   int start_reg = RegExpCapture::StartRegister(index);
   4050   int end_reg = RegExpCapture::EndRegister(index);
   4051   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
   4052   RegExpNode* body_node = body->ToNode(compiler, store_end);
   4053   return ActionNode::StorePosition(start_reg, true, body_node);
   4054 }
   4055 
   4056 
   4057 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
   4058                                       RegExpNode* on_success) {
   4059   ZoneList<RegExpTree*>* children = nodes();
   4060   RegExpNode* current = on_success;
   4061   for (int i = children->length() - 1; i >= 0; i--) {
   4062     current = children->at(i)->ToNode(compiler, current);
   4063   }
   4064   return current;
   4065 }
   4066 
   4067 
   4068 static void AddClass(const uc16* elmv,
   4069                      int elmc,
   4070                      ZoneList<CharacterRange>* ranges) {
   4071   for (int i = 0; i < elmc; i += 2) {
   4072     ASSERT(elmv[i] <= elmv[i + 1]);
   4073     ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
   4074   }
   4075 }
   4076 
   4077 
   4078 static void AddClassNegated(const uc16 *elmv,
   4079                             int elmc,
   4080                             ZoneList<CharacterRange>* ranges) {
   4081   ASSERT(elmv[0] != 0x0000);
   4082   ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
   4083   uc16 last = 0x0000;
   4084   for (int i = 0; i < elmc; i += 2) {
   4085     ASSERT(last <= elmv[i] - 1);
   4086     ASSERT(elmv[i] <= elmv[i + 1]);
   4087     ranges->Add(CharacterRange(last, elmv[i] - 1));
   4088     last = elmv[i + 1] + 1;
   4089   }
   4090   ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
   4091 }
   4092 
   4093 
   4094 void CharacterRange::AddClassEscape(uc16 type,
   4095                                     ZoneList<CharacterRange>* ranges) {
   4096   switch (type) {
   4097     case 's':
   4098       AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
   4099       break;
   4100     case 'S':
   4101       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
   4102       break;
   4103     case 'w':
   4104       AddClass(kWordRanges, kWordRangeCount, ranges);
   4105       break;
   4106     case 'W':
   4107       AddClassNegated(kWordRanges, kWordRangeCount, ranges);
   4108       break;
   4109     case 'd':
   4110       AddClass(kDigitRanges, kDigitRangeCount, ranges);
   4111       break;
   4112     case 'D':
   4113       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
   4114       break;
   4115     case '.':
   4116       AddClassNegated(kLineTerminatorRanges,
   4117                       kLineTerminatorRangeCount,
   4118                       ranges);
   4119       break;
   4120     // This is not a character range as defined by the spec but a
   4121     // convenient shorthand for a character class that matches any
   4122     // character.
   4123     case '*':
   4124       ranges->Add(CharacterRange::Everything());
   4125       break;
   4126     // This is the set of characters matched by the $ and ^ symbols
   4127     // in multiline mode.
   4128     case 'n':
   4129       AddClass(kLineTerminatorRanges,
   4130                kLineTerminatorRangeCount,
   4131                ranges);
   4132       break;
   4133     default:
   4134       UNREACHABLE();
   4135   }
   4136 }
   4137 
   4138 
   4139 Vector<const uc16> CharacterRange::GetWordBounds() {
   4140   return Vector<const uc16>(kWordRanges, kWordRangeCount);
   4141 }
   4142 
   4143 
   4144 class CharacterRangeSplitter {
   4145  public:
   4146   CharacterRangeSplitter(ZoneList<CharacterRange>** included,
   4147                           ZoneList<CharacterRange>** excluded)
   4148       : included_(included),
   4149         excluded_(excluded) { }
   4150   void Call(uc16 from, DispatchTable::Entry entry);
   4151 
   4152   static const int kInBase = 0;
   4153   static const int kInOverlay = 1;
   4154 
   4155  private:
   4156   ZoneList<CharacterRange>** included_;
   4157   ZoneList<CharacterRange>** excluded_;
   4158 };
   4159 
   4160 
   4161 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
   4162   if (!entry.out_set()->Get(kInBase)) return;
   4163   ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
   4164     ? included_
   4165     : excluded_;
   4166   if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
   4167   (*target)->Add(CharacterRange(entry.from(), entry.to()));
   4168 }
   4169 
   4170 
   4171 void CharacterRange::Split(ZoneList<CharacterRange>* base,
   4172                            Vector<const uc16> overlay,
   4173                            ZoneList<CharacterRange>** included,
   4174                            ZoneList<CharacterRange>** excluded) {
   4175   ASSERT_EQ(NULL, *included);
   4176   ASSERT_EQ(NULL, *excluded);
   4177   DispatchTable table;
   4178   for (int i = 0; i < base->length(); i++)
   4179     table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
   4180   for (int i = 0; i < overlay.length(); i += 2) {
   4181     table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
   4182                    CharacterRangeSplitter::kInOverlay);
   4183   }
   4184   CharacterRangeSplitter callback(included, excluded);
   4185   table.ForEach(&callback);
   4186 }
   4187 
   4188 
   4189 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
   4190                                         bool is_ascii) {
   4191   Isolate* isolate = Isolate::Current();
   4192   uc16 bottom = from();
   4193   uc16 top = to();
   4194   if (is_ascii) {
   4195     if (bottom > String::kMaxAsciiCharCode) return;
   4196     if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
   4197   }
   4198   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   4199   if (top == bottom) {
   4200     // If this is a singleton we just expand the one character.
   4201     int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
   4202     for (int i = 0; i < length; i++) {
   4203       uc32 chr = chars[i];
   4204       if (chr != bottom) {
   4205         ranges->Add(CharacterRange::Singleton(chars[i]));
   4206       }
   4207     }
   4208   } else {
   4209     // If this is a range we expand the characters block by block,
   4210     // expanding contiguous subranges (blocks) one at a time.
   4211     // The approach is as follows.  For a given start character we
   4212     // look up the remainder of the block that contains it (represented
   4213     // by the end point), for instance we find 'z' if the character
   4214     // is 'c'.  A block is characterized by the property
   4215     // that all characters uncanonicalize in the same way, except that
   4216     // each entry in the result is incremented by the distance from the first
   4217     // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
   4218     // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
   4219     // Once we've found the end point we look up its uncanonicalization
   4220     // and produce a range for each element.  For instance for [c-f]
   4221     // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
   4222     // add a range if it is not already contained in the input, so [c-f]
   4223     // will be skipped but [C-F] will be added.  If this range is not
   4224     // completely contained in a block we do this for all the blocks
   4225     // covered by the range (handling characters that is not in a block
   4226     // as a "singleton block").
   4227     unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   4228     int pos = bottom;
   4229     while (pos < top) {
   4230       int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
   4231       uc16 block_end;
   4232       if (length == 0) {
   4233         block_end = pos;
   4234       } else {
   4235         ASSERT_EQ(1, length);
   4236         block_end = range[0];
   4237       }
   4238       int end = (block_end > top) ? top : block_end;
   4239       length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
   4240       for (int i = 0; i < length; i++) {
   4241         uc32 c = range[i];
   4242         uc16 range_from = c - (block_end - pos);
   4243         uc16 range_to = c - (block_end - end);
   4244         if (!(bottom <= range_from && range_to <= top)) {
   4245           ranges->Add(CharacterRange(range_from, range_to));
   4246         }
   4247       }
   4248       pos = end + 1;
   4249     }
   4250   }
   4251 }
   4252 
   4253 
   4254 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
   4255   ASSERT_NOT_NULL(ranges);
   4256   int n = ranges->length();
   4257   if (n <= 1) return true;
   4258   int max = ranges->at(0).to();
   4259   for (int i = 1; i < n; i++) {
   4260     CharacterRange next_range = ranges->at(i);
   4261     if (next_range.from() <= max + 1) return false;
   4262     max = next_range.to();
   4263   }
   4264   return true;
   4265 }
   4266 
   4267 SetRelation CharacterRange::WordCharacterRelation(
   4268     ZoneList<CharacterRange>* range) {
   4269   ASSERT(IsCanonical(range));
   4270   int i = 0;  // Word character range index.
   4271   int j = 0;  // Argument range index.
   4272   ASSERT_NE(0, kWordRangeCount);
   4273   SetRelation result;
   4274   if (range->length() == 0) {
   4275     result.SetElementsInSecondSet();
   4276     return result;
   4277   }
   4278   CharacterRange argument_range = range->at(0);
   4279   CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
   4280   while (i < kWordRangeCount && j < range->length()) {
   4281     // Check the two ranges for the five cases:
   4282     // - no overlap.
   4283     // - partial overlap (there are elements in both ranges that isn't
   4284     //   in the other, and there are also elements that are in both).
   4285     // - argument range entirely inside word range.
   4286     // - word range entirely inside argument range.
   4287     // - ranges are completely equal.
   4288 
   4289     // First check for no overlap. The earlier range is not in the other set.
   4290     if (argument_range.from() > word_range.to()) {
   4291       // Ranges are disjoint. The earlier word range contains elements that
   4292       // cannot be in the argument set.
   4293       result.SetElementsInSecondSet();
   4294     } else if (word_range.from() > argument_range.to()) {
   4295       // Ranges are disjoint. The earlier argument range contains elements that
   4296       // cannot be in the word set.
   4297       result.SetElementsInFirstSet();
   4298     } else if (word_range.from() <= argument_range.from() &&
   4299                word_range.to() >= argument_range.from()) {
   4300       result.SetElementsInBothSets();
   4301       // argument range completely inside word range.
   4302       if (word_range.from() < argument_range.from() ||
   4303           word_range.to() > argument_range.from()) {
   4304         result.SetElementsInSecondSet();
   4305       }
   4306     } else if (word_range.from() >= argument_range.from() &&
   4307                word_range.to() <= argument_range.from()) {
   4308       result.SetElementsInBothSets();
   4309       result.SetElementsInFirstSet();
   4310     } else {
   4311       // There is overlap, and neither is a subrange of the other
   4312       result.SetElementsInFirstSet();
   4313       result.SetElementsInSecondSet();
   4314       result.SetElementsInBothSets();
   4315     }
   4316     if (result.NonTrivialIntersection()) {
   4317       // The result is as (im)precise as we can possibly make it.
   4318       return result;
   4319     }
   4320     // Progress the range(s) with minimal to-character.
   4321     uc16 word_to = word_range.to();
   4322     uc16 argument_to = argument_range.to();
   4323     if (argument_to <= word_to) {
   4324       j++;
   4325       if (j < range->length()) {
   4326         argument_range = range->at(j);
   4327       }
   4328     }
   4329     if (word_to <= argument_to) {
   4330       i += 2;
   4331       if (i < kWordRangeCount) {
   4332         word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
   4333       }
   4334     }
   4335   }
   4336   // Check if anything wasn't compared in the loop.
   4337   if (i < kWordRangeCount) {
   4338     // word range contains something not in argument range.
   4339     result.SetElementsInSecondSet();
   4340   } else if (j < range->length()) {
   4341     // Argument range contains something not in word range.
   4342     result.SetElementsInFirstSet();
   4343   }
   4344 
   4345   return result;
   4346 }
   4347 
   4348 
   4349 ZoneList<CharacterRange>* CharacterSet::ranges() {
   4350   if (ranges_ == NULL) {
   4351     ranges_ = new ZoneList<CharacterRange>(2);
   4352     CharacterRange::AddClassEscape(standard_set_type_, ranges_);
   4353   }
   4354   return ranges_;
   4355 }
   4356 
   4357 
   4358 // Move a number of elements in a zonelist to another position
   4359 // in the same list. Handles overlapping source and target areas.
   4360 static void MoveRanges(ZoneList<CharacterRange>* list,
   4361                        int from,
   4362                        int to,
   4363                        int count) {
   4364   // Ranges are potentially overlapping.
   4365   if (from < to) {
   4366     for (int i = count - 1; i >= 0; i--) {
   4367       list->at(to + i) = list->at(from + i);
   4368     }
   4369   } else {
   4370     for (int i = 0; i < count; i++) {
   4371       list->at(to + i) = list->at(from + i);
   4372     }
   4373   }
   4374 }
   4375 
   4376 
   4377 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
   4378                                       int count,
   4379                                       CharacterRange insert) {
   4380   // Inserts a range into list[0..count[, which must be sorted
   4381   // by from value and non-overlapping and non-adjacent, using at most
   4382   // list[0..count] for the result. Returns the number of resulting
   4383   // canonicalized ranges. Inserting a range may collapse existing ranges into
   4384   // fewer ranges, so the return value can be anything in the range 1..count+1.
   4385   uc16 from = insert.from();
   4386   uc16 to = insert.to();
   4387   int start_pos = 0;
   4388   int end_pos = count;
   4389   for (int i = count - 1; i >= 0; i--) {
   4390     CharacterRange current = list->at(i);
   4391     if (current.from() > to + 1) {
   4392       end_pos = i;
   4393     } else if (current.to() + 1 < from) {
   4394       start_pos = i + 1;
   4395       break;
   4396     }
   4397   }
   4398 
   4399   // Inserted range overlaps, or is adjacent to, ranges at positions
   4400   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
   4401   // not affected by the insertion.
   4402   // If start_pos == end_pos, the range must be inserted before start_pos.
   4403   // if start_pos < end_pos, the entire range from start_pos to end_pos
   4404   // must be merged with the insert range.
   4405 
   4406   if (start_pos == end_pos) {
   4407     // Insert between existing ranges at position start_pos.
   4408     if (start_pos < count) {
   4409       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
   4410     }
   4411     list->at(start_pos) = insert;
   4412     return count + 1;
   4413   }
   4414   if (start_pos + 1 == end_pos) {
   4415     // Replace single existing range at position start_pos.
   4416     CharacterRange to_replace = list->at(start_pos);
   4417     int new_from = Min(to_replace.from(), from);
   4418     int new_to = Max(to_replace.to(), to);
   4419     list->at(start_pos) = CharacterRange(new_from, new_to);
   4420     return count;
   4421   }
   4422   // Replace a number of existing ranges from start_pos to end_pos - 1.
   4423   // Move the remaining ranges down.
   4424 
   4425   int new_from = Min(list->at(start_pos).from(), from);
   4426   int new_to = Max(list->at(end_pos - 1).to(), to);
   4427   if (end_pos < count) {
   4428     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
   4429   }
   4430   list->at(start_pos) = CharacterRange(new_from, new_to);
   4431   return count - (end_pos - start_pos) + 1;
   4432 }
   4433 
   4434 
   4435 void CharacterSet::Canonicalize() {
   4436   // Special/default classes are always considered canonical. The result
   4437   // of calling ranges() will be sorted.
   4438   if (ranges_ == NULL) return;
   4439   CharacterRange::Canonicalize(ranges_);
   4440 }
   4441 
   4442 
   4443 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
   4444   if (character_ranges->length() <= 1) return;
   4445   // Check whether ranges are already canonical (increasing, non-overlapping,
   4446   // non-adjacent).
   4447   int n = character_ranges->length();
   4448   int max = character_ranges->at(0).to();
   4449   int i = 1;
   4450   while (i < n) {
   4451     CharacterRange current = character_ranges->at(i);
   4452     if (current.from() <= max + 1) {
   4453       break;
   4454     }
   4455     max = current.to();
   4456     i++;
   4457   }
   4458   // Canonical until the i'th range. If that's all of them, we are done.
   4459   if (i == n) return;
   4460 
   4461   // The ranges at index i and forward are not canonicalized. Make them so by
   4462   // doing the equivalent of insertion sort (inserting each into the previous
   4463   // list, in order).
   4464   // Notice that inserting a range can reduce the number of ranges in the
   4465   // result due to combining of adjacent and overlapping ranges.
   4466   int read = i;  // Range to insert.
   4467   int num_canonical = i;  // Length of canonicalized part of list.
   4468   do {
   4469     num_canonical = InsertRangeInCanonicalList(character_ranges,
   4470                                                num_canonical,
   4471                                                character_ranges->at(read));
   4472     read++;
   4473   } while (read < n);
   4474   character_ranges->Rewind(num_canonical);
   4475 
   4476   ASSERT(CharacterRange::IsCanonical(character_ranges));
   4477 }
   4478 
   4479 
   4480 // Utility function for CharacterRange::Merge. Adds a range at the end of
   4481 // a canonicalized range list, if necessary merging the range with the last
   4482 // range of the list.
   4483 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
   4484   if (set == NULL) return;
   4485   ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
   4486   int n = set->length();
   4487   if (n > 0) {
   4488     CharacterRange lastRange = set->at(n - 1);
   4489     if (lastRange.to() == range.from() - 1) {
   4490       set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
   4491       return;
   4492     }
   4493   }
   4494   set->Add(range);
   4495 }
   4496 
   4497 
   4498 static void AddRangeToSelectedSet(int selector,
   4499                                   ZoneList<CharacterRange>* first_set,
   4500                                   ZoneList<CharacterRange>* second_set,
   4501                                   ZoneList<CharacterRange>* intersection_set,
   4502                                   CharacterRange range) {
   4503   switch (selector) {
   4504     case kInsideFirst:
   4505       AddRangeToSet(first_set, range);
   4506       break;
   4507     case kInsideSecond:
   4508       AddRangeToSet(second_set, range);
   4509       break;
   4510     case kInsideBoth:
   4511       AddRangeToSet(intersection_set, range);
   4512       break;
   4513   }
   4514 }
   4515 
   4516 
   4517 
   4518 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
   4519                            ZoneList<CharacterRange>* second_set,
   4520                            ZoneList<CharacterRange>* first_set_only_out,
   4521                            ZoneList<CharacterRange>* second_set_only_out,
   4522                            ZoneList<CharacterRange>* both_sets_out) {
   4523   // Inputs are canonicalized.
   4524   ASSERT(CharacterRange::IsCanonical(first_set));
   4525   ASSERT(CharacterRange::IsCanonical(second_set));
   4526   // Outputs are empty, if applicable.
   4527   ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
   4528   ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
   4529   ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
   4530 
   4531   // Merge sets by iterating through the lists in order of lowest "from" value,
   4532   // and putting intervals into one of three sets.
   4533 
   4534   if (first_set->length() == 0) {
   4535     second_set_only_out->AddAll(*second_set);
   4536     return;
   4537   }
   4538   if (second_set->length() == 0) {
   4539     first_set_only_out->AddAll(*first_set);
   4540     return;
   4541   }
   4542   // Indices into input lists.
   4543   int i1 = 0;
   4544   int i2 = 0;
   4545   // Cache length of input lists.
   4546   int n1 = first_set->length();
   4547   int n2 = second_set->length();
   4548   // Current range. May be invalid if state is kInsideNone.
   4549   int from = 0;
   4550   int to = -1;
   4551   // Where current range comes from.
   4552   int state = kInsideNone;
   4553 
   4554   while (i1 < n1 || i2 < n2) {
   4555     CharacterRange next_range;
   4556     int range_source;
   4557     if (i2 == n2 ||
   4558         (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
   4559       // Next smallest element is in first set.
   4560       next_range = first_set->at(i1++);
   4561       range_source = kInsideFirst;
   4562     } else {
   4563       // Next smallest element is in second set.
   4564       next_range = second_set->at(i2++);
   4565       range_source = kInsideSecond;
   4566     }
   4567     if (to < next_range.from()) {
   4568       // Ranges disjoint: |current|  |next|
   4569       AddRangeToSelectedSet(state,
   4570                             first_set_only_out,
   4571                             second_set_only_out,
   4572                             both_sets_out,
   4573                             CharacterRange(from, to));
   4574       from = next_range.from();
   4575       to = next_range.to();
   4576       state = range_source;
   4577     } else {
   4578       if (from < next_range.from()) {
   4579         AddRangeToSelectedSet(state,
   4580                               first_set_only_out,
   4581                               second_set_only_out,
   4582                               both_sets_out,
   4583                               CharacterRange(from, next_range.from()-1));
   4584       }
   4585       if (to < next_range.to()) {
   4586         // Ranges overlap:  |current|
   4587         //                       |next|
   4588         AddRangeToSelectedSet(state | range_source,
   4589                               first_set_only_out,
   4590                               second_set_only_out,
   4591                               both_sets_out,
   4592                               CharacterRange(next_range.from(), to));
   4593         from = to + 1;
   4594         to = next_range.to();
   4595         state = range_source;
   4596       } else {
   4597         // Range included:    |current| , possibly ending at same character.
   4598         //                      |next|
   4599         AddRangeToSelectedSet(
   4600             state | range_source,
   4601             first_set_only_out,
   4602             second_set_only_out,
   4603             both_sets_out,
   4604             CharacterRange(next_range.from(), next_range.to()));
   4605         from = next_range.to() + 1;
   4606         // If ranges end at same character, both ranges are consumed completely.
   4607         if (next_range.to() == to) state = kInsideNone;
   4608       }
   4609     }
   4610   }
   4611   AddRangeToSelectedSet(state,
   4612                         first_set_only_out,
   4613                         second_set_only_out,
   4614                         both_sets_out,
   4615                         CharacterRange(from, to));
   4616 }
   4617 
   4618 
   4619 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
   4620                             ZoneList<CharacterRange>* negated_ranges) {
   4621   ASSERT(CharacterRange::IsCanonical(ranges));
   4622   ASSERT_EQ(0, negated_ranges->length());
   4623   int range_count = ranges->length();
   4624   uc16 from = 0;
   4625   int i = 0;
   4626   if (range_count > 0 && ranges->at(0).from() == 0) {
   4627     from = ranges->at(0).to();
   4628     i = 1;
   4629   }
   4630   while (i < range_count) {
   4631     CharacterRange range = ranges->at(i);
   4632     negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
   4633     from = range.to();
   4634     i++;
   4635   }
   4636   if (from < String::kMaxUtf16CodeUnit) {
   4637     negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
   4638   }
   4639 }
   4640 
   4641 
   4642 
   4643 // -------------------------------------------------------------------
   4644 // Interest propagation
   4645 
   4646 
   4647 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
   4648   for (int i = 0; i < siblings_.length(); i++) {
   4649     RegExpNode* sibling = siblings_.Get(i);
   4650     if (sibling->info()->Matches(info))
   4651       return sibling;
   4652   }
   4653   return NULL;
   4654 }
   4655 
   4656 
   4657 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
   4658   ASSERT_EQ(false, *cloned);
   4659   siblings_.Ensure(this);
   4660   RegExpNode* result = TryGetSibling(info);
   4661   if (result != NULL) return result;
   4662   result = this->Clone();
   4663   NodeInfo* new_info = result->info();
   4664   new_info->ResetCompilationState();
   4665   new_info->AddFromPreceding(info);
   4666   AddSibling(result);
   4667   *cloned = true;
   4668   return result;
   4669 }
   4670 
   4671 
   4672 template <class C>
   4673 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
   4674   NodeInfo full_info(*node->info());
   4675   full_info.AddFromPreceding(info);
   4676   bool cloned = false;
   4677   return RegExpNode::EnsureSibling(node, &full_info, &cloned);
   4678 }
   4679 
   4680 
   4681 // -------------------------------------------------------------------
   4682 // Splay tree
   4683 
   4684 
   4685 OutSet* OutSet::Extend(unsigned value) {
   4686   if (Get(value))
   4687     return this;
   4688   if (successors() != NULL) {
   4689     for (int i = 0; i < successors()->length(); i++) {
   4690       OutSet* successor = successors()->at(i);
   4691       if (successor->Get(value))
   4692         return successor;
   4693     }
   4694   } else {
   4695     successors_ = new ZoneList<OutSet*>(2);
   4696   }
   4697   OutSet* result = new OutSet(first_, remaining_);
   4698   result->Set(value);
   4699   successors()->Add(result);
   4700   return result;
   4701 }
   4702 
   4703 
   4704 void OutSet::Set(unsigned value) {
   4705   if (value < kFirstLimit) {
   4706     first_ |= (1 << value);
   4707   } else {
   4708     if (remaining_ == NULL)
   4709       remaining_ = new ZoneList<unsigned>(1);
   4710     if (remaining_->is_empty() || !remaining_->Contains(value))
   4711       remaining_->Add(value);
   4712   }
   4713 }
   4714 
   4715 
   4716 bool OutSet::Get(unsigned value) {
   4717   if (value < kFirstLimit) {
   4718     return (first_ & (1 << value)) != 0;
   4719   } else if (remaining_ == NULL) {
   4720     return false;
   4721   } else {
   4722     return remaining_->Contains(value);
   4723   }
   4724 }
   4725 
   4726 
   4727 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
   4728 
   4729 
   4730 void DispatchTable::AddRange(CharacterRange full_range, int value) {
   4731   CharacterRange current = full_range;
   4732   if (tree()->is_empty()) {
   4733     // If this is the first range we just insert into the table.
   4734     ZoneSplayTree<Config>::Locator loc;
   4735     ASSERT_RESULT(tree()->Insert(current.from(), &loc));
   4736     loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
   4737     return;
   4738   }
   4739   // First see if there is a range to the left of this one that
   4740   // overlaps.
   4741   ZoneSplayTree<Config>::Locator loc;
   4742   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
   4743     Entry* entry = &loc.value();
   4744     // If we've found a range that overlaps with this one, and it
   4745     // starts strictly to the left of this one, we have to fix it
   4746     // because the following code only handles ranges that start on
   4747     // or after the start point of the range we're adding.
   4748     if (entry->from() < current.from() && entry->to() >= current.from()) {
   4749       // Snap the overlapping range in half around the start point of
   4750       // the range we're adding.
   4751       CharacterRange left(entry->from(), current.from() - 1);
   4752       CharacterRange right(current.from(), entry->to());
   4753       // The left part of the overlapping range doesn't overlap.
   4754       // Truncate the whole entry to be just the left part.
   4755       entry->set_to(left.to());
   4756       // The right part is the one that overlaps.  We add this part
   4757       // to the map and let the next step deal with merging it with
   4758       // the range we're adding.
   4759       ZoneSplayTree<Config>::Locator loc;
   4760       ASSERT_RESULT(tree()->Insert(right.from(), &loc));
   4761       loc.set_value(Entry(right.from(),
   4762                           right.to(),
   4763                           entry->out_set()));
   4764     }
   4765   }
   4766   while (current.is_valid()) {
   4767     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
   4768         (loc.value().from() <= current.to()) &&
   4769         (loc.value().to() >= current.from())) {
   4770       Entry* entry = &loc.value();
   4771       // We have overlap.  If there is space between the start point of
   4772       // the range we're adding and where the overlapping range starts
   4773       // then we have to add a range covering just that space.
   4774       if (current.from() < entry->from()) {
   4775         ZoneSplayTree<Config>::Locator ins;
   4776         ASSERT_RESULT(tree()->Insert(current.from(), &ins));
   4777         ins.set_value(Entry(current.from(),
   4778                             entry->from() - 1,
   4779                             empty()->Extend(value)));
   4780         current.set_from(entry->from());
   4781       }
   4782       ASSERT_EQ(current.from(), entry->from());
   4783       // If the overlapping range extends beyond the one we want to add
   4784       // we have to snap the right part off and add it separately.
   4785       if (entry->to() > current.to()) {
   4786         ZoneSplayTree<Config>::Locator ins;
   4787         ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
   4788         ins.set_value(Entry(current.to() + 1,
   4789                             entry->to(),
   4790                             entry->out_set()));
   4791         entry->set_to(current.to());
   4792       }
   4793       ASSERT(entry->to() <= current.to());
   4794       // The overlapping range is now completely contained by the range
   4795       // we're adding so we can just update it and move the start point
   4796       // of the range we're adding just past it.
   4797       entry->AddValue(value);
   4798       // Bail out if the last interval ended at 0xFFFF since otherwise
   4799       // adding 1 will wrap around to 0.
   4800       if (entry->to() == String::kMaxUtf16CodeUnit)
   4801         break;
   4802       ASSERT(entry->to() + 1 > current.from());
   4803       current.set_from(entry->to() + 1);
   4804     } else {
   4805       // There is no overlap so we can just add the range
   4806       ZoneSplayTree<Config>::Locator ins;
   4807       ASSERT_RESULT(tree()->Insert(current.from(), &ins));
   4808       ins.set_value(Entry(current.from(),
   4809                           current.to(),
   4810                           empty()->Extend(value)));
   4811       break;
   4812     }
   4813   }
   4814 }
   4815 
   4816 
   4817 OutSet* DispatchTable::Get(uc16 value) {
   4818   ZoneSplayTree<Config>::Locator loc;
   4819   if (!tree()->FindGreatestLessThan(value, &loc))
   4820     return empty();
   4821   Entry* entry = &loc.value();
   4822   if (value <= entry->to())
   4823     return entry->out_set();
   4824   else
   4825     return empty();
   4826 }
   4827 
   4828 
   4829 // -------------------------------------------------------------------
   4830 // Analysis
   4831 
   4832 
   4833 void Analysis::EnsureAnalyzed(RegExpNode* that) {
   4834   StackLimitCheck check(Isolate::Current());
   4835   if (check.HasOverflowed()) {
   4836     fail("Stack overflow");
   4837     return;
   4838   }
   4839   if (that->info()->been_analyzed || that->info()->being_analyzed)
   4840     return;
   4841   that->info()->being_analyzed = true;
   4842   that->Accept(this);
   4843   that->info()->being_analyzed = false;
   4844   that->info()->been_analyzed = true;
   4845 }
   4846 
   4847 
   4848 void Analysis::VisitEnd(EndNode* that) {
   4849   // nothing to do
   4850 }
   4851 
   4852 
   4853 void TextNode::CalculateOffsets() {
   4854   int element_count = elements()->length();
   4855   // Set up the offsets of the elements relative to the start.  This is a fixed
   4856   // quantity since a TextNode can only contain fixed-width things.
   4857   int cp_offset = 0;
   4858   for (int i = 0; i < element_count; i++) {
   4859     TextElement& elm = elements()->at(i);
   4860     elm.cp_offset = cp_offset;
   4861     if (elm.type == TextElement::ATOM) {
   4862       cp_offset += elm.data.u_atom->data().length();
   4863     } else {
   4864       cp_offset++;
   4865     }
   4866   }
   4867 }
   4868 
   4869 
   4870 void Analysis::VisitText(TextNode* that) {
   4871   if (ignore_case_) {
   4872     that->MakeCaseIndependent(is_ascii_);
   4873   }
   4874   EnsureAnalyzed(that->on_success());
   4875   if (!has_failed()) {
   4876     that->CalculateOffsets();
   4877   }
   4878 }
   4879 
   4880 
   4881 void Analysis::VisitAction(ActionNode* that) {
   4882   RegExpNode* target = that->on_success();
   4883   EnsureAnalyzed(target);
   4884   if (!has_failed()) {
   4885     // If the next node is interested in what it follows then this node
   4886     // has to be interested too so it can pass the information on.
   4887     that->info()->AddFromFollowing(target->info());
   4888   }
   4889 }
   4890 
   4891 
   4892 void Analysis::VisitChoice(ChoiceNode* that) {
   4893   NodeInfo* info = that->info();
   4894   for (int i = 0; i < that->alternatives()->length(); i++) {
   4895     RegExpNode* node = that->alternatives()->at(i).node();
   4896     EnsureAnalyzed(node);
   4897     if (has_failed()) return;
   4898     // Anything the following nodes need to know has to be known by
   4899     // this node also, so it can pass it on.
   4900     info->AddFromFollowing(node->info());
   4901   }
   4902 }
   4903 
   4904 
   4905 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
   4906   NodeInfo* info = that->info();
   4907   for (int i = 0; i < that->alternatives()->length(); i++) {
   4908     RegExpNode* node = that->alternatives()->at(i).node();
   4909     if (node != that->loop_node()) {
   4910       EnsureAnalyzed(node);
   4911       if (has_failed()) return;
   4912       info->AddFromFollowing(node->info());
   4913     }
   4914   }
   4915   // Check the loop last since it may need the value of this node
   4916   // to get a correct result.
   4917   EnsureAnalyzed(that->loop_node());
   4918   if (!has_failed()) {
   4919     info->AddFromFollowing(that->loop_node()->info());
   4920   }
   4921 }
   4922 
   4923 
   4924 void Analysis::VisitBackReference(BackReferenceNode* that) {
   4925   EnsureAnalyzed(that->on_success());
   4926 }
   4927 
   4928 
   4929 void Analysis::VisitAssertion(AssertionNode* that) {
   4930   EnsureAnalyzed(that->on_success());
   4931   AssertionNode::AssertionNodeType type = that->type();
   4932   if (type == AssertionNode::AT_BOUNDARY ||
   4933       type == AssertionNode::AT_NON_BOUNDARY) {
   4934     // Check if the following character is known to be a word character
   4935     // or known to not be a word character.
   4936     ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
   4937 
   4938     CharacterRange::Canonicalize(following_chars);
   4939 
   4940     SetRelation word_relation =
   4941         CharacterRange::WordCharacterRelation(following_chars);
   4942     if (word_relation.Disjoint()) {
   4943       // Includes the case where following_chars is empty (e.g., end-of-input).
   4944       // Following character is definitely *not* a word character.
   4945       type = (type == AssertionNode::AT_BOUNDARY) ?
   4946                  AssertionNode::AFTER_WORD_CHARACTER :
   4947                  AssertionNode::AFTER_NONWORD_CHARACTER;
   4948       that->set_type(type);
   4949     } else if (word_relation.ContainedIn()) {
   4950       // Following character is definitely a word character.
   4951       type = (type == AssertionNode::AT_BOUNDARY) ?
   4952                  AssertionNode::AFTER_NONWORD_CHARACTER :
   4953                  AssertionNode::AFTER_WORD_CHARACTER;
   4954       that->set_type(type);
   4955     }
   4956   }
   4957 }
   4958 
   4959 
   4960 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
   4961   if (first_character_set_ == NULL) {
   4962     if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
   4963       // If we can't find an exact solution within the budget, we
   4964       // set the value to the set of every character, i.e., all characters
   4965       // are possible.
   4966       ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
   4967       all_set->Add(CharacterRange::Everything());
   4968       first_character_set_ = all_set;
   4969     }
   4970   }
   4971   return first_character_set_;
   4972 }
   4973 
   4974 
   4975 int RegExpNode::ComputeFirstCharacterSet(int budget) {
   4976   // Default behavior is to not be able to determine the first character.
   4977   return kComputeFirstCharacterSetFail;
   4978 }
   4979 
   4980 
   4981 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
   4982   budget--;
   4983   if (budget >= 0) {
   4984     // Find loop min-iteration. It's the value of the guarded choice node
   4985     // with a GEQ guard, if any.
   4986     int min_repetition = 0;
   4987 
   4988     for (int i = 0; i <= 1; i++) {
   4989       GuardedAlternative alternative = alternatives()->at(i);
   4990       ZoneList<Guard*>* guards = alternative.guards();
   4991       if (guards != NULL && guards->length() > 0) {
   4992         Guard* guard = guards->at(0);
   4993         if (guard->op() == Guard::GEQ) {
   4994           min_repetition = guard->value();
   4995           break;
   4996         }
   4997       }
   4998     }
   4999 
   5000     budget = loop_node()->ComputeFirstCharacterSet(budget);
   5001     if (budget >= 0) {
   5002       ZoneList<CharacterRange>* character_set =
   5003           loop_node()->first_character_set();
   5004       if (body_can_be_zero_length() || min_repetition == 0) {
   5005         budget = continue_node()->ComputeFirstCharacterSet(budget);
   5006         if (budget < 0) return budget;
   5007         ZoneList<CharacterRange>* body_set =
   5008             continue_node()->first_character_set();
   5009         ZoneList<CharacterRange>* union_set =
   5010           new ZoneList<CharacterRange>(Max(character_set->length(),
   5011                                            body_set->length()));
   5012         CharacterRange::Merge(character_set,
   5013                               body_set,
   5014                               union_set,
   5015                               union_set,
   5016                               union_set);
   5017         character_set = union_set;
   5018       }
   5019       set_first_character_set(character_set);
   5020     }
   5021   }
   5022   return budget;
   5023 }
   5024 
   5025 
   5026 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
   5027   budget--;
   5028   if (budget >= 0) {
   5029     GuardedAlternative successor = this->alternatives()->at(1);
   5030     RegExpNode* successor_node = successor.node();
   5031     budget = successor_node->ComputeFirstCharacterSet(budget);
   5032     if (budget >= 0) {
   5033       set_first_character_set(successor_node->first_character_set());
   5034     }
   5035   }
   5036   return budget;
   5037 }
   5038 
   5039 
   5040 // The first character set of an EndNode is unknowable. Just use the
   5041 // default implementation that fails and returns all characters as possible.
   5042 
   5043 
   5044 int AssertionNode::ComputeFirstCharacterSet(int budget) {
   5045   budget -= 1;
   5046   if (budget >= 0) {
   5047     switch (type_) {
   5048       case AT_END: {
   5049         set_first_character_set(new ZoneList<CharacterRange>(0));
   5050         break;
   5051       }
   5052       case AT_START:
   5053       case AT_BOUNDARY:
   5054       case AT_NON_BOUNDARY:
   5055       case AFTER_NEWLINE:
   5056       case AFTER_NONWORD_CHARACTER:
   5057       case AFTER_WORD_CHARACTER: {
   5058         ASSERT_NOT_NULL(on_success());
   5059         budget = on_success()->ComputeFirstCharacterSet(budget);
   5060         if (budget >= 0) {
   5061           set_first_character_set(on_success()->first_character_set());
   5062         }
   5063         break;
   5064       }
   5065     }
   5066   }
   5067   return budget;
   5068 }
   5069 
   5070 
   5071 int ActionNode::ComputeFirstCharacterSet(int budget) {
   5072   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
   5073   budget--;
   5074   if (budget >= 0) {
   5075     ASSERT_NOT_NULL(on_success());
   5076     budget = on_success()->ComputeFirstCharacterSet(budget);
   5077     if (budget >= 0) {
   5078       set_first_character_set(on_success()->first_character_set());
   5079     }
   5080   }
   5081   return budget;
   5082 }
   5083 
   5084 
   5085 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
   5086   // We don't know anything about the first character of a backreference
   5087   // at this point.
   5088   // The potential first characters are the first characters of the capture,
   5089   // and the first characters of the on_success node, depending on whether the
   5090   // capture can be empty and whether it is known to be participating or known
   5091   // not to be.
   5092   return kComputeFirstCharacterSetFail;
   5093 }
   5094 
   5095 
   5096 int TextNode::ComputeFirstCharacterSet(int budget) {
   5097   budget--;
   5098   if (budget >= 0) {
   5099     ASSERT_NE(0, elements()->length());
   5100     TextElement text = elements()->at(0);
   5101     if (text.type == TextElement::ATOM) {
   5102       RegExpAtom* atom = text.data.u_atom;
   5103       ASSERT_NE(0, atom->length());
   5104       uc16 first_char = atom->data()[0];
   5105       ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
   5106       range->Add(CharacterRange(first_char, first_char));
   5107       set_first_character_set(range);
   5108     } else {
   5109       ASSERT(text.type == TextElement::CHAR_CLASS);
   5110       RegExpCharacterClass* char_class = text.data.u_char_class;
   5111       ZoneList<CharacterRange>* ranges = char_class->ranges();
   5112       // TODO(lrn): Canonicalize ranges when they are created
   5113       // instead of waiting until now.
   5114       CharacterRange::Canonicalize(ranges);
   5115       if (char_class->is_negated()) {
   5116         int length = ranges->length();
   5117         int new_length = length + 1;
   5118         if (length > 0) {
   5119           if (ranges->at(0).from() == 0) new_length--;
   5120           if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
   5121             new_length--;
   5122           }
   5123         }
   5124         ZoneList<CharacterRange>* negated_ranges =
   5125             new ZoneList<CharacterRange>(new_length);
   5126         CharacterRange::Negate(ranges, negated_ranges);
   5127         set_first_character_set(negated_ranges);
   5128       } else {
   5129         set_first_character_set(ranges);
   5130       }
   5131     }
   5132   }
   5133   return budget;
   5134 }
   5135 
   5136 
   5137 
   5138 // -------------------------------------------------------------------
   5139 // Dispatch table construction
   5140 
   5141 
   5142 void DispatchTableConstructor::VisitEnd(EndNode* that) {
   5143   AddRange(CharacterRange::Everything());
   5144 }
   5145 
   5146 
   5147 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
   5148   node->set_being_calculated(true);
   5149   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
   5150   for (int i = 0; i < alternatives->length(); i++) {
   5151     set_choice_index(i);
   5152     alternatives->at(i).node()->Accept(this);
   5153   }
   5154   node->set_being_calculated(false);
   5155 }
   5156 
   5157 
   5158 class AddDispatchRange {
   5159  public:
   5160   explicit AddDispatchRange(DispatchTableConstructor* constructor)
   5161     : constructor_(constructor) { }
   5162   void Call(uc32 from, DispatchTable::Entry entry);
   5163  private:
   5164   DispatchTableConstructor* constructor_;
   5165 };
   5166 
   5167 
   5168 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
   5169   CharacterRange range(from, entry.to());
   5170   constructor_->AddRange(range);
   5171 }
   5172 
   5173 
   5174 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
   5175   if (node->being_calculated())
   5176     return;
   5177   DispatchTable* table = node->GetTable(ignore_case_);
   5178   AddDispatchRange adder(this);
   5179   table->ForEach(&adder);
   5180 }
   5181 
   5182 
   5183 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
   5184   // TODO(160): Find the node that we refer back to and propagate its start
   5185   // set back to here.  For now we just accept anything.
   5186   AddRange(CharacterRange::Everything());
   5187 }
   5188 
   5189 
   5190 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
   5191   RegExpNode* target = that->on_success();
   5192   target->Accept(this);
   5193 }
   5194 
   5195 
   5196 static int CompareRangeByFrom(const CharacterRange* a,
   5197                               const CharacterRange* b) {
   5198   return Compare<uc16>(a->from(), b->from());
   5199 }
   5200 
   5201 
   5202 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
   5203   ranges->Sort(CompareRangeByFrom);
   5204   uc16 last = 0;
   5205   for (int i = 0; i < ranges->length(); i++) {
   5206     CharacterRange range = ranges->at(i);
   5207     if (last < range.from())
   5208       AddRange(CharacterRange(last, range.from() - 1));
   5209     if (range.to() >= last) {
   5210       if (range.to() == String::kMaxUtf16CodeUnit) {
   5211         return;
   5212       } else {
   5213         last = range.to() + 1;
   5214       }
   5215     }
   5216   }
   5217   AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
   5218 }
   5219 
   5220 
   5221 void DispatchTableConstructor::VisitText(TextNode* that) {
   5222   TextElement elm = that->elements()->at(0);
   5223   switch (elm.type) {
   5224     case TextElement::ATOM: {
   5225       uc16 c = elm.data.u_atom->data()[0];
   5226       AddRange(CharacterRange(c, c));
   5227       break;
   5228     }
   5229     case TextElement::CHAR_CLASS: {
   5230       RegExpCharacterClass* tree = elm.data.u_char_class;
   5231       ZoneList<CharacterRange>* ranges = tree->ranges();
   5232       if (tree->is_negated()) {
   5233         AddInverse(ranges);
   5234       } else {
   5235         for (int i = 0; i < ranges->length(); i++)
   5236           AddRange(ranges->at(i));
   5237       }
   5238       break;
   5239     }
   5240     default: {
   5241       UNIMPLEMENTED();
   5242     }
   5243   }
   5244 }
   5245 
   5246 
   5247 void DispatchTableConstructor::VisitAction(ActionNode* that) {
   5248   RegExpNode* target = that->on_success();
   5249   target->Accept(this);
   5250 }
   5251 
   5252 
   5253 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
   5254                                                       bool ignore_case,
   5255                                                       bool is_multiline,
   5256                                                       Handle<String> pattern,
   5257                                                       bool is_ascii) {
   5258   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
   5259     return IrregexpRegExpTooBig();
   5260   }
   5261   RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
   5262   // Wrap the body of the regexp in capture #0.
   5263   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
   5264                                                     0,
   5265                                                     &compiler,
   5266                                                     compiler.accept());
   5267   RegExpNode* node = captured_body;
   5268   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
   5269   bool is_start_anchored = data->tree->IsAnchoredAtStart();
   5270   int max_length = data->tree->max_match();
   5271   if (!is_start_anchored) {
   5272     // Add a .*? at the beginning, outside the body capture, unless
   5273     // this expression is anchored at the beginning.
   5274     RegExpNode* loop_node =
   5275         RegExpQuantifier::ToNode(0,
   5276                                  RegExpTree::kInfinity,
   5277                                  false,
   5278                                  new RegExpCharacterClass('*'),
   5279                                  &compiler,
   5280                                  captured_body,
   5281                                  data->contains_anchor);
   5282 
   5283     if (data->contains_anchor) {
   5284       // Unroll loop once, to take care of the case that might start
   5285       // at the start of input.
   5286       ChoiceNode* first_step_node = new ChoiceNode(2);
   5287       first_step_node->AddAlternative(GuardedAlternative(captured_body));
   5288       first_step_node->AddAlternative(GuardedAlternative(
   5289           new TextNode(new RegExpCharacterClass('*'), loop_node)));
   5290       node = first_step_node;
   5291     } else {
   5292       node = loop_node;
   5293     }
   5294   }
   5295   data->node = node;
   5296   Analysis analysis(ignore_case, is_ascii);
   5297   analysis.EnsureAnalyzed(node);
   5298   if (analysis.has_failed()) {
   5299     const char* error_message = analysis.error_message();
   5300     return CompilationResult(error_message);
   5301   }
   5302 
   5303   // Create the correct assembler for the architecture.
   5304 #ifndef V8_INTERPRETED_REGEXP
   5305   // Native regexp implementation.
   5306 
   5307   NativeRegExpMacroAssembler::Mode mode =
   5308       is_ascii ? NativeRegExpMacroAssembler::ASCII
   5309                : NativeRegExpMacroAssembler::UC16;
   5310 
   5311 #if V8_TARGET_ARCH_IA32
   5312   RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
   5313 #elif V8_TARGET_ARCH_X64
   5314   RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
   5315 #elif V8_TARGET_ARCH_ARM
   5316   RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
   5317 #elif V8_TARGET_ARCH_MIPS
   5318   RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2);
   5319 #endif
   5320 
   5321 #else  // V8_INTERPRETED_REGEXP
   5322   // Interpreted regexp implementation.
   5323   EmbeddedVector<byte, 1024> codes;
   5324   RegExpMacroAssemblerIrregexp macro_assembler(codes);
   5325 #endif  // V8_INTERPRETED_REGEXP
   5326 
   5327   // Inserted here, instead of in Assembler, because it depends on information
   5328   // in the AST that isn't replicated in the Node structure.
   5329   static const int kMaxBacksearchLimit = 1024;
   5330   if (is_end_anchored &&
   5331       !is_start_anchored &&
   5332       max_length < kMaxBacksearchLimit) {
   5333     macro_assembler.SetCurrentPositionFromEnd(max_length);
   5334   }
   5335 
   5336   return compiler.Assemble(&macro_assembler,
   5337                            node,
   5338                            data->capture_count,
   5339                            pattern);
   5340 }
   5341 
   5342 
   5343 }}  // namespace v8::internal
   5344