Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #include "v8.h"
     29 
     30 #include "ast.h"
     31 #include "compiler.h"
     32 #include "execution.h"
     33 #include "factory.h"
     34 #include "jsregexp.h"
     35 #include "platform.h"
     36 #include "string-search.h"
     37 #include "runtime.h"
     38 #include "compilation-cache.h"
     39 #include "string-stream.h"
     40 #include "parser.h"
     41 #include "regexp-macro-assembler.h"
     42 #include "regexp-macro-assembler-tracer.h"
     43 #include "regexp-macro-assembler-irregexp.h"
     44 #include "regexp-stack.h"
     45 
     46 #ifndef V8_INTERPRETED_REGEXP
     47 #if V8_TARGET_ARCH_IA32
     48 #include "ia32/regexp-macro-assembler-ia32.h"
     49 #elif V8_TARGET_ARCH_X64
     50 #include "x64/regexp-macro-assembler-x64.h"
     51 #elif V8_TARGET_ARCH_ARM
     52 #include "arm/regexp-macro-assembler-arm.h"
     53 #elif V8_TARGET_ARCH_MIPS
     54 #include "mips/regexp-macro-assembler-mips.h"
     55 #else
     56 #error Unsupported target architecture.
     57 #endif
     58 #endif
     59 
     60 #include "interpreter-irregexp.h"
     61 
     62 
     63 namespace v8 {
     64 namespace internal {
     65 
     66 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
     67                                                Handle<String> pattern,
     68                                                Handle<String> flags,
     69                                                bool* has_pending_exception) {
     70   // Call the construct code with 2 arguments.
     71   Handle<Object> argv[] = { pattern, flags };
     72   return Execution::New(constructor, ARRAY_SIZE(argv), argv,
     73                         has_pending_exception);
     74 }
     75 
     76 
     77 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
     78   int flags = JSRegExp::NONE;
     79   for (int i = 0; i < str->length(); i++) {
     80     switch (str->Get(i)) {
     81       case 'i':
     82         flags |= JSRegExp::IGNORE_CASE;
     83         break;
     84       case 'g':
     85         flags |= JSRegExp::GLOBAL;
     86         break;
     87       case 'm':
     88         flags |= JSRegExp::MULTILINE;
     89         break;
     90     }
     91   }
     92   return JSRegExp::Flags(flags);
     93 }
     94 
     95 
     96 static inline void ThrowRegExpException(Handle<JSRegExp> re,
     97                                         Handle<String> pattern,
     98                                         Handle<String> error_text,
     99                                         const char* message) {
    100   Isolate* isolate = re->GetIsolate();
    101   Factory* factory = isolate->factory();
    102   Handle<FixedArray> elements = factory->NewFixedArray(2);
    103   elements->set(0, *pattern);
    104   elements->set(1, *error_text);
    105   Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
    106   Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
    107   isolate->Throw(*regexp_err);
    108 }
    109 
    110 
    111 // Generic RegExp methods. Dispatches to implementation specific methods.
    112 
    113 
    114 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
    115                                    Handle<String> pattern,
    116                                    Handle<String> flag_str) {
    117   Isolate* isolate = re->GetIsolate();
    118   JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
    119   CompilationCache* compilation_cache = isolate->compilation_cache();
    120   Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
    121   bool in_cache = !cached.is_null();
    122   LOG(isolate, RegExpCompileEvent(re, in_cache));
    123 
    124   Handle<Object> result;
    125   if (in_cache) {
    126     re->set_data(*cached);
    127     return re;
    128   }
    129   pattern = FlattenGetString(pattern);
    130   ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
    131   PostponeInterruptsScope postpone(isolate);
    132   RegExpCompileData parse_result;
    133   FlatStringReader reader(isolate, pattern);
    134   if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
    135                                  &parse_result)) {
    136     // Throw an exception if we fail to parse the pattern.
    137     ThrowRegExpException(re,
    138                          pattern,
    139                          parse_result.error,
    140                          "malformed_regexp");
    141     return Handle<Object>::null();
    142   }
    143 
    144   if (parse_result.simple && !flags.is_ignore_case()) {
    145     // Parse-tree is a single atom that is equal to the pattern.
    146     AtomCompile(re, pattern, flags, pattern);
    147   } else if (parse_result.tree->IsAtom() &&
    148       !flags.is_ignore_case() &&
    149       parse_result.capture_count == 0) {
    150     RegExpAtom* atom = parse_result.tree->AsAtom();
    151     Vector<const uc16> atom_pattern = atom->data();
    152     Handle<String> atom_string =
    153         isolate->factory()->NewStringFromTwoByte(atom_pattern);
    154     AtomCompile(re, pattern, flags, atom_string);
    155   } else {
    156     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
    157   }
    158   ASSERT(re->data()->IsFixedArray());
    159   // Compilation succeeded so the data is set on the regexp
    160   // and we can store it in the cache.
    161   Handle<FixedArray> data(FixedArray::cast(re->data()));
    162   compilation_cache->PutRegExp(pattern, flags, data);
    163 
    164   return re;
    165 }
    166 
    167 
    168 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
    169                                 Handle<String> subject,
    170                                 int index,
    171                                 Handle<JSArray> last_match_info) {
    172   switch (regexp->TypeTag()) {
    173     case JSRegExp::ATOM:
    174       return AtomExec(regexp, subject, index, last_match_info);
    175     case JSRegExp::IRREGEXP: {
    176       Handle<Object> result =
    177           IrregexpExec(regexp, subject, index, last_match_info);
    178       ASSERT(!result.is_null() ||
    179              regexp->GetIsolate()->has_pending_exception());
    180       return result;
    181     }
    182     default:
    183       UNREACHABLE();
    184       return Handle<Object>::null();
    185   }
    186 }
    187 
    188 
    189 // RegExp Atom implementation: Simple string search using indexOf.
    190 
    191 
    192 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
    193                              Handle<String> pattern,
    194                              JSRegExp::Flags flags,
    195                              Handle<String> match_pattern) {
    196   re->GetIsolate()->factory()->SetRegExpAtomData(re,
    197                                                  JSRegExp::ATOM,
    198                                                  pattern,
    199                                                  flags,
    200                                                  match_pattern);
    201 }
    202 
    203 
    204 static void SetAtomLastCapture(FixedArray* array,
    205                                String* subject,
    206                                int from,
    207                                int to) {
    208   NoHandleAllocation no_handles;
    209   RegExpImpl::SetLastCaptureCount(array, 2);
    210   RegExpImpl::SetLastSubject(array, subject);
    211   RegExpImpl::SetLastInput(array, subject);
    212   RegExpImpl::SetCapture(array, 0, from);
    213   RegExpImpl::SetCapture(array, 1, to);
    214 }
    215 
    216 
    217 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
    218                                     Handle<String> subject,
    219                                     int index,
    220                                     Handle<JSArray> last_match_info) {
    221   Isolate* isolate = re->GetIsolate();
    222 
    223   ASSERT(0 <= index);
    224   ASSERT(index <= subject->length());
    225 
    226   if (!subject->IsFlat()) FlattenString(subject);
    227   AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
    228 
    229   String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
    230   int needle_len = needle->length();
    231   ASSERT(needle->IsFlat());
    232 
    233   if (needle_len != 0) {
    234     if (index + needle_len > subject->length()) {
    235       return isolate->factory()->null_value();
    236     }
    237 
    238     String::FlatContent needle_content = needle->GetFlatContent();
    239     String::FlatContent subject_content = subject->GetFlatContent();
    240     ASSERT(needle_content.IsFlat());
    241     ASSERT(subject_content.IsFlat());
    242     // dispatch on type of strings
    243     index = (needle_content.IsAscii()
    244              ? (subject_content.IsAscii()
    245                 ? SearchString(isolate,
    246                                subject_content.ToAsciiVector(),
    247                                needle_content.ToAsciiVector(),
    248                                index)
    249                 : SearchString(isolate,
    250                                subject_content.ToUC16Vector(),
    251                                needle_content.ToAsciiVector(),
    252                                index))
    253              : (subject_content.IsAscii()
    254                 ? SearchString(isolate,
    255                                subject_content.ToAsciiVector(),
    256                                needle_content.ToUC16Vector(),
    257                                index)
    258                 : SearchString(isolate,
    259                                subject_content.ToUC16Vector(),
    260                                needle_content.ToUC16Vector(),
    261                                index)));
    262     if (index == -1) return isolate->factory()->null_value();
    263   }
    264   ASSERT(last_match_info->HasFastElements());
    265 
    266   {
    267     NoHandleAllocation no_handles;
    268     FixedArray* array = FixedArray::cast(last_match_info->elements());
    269     SetAtomLastCapture(array, *subject, index, index + needle_len);
    270   }
    271   return last_match_info;
    272 }
    273 
    274 
    275 // Irregexp implementation.
    276 
    277 // Ensures that the regexp object contains a compiled version of the
    278 // source for either ASCII or non-ASCII strings.
    279 // If the compiled version doesn't already exist, it is compiled
    280 // from the source pattern.
    281 // If compilation fails, an exception is thrown and this function
    282 // returns false.
    283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
    284   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
    285 #ifdef V8_INTERPRETED_REGEXP
    286   if (compiled_code->IsByteArray()) return true;
    287 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
    288   if (compiled_code->IsCode()) return true;
    289 #endif
    290   // We could potentially have marked this as flushable, but have kept
    291   // a saved version if we did not flush it yet.
    292   Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
    293   if (saved_code->IsCode()) {
    294     // Reinstate the code in the original place.
    295     re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
    296     ASSERT(compiled_code->IsSmi());
    297     return true;
    298   }
    299   return CompileIrregexp(re, is_ascii);
    300 }
    301 
    302 
    303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
    304                                             bool is_ascii,
    305                                             Handle<String> error_message,
    306                                             Isolate* isolate) {
    307   Factory* factory = isolate->factory();
    308   Handle<FixedArray> elements = factory->NewFixedArray(2);
    309   elements->set(0, re->Pattern());
    310   elements->set(1, *error_message);
    311   Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
    312   Handle<Object> regexp_err =
    313       factory->NewSyntaxError("malformed_regexp", array);
    314   isolate->Throw(*regexp_err);
    315   return false;
    316 }
    317 
    318 
    319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
    320   // Compile the RegExp.
    321   Isolate* isolate = re->GetIsolate();
    322   ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
    323   PostponeInterruptsScope postpone(isolate);
    324   // If we had a compilation error the last time this is saved at the
    325   // saved code index.
    326   Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
    327   // When arriving here entry can only be a smi, either representing an
    328   // uncompiled regexp, a previous compilation error, or code that has
    329   // been flushed.
    330   ASSERT(entry->IsSmi());
    331   int entry_value = Smi::cast(entry)->value();
    332   ASSERT(entry_value == JSRegExp::kUninitializedValue ||
    333          entry_value == JSRegExp::kCompilationErrorValue ||
    334          (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
    335 
    336   if (entry_value == JSRegExp::kCompilationErrorValue) {
    337     // A previous compilation failed and threw an error which we store in
    338     // the saved code index (we store the error message, not the actual
    339     // error). Recreate the error object and throw it.
    340     Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
    341     ASSERT(error_string->IsString());
    342     Handle<String> error_message(String::cast(error_string));
    343     CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
    344     return false;
    345   }
    346 
    347   JSRegExp::Flags flags = re->GetFlags();
    348 
    349   Handle<String> pattern(re->Pattern());
    350   if (!pattern->IsFlat()) FlattenString(pattern);
    351   RegExpCompileData compile_data;
    352   FlatStringReader reader(isolate, pattern);
    353   if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
    354                                  &compile_data)) {
    355     // Throw an exception if we fail to parse the pattern.
    356     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
    357     ThrowRegExpException(re,
    358                          pattern,
    359                          compile_data.error,
    360                          "malformed_regexp");
    361     return false;
    362   }
    363   RegExpEngine::CompilationResult result =
    364       RegExpEngine::Compile(&compile_data,
    365                             flags.is_ignore_case(),
    366                             flags.is_multiline(),
    367                             pattern,
    368                             is_ascii);
    369   if (result.error_message != NULL) {
    370     // Unable to compile regexp.
    371     Handle<String> error_message =
    372         isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
    373     CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
    374     return false;
    375   }
    376 
    377   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
    378   data->set(JSRegExp::code_index(is_ascii), result.code);
    379   int register_max = IrregexpMaxRegisterCount(*data);
    380   if (result.num_registers > register_max) {
    381     SetIrregexpMaxRegisterCount(*data, result.num_registers);
    382   }
    383 
    384   return true;
    385 }
    386 
    387 
    388 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
    389   return Smi::cast(
    390       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    391 }
    392 
    393 
    394 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
    395   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
    396 }
    397 
    398 
    399 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
    400   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
    401 }
    402 
    403 
    404 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
    405   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    406 }
    407 
    408 
    409 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
    410   return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
    411 }
    412 
    413 
    414 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
    415   return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
    416 }
    417 
    418 
    419 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
    420                                     Handle<String> pattern,
    421                                     JSRegExp::Flags flags,
    422                                     int capture_count) {
    423   // Initialize compiled code entries to null.
    424   re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
    425                                                      JSRegExp::IRREGEXP,
    426                                                      pattern,
    427                                                      flags,
    428                                                      capture_count);
    429 }
    430 
    431 
    432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
    433                                 Handle<String> subject) {
    434   if (!subject->IsFlat()) FlattenString(subject);
    435 
    436   // Check the asciiness of the underlying storage.
    437   bool is_ascii = subject->IsAsciiRepresentationUnderneath();
    438   if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
    439 
    440 #ifdef V8_INTERPRETED_REGEXP
    441   // Byte-code regexp needs space allocated for all its registers.
    442   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
    443 #else  // V8_INTERPRETED_REGEXP
    444   // Native regexp only needs room to output captures. Registers are handled
    445   // internally.
    446   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
    447 #endif  // V8_INTERPRETED_REGEXP
    448 }
    449 
    450 
    451 RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
    452     Handle<JSRegExp> regexp,
    453     Handle<String> subject,
    454     int index,
    455     Vector<int> output) {
    456   Isolate* isolate = regexp->GetIsolate();
    457 
    458   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
    459 
    460   ASSERT(index >= 0);
    461   ASSERT(index <= subject->length());
    462   ASSERT(subject->IsFlat());
    463 
    464   bool is_ascii = subject->IsAsciiRepresentationUnderneath();
    465 
    466 #ifndef V8_INTERPRETED_REGEXP
    467   ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
    468   do {
    469     EnsureCompiledIrregexp(regexp, is_ascii);
    470     Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
    471     NativeRegExpMacroAssembler::Result res =
    472         NativeRegExpMacroAssembler::Match(code,
    473                                           subject,
    474                                           output.start(),
    475                                           output.length(),
    476                                           index,
    477                                           isolate);
    478     if (res != NativeRegExpMacroAssembler::RETRY) {
    479       ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
    480              isolate->has_pending_exception());
    481       STATIC_ASSERT(
    482           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
    483       STATIC_ASSERT(
    484           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
    485       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
    486                     == RE_EXCEPTION);
    487       return static_cast<IrregexpResult>(res);
    488     }
    489     // If result is RETRY, the string has changed representation, and we
    490     // must restart from scratch.
    491     // In this case, it means we must make sure we are prepared to handle
    492     // the, potentially, different subject (the string can switch between
    493     // being internal and external, and even between being ASCII and UC16,
    494     // but the characters are always the same).
    495     IrregexpPrepare(regexp, subject);
    496     is_ascii = subject->IsAsciiRepresentationUnderneath();
    497   } while (true);
    498   UNREACHABLE();
    499   return RE_EXCEPTION;
    500 #else  // V8_INTERPRETED_REGEXP
    501 
    502   ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
    503   // We must have done EnsureCompiledIrregexp, so we can get the number of
    504   // registers.
    505   int* register_vector = output.start();
    506   int number_of_capture_registers =
    507       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
    508   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
    509     register_vector[i] = -1;
    510   }
    511   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
    512 
    513   IrregexpResult result = IrregexpInterpreter::Match(isolate,
    514                                                      byte_codes,
    515                                                      subject,
    516                                                      register_vector,
    517                                                      index);
    518   if (result == RE_EXCEPTION) {
    519     ASSERT(!isolate->has_pending_exception());
    520     isolate->StackOverflow();
    521   }
    522   return result;
    523 #endif  // V8_INTERPRETED_REGEXP
    524 }
    525 
    526 
    527 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
    528                                         Handle<String> subject,
    529                                         int previous_index,
    530                                         Handle<JSArray> last_match_info) {
    531   Isolate* isolate = jsregexp->GetIsolate();
    532   ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
    533 
    534   // Prepare space for the return values.
    535 #ifdef V8_INTERPRETED_REGEXP
    536 #ifdef DEBUG
    537   if (FLAG_trace_regexp_bytecodes) {
    538     String* pattern = jsregexp->Pattern();
    539     PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
    540     PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
    541   }
    542 #endif
    543 #endif
    544   int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
    545   if (required_registers < 0) {
    546     // Compiling failed with an exception.
    547     ASSERT(isolate->has_pending_exception());
    548     return Handle<Object>::null();
    549   }
    550 
    551   OffsetsVector registers(required_registers, isolate);
    552 
    553   IrregexpResult res = RegExpImpl::IrregexpExecOnce(
    554       jsregexp, subject, previous_index, Vector<int>(registers.vector(),
    555                                                      registers.length()));
    556   if (res == RE_SUCCESS) {
    557     int capture_register_count =
    558         (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
    559     last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
    560     AssertNoAllocation no_gc;
    561     int* register_vector = registers.vector();
    562     FixedArray* array = FixedArray::cast(last_match_info->elements());
    563     for (int i = 0; i < capture_register_count; i += 2) {
    564       SetCapture(array, i, register_vector[i]);
    565       SetCapture(array, i + 1, register_vector[i + 1]);
    566     }
    567     SetLastCaptureCount(array, capture_register_count);
    568     SetLastSubject(array, *subject);
    569     SetLastInput(array, *subject);
    570     return last_match_info;
    571   }
    572   if (res == RE_EXCEPTION) {
    573     ASSERT(isolate->has_pending_exception());
    574     return Handle<Object>::null();
    575   }
    576   ASSERT(res == RE_FAILURE);
    577   return isolate->factory()->null_value();
    578 }
    579 
    580 
    581 // -------------------------------------------------------------------
    582 // Implementation of the Irregexp regular expression engine.
    583 //
    584 // The Irregexp regular expression engine is intended to be a complete
    585 // implementation of ECMAScript regular expressions.  It generates either
    586 // bytecodes or native code.
    587 
    588 //   The Irregexp regexp engine is structured in three steps.
    589 //   1) The parser generates an abstract syntax tree.  See ast.cc.
    590 //   2) From the AST a node network is created.  The nodes are all
    591 //      subclasses of RegExpNode.  The nodes represent states when
    592 //      executing a regular expression.  Several optimizations are
    593 //      performed on the node network.
    594 //   3) From the nodes we generate either byte codes or native code
    595 //      that can actually execute the regular expression (perform
    596 //      the search).  The code generation step is described in more
    597 //      detail below.
    598 
    599 // Code generation.
    600 //
    601 //   The nodes are divided into four main categories.
    602 //   * Choice nodes
    603 //        These represent places where the regular expression can
    604 //        match in more than one way.  For example on entry to an
    605 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
    606 //   * Action nodes
    607 //        These represent places where some action should be
    608 //        performed.  Examples include recording the current position
    609 //        in the input string to a register (in order to implement
    610 //        captures) or other actions on register for example in order
    611 //        to implement the counters needed for {} repetitions.
    612 //   * Matching nodes
    613 //        These attempt to match some element part of the input string.
    614 //        Examples of elements include character classes, plain strings
    615 //        or back references.
    616 //   * End nodes
    617 //        These are used to implement the actions required on finding
    618 //        a successful match or failing to find a match.
    619 //
    620 //   The code generated (whether as byte codes or native code) maintains
    621 //   some state as it runs.  This consists of the following elements:
    622 //
    623 //   * The capture registers.  Used for string captures.
    624 //   * Other registers.  Used for counters etc.
    625 //   * The current position.
    626 //   * The stack of backtracking information.  Used when a matching node
    627 //     fails to find a match and needs to try an alternative.
    628 //
    629 // Conceptual regular expression execution model:
    630 //
    631 //   There is a simple conceptual model of regular expression execution
    632 //   which will be presented first.  The actual code generated is a more
    633 //   efficient simulation of the simple conceptual model:
    634 //
    635 //   * Choice nodes are implemented as follows:
    636 //     For each choice except the last {
    637 //       push current position
    638 //       push backtrack code location
    639 //       <generate code to test for choice>
    640 //       backtrack code location:
    641 //       pop current position
    642 //     }
    643 //     <generate code to test for last choice>
    644 //
    645 //   * Actions nodes are generated as follows
    646 //     <push affected registers on backtrack stack>
    647 //     <generate code to perform action>
    648 //     push backtrack code location
    649 //     <generate code to test for following nodes>
    650 //     backtrack code location:
    651 //     <pop affected registers to restore their state>
    652 //     <pop backtrack location from stack and go to it>
    653 //
    654 //   * Matching nodes are generated as follows:
    655 //     if input string matches at current position
    656 //       update current position
    657 //       <generate code to test for following nodes>
    658 //     else
    659 //       <pop backtrack location from stack and go to it>
    660 //
    661 //   Thus it can be seen that the current position is saved and restored
    662 //   by the choice nodes, whereas the registers are saved and restored by
    663 //   by the action nodes that manipulate them.
    664 //
    665 //   The other interesting aspect of this model is that nodes are generated
    666 //   at the point where they are needed by a recursive call to Emit().  If
    667 //   the node has already been code generated then the Emit() call will
    668 //   generate a jump to the previously generated code instead.  In order to
    669 //   limit recursion it is possible for the Emit() function to put the node
    670 //   on a work list for later generation and instead generate a jump.  The
    671 //   destination of the jump is resolved later when the code is generated.
    672 //
    673 // Actual regular expression code generation.
    674 //
    675 //   Code generation is actually more complicated than the above.  In order
    676 //   to improve the efficiency of the generated code some optimizations are
    677 //   performed
    678 //
    679 //   * Choice nodes have 1-character lookahead.
    680 //     A choice node looks at the following character and eliminates some of
    681 //     the choices immediately based on that character.  This is not yet
    682 //     implemented.
    683 //   * Simple greedy loops store reduced backtracking information.
    684 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
    685 //     then need to backtrack to a point where it can match "foo".  The naive
    686 //     implementation of this would push each character position onto the
    687 //     backtracking stack, then pop them off one by one.  This would use space
    688 //     proportional to the length of the input string.  However since the "."
    689 //     can only match in one way and always has a constant length (in this case
    690 //     of 1) it suffices to store the current position on the top of the stack
    691 //     once.  Matching now becomes merely incrementing the current position and
    692 //     backtracking becomes decrementing the current position and checking the
    693 //     result against the stored current position.  This is faster and saves
    694 //     space.
    695 //   * The current state is virtualized.
    696 //     This is used to defer expensive operations until it is clear that they
    697 //     are needed and to generate code for a node more than once, allowing
    698 //     specialized an efficient versions of the code to be created. This is
    699 //     explained in the section below.
    700 //
    701 // Execution state virtualization.
    702 //
    703 //   Instead of emitting code, nodes that manipulate the state can record their
    704 //   manipulation in an object called the Trace.  The Trace object can record a
    705 //   current position offset, an optional backtrack code location on the top of
    706 //   the virtualized backtrack stack and some register changes.  When a node is
    707 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
    708 //   will emit code to bring the actual state into line with the virtual state.
    709 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
    710 //   registers).  Postponing work can save time when executing the regular
    711 //   expression since it may be found that the work never has to be done as a
    712 //   failure to match can occur.  In addition it is much faster to jump to a
    713 //   known backtrack code location than it is to pop an unknown backtrack
    714 //   location from the stack and jump there.
    715 //
    716 //   The virtual state found in the Trace affects code generation.  For example
    717 //   the virtual state contains the difference between the actual current
    718 //   position and the virtual current position, and matching code needs to use
    719 //   this offset to attempt a match in the correct location of the input
    720 //   string.  Therefore code generated for a non-trivial trace is specialized
    721 //   to that trace.  The code generator therefore has the ability to generate
    722 //   code for each node several times.  In order to limit the size of the
    723 //   generated code there is an arbitrary limit on how many specialized sets of
    724 //   code may be generated for a given node.  If the limit is reached, the
    725 //   trace is flushed and a generic version of the code for a node is emitted.
    726 //   This is subsequently used for that node.  The code emitted for non-generic
    727 //   trace is not recorded in the node and so it cannot currently be reused in
    728 //   the event that code generation is requested for an identical trace.
    729 
    730 
    731 void RegExpTree::AppendToText(RegExpText* text) {
    732   UNREACHABLE();
    733 }
    734 
    735 
    736 void RegExpAtom::AppendToText(RegExpText* text) {
    737   text->AddElement(TextElement::Atom(this));
    738 }
    739 
    740 
    741 void RegExpCharacterClass::AppendToText(RegExpText* text) {
    742   text->AddElement(TextElement::CharClass(this));
    743 }
    744 
    745 
    746 void RegExpText::AppendToText(RegExpText* text) {
    747   for (int i = 0; i < elements()->length(); i++)
    748     text->AddElement(elements()->at(i));
    749 }
    750 
    751 
    752 TextElement TextElement::Atom(RegExpAtom* atom) {
    753   TextElement result = TextElement(ATOM);
    754   result.data.u_atom = atom;
    755   return result;
    756 }
    757 
    758 
    759 TextElement TextElement::CharClass(
    760       RegExpCharacterClass* char_class) {
    761   TextElement result = TextElement(CHAR_CLASS);
    762   result.data.u_char_class = char_class;
    763   return result;
    764 }
    765 
    766 
    767 int TextElement::length() {
    768   if (type == ATOM) {
    769     return data.u_atom->length();
    770   } else {
    771     ASSERT(type == CHAR_CLASS);
    772     return 1;
    773   }
    774 }
    775 
    776 
    777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
    778   if (table_ == NULL) {
    779     table_ = new DispatchTable();
    780     DispatchTableConstructor cons(table_, ignore_case);
    781     cons.BuildTable(this);
    782   }
    783   return table_;
    784 }
    785 
    786 
    787 class RegExpCompiler {
    788  public:
    789   RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
    790 
    791   int AllocateRegister() {
    792     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
    793       reg_exp_too_big_ = true;
    794       return next_register_;
    795     }
    796     return next_register_++;
    797   }
    798 
    799   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
    800                                            RegExpNode* start,
    801                                            int capture_count,
    802                                            Handle<String> pattern);
    803 
    804   inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
    805 
    806   static const int kImplementationOffset = 0;
    807   static const int kNumberOfRegistersOffset = 0;
    808   static const int kCodeOffset = 1;
    809 
    810   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
    811   EndNode* accept() { return accept_; }
    812 
    813   static const int kMaxRecursion = 100;
    814   inline int recursion_depth() { return recursion_depth_; }
    815   inline void IncrementRecursionDepth() { recursion_depth_++; }
    816   inline void DecrementRecursionDepth() { recursion_depth_--; }
    817 
    818   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
    819 
    820   inline bool ignore_case() { return ignore_case_; }
    821   inline bool ascii() { return ascii_; }
    822 
    823   int current_expansion_factor() { return current_expansion_factor_; }
    824   void set_current_expansion_factor(int value) {
    825     current_expansion_factor_ = value;
    826   }
    827 
    828   static const int kNoRegister = -1;
    829 
    830  private:
    831   EndNode* accept_;
    832   int next_register_;
    833   List<RegExpNode*>* work_list_;
    834   int recursion_depth_;
    835   RegExpMacroAssembler* macro_assembler_;
    836   bool ignore_case_;
    837   bool ascii_;
    838   bool reg_exp_too_big_;
    839   int current_expansion_factor_;
    840 };
    841 
    842 
    843 class RecursionCheck {
    844  public:
    845   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
    846     compiler->IncrementRecursionDepth();
    847   }
    848   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
    849  private:
    850   RegExpCompiler* compiler_;
    851 };
    852 
    853 
    854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
    855   return RegExpEngine::CompilationResult("RegExp too big");
    856 }
    857 
    858 
    859 // Attempts to compile the regexp using an Irregexp code generator.  Returns
    860 // a fixed array or a null handle depending on whether it succeeded.
    861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
    862     : next_register_(2 * (capture_count + 1)),
    863       work_list_(NULL),
    864       recursion_depth_(0),
    865       ignore_case_(ignore_case),
    866       ascii_(ascii),
    867       reg_exp_too_big_(false),
    868       current_expansion_factor_(1) {
    869   accept_ = new EndNode(EndNode::ACCEPT);
    870   ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
    871 }
    872 
    873 
    874 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
    875     RegExpMacroAssembler* macro_assembler,
    876     RegExpNode* start,
    877     int capture_count,
    878     Handle<String> pattern) {
    879   Heap* heap = pattern->GetHeap();
    880 
    881   bool use_slow_safe_regexp_compiler = false;
    882   if (heap->total_regexp_code_generated() >
    883           RegExpImpl::kRegWxpCompiledLimit &&
    884       heap->isolate()->memory_allocator()->SizeExecutable() >
    885           RegExpImpl::kRegExpExecutableMemoryLimit) {
    886     use_slow_safe_regexp_compiler = true;
    887   }
    888 
    889   macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
    890 
    891 #ifdef DEBUG
    892   if (FLAG_trace_regexp_assembler)
    893     macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
    894   else
    895 #endif
    896     macro_assembler_ = macro_assembler;
    897 
    898   List <RegExpNode*> work_list(0);
    899   work_list_ = &work_list;
    900   Label fail;
    901   macro_assembler_->PushBacktrack(&fail);
    902   Trace new_trace;
    903   start->Emit(this, &new_trace);
    904   macro_assembler_->Bind(&fail);
    905   macro_assembler_->Fail();
    906   while (!work_list.is_empty()) {
    907     work_list.RemoveLast()->Emit(this, &new_trace);
    908   }
    909   if (reg_exp_too_big_) return IrregexpRegExpTooBig();
    910 
    911   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
    912   heap->IncreaseTotalRegexpCodeGenerated(code->Size());
    913   work_list_ = NULL;
    914 #ifdef DEBUG
    915   if (FLAG_print_code) {
    916     Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
    917   }
    918   if (FLAG_trace_regexp_assembler) {
    919     delete macro_assembler_;
    920   }
    921 #endif
    922   return RegExpEngine::CompilationResult(*code, next_register_);
    923 }
    924 
    925 
    926 bool Trace::DeferredAction::Mentions(int that) {
    927   if (type() == ActionNode::CLEAR_CAPTURES) {
    928     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
    929     return range.Contains(that);
    930   } else {
    931     return reg() == that;
    932   }
    933 }
    934 
    935 
    936 bool Trace::mentions_reg(int reg) {
    937   for (DeferredAction* action = actions_;
    938        action != NULL;
    939        action = action->next()) {
    940     if (action->Mentions(reg))
    941       return true;
    942   }
    943   return false;
    944 }
    945 
    946 
    947 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
    948   ASSERT_EQ(0, *cp_offset);
    949   for (DeferredAction* action = actions_;
    950        action != NULL;
    951        action = action->next()) {
    952     if (action->Mentions(reg)) {
    953       if (action->type() == ActionNode::STORE_POSITION) {
    954         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
    955         return true;
    956       } else {
    957         return false;
    958       }
    959     }
    960   }
    961   return false;
    962 }
    963 
    964 
    965 int Trace::FindAffectedRegisters(OutSet* affected_registers) {
    966   int max_register = RegExpCompiler::kNoRegister;
    967   for (DeferredAction* action = actions_;
    968        action != NULL;
    969        action = action->next()) {
    970     if (action->type() == ActionNode::CLEAR_CAPTURES) {
    971       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
    972       for (int i = range.from(); i <= range.to(); i++)
    973         affected_registers->Set(i);
    974       if (range.to() > max_register) max_register = range.to();
    975     } else {
    976       affected_registers->Set(action->reg());
    977       if (action->reg() > max_register) max_register = action->reg();
    978     }
    979   }
    980   return max_register;
    981 }
    982 
    983 
    984 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
    985                                      int max_register,
    986                                      OutSet& registers_to_pop,
    987                                      OutSet& registers_to_clear) {
    988   for (int reg = max_register; reg >= 0; reg--) {
    989     if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
    990     else if (registers_to_clear.Get(reg)) {
    991       int clear_to = reg;
    992       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
    993         reg--;
    994       }
    995       assembler->ClearRegisters(reg, clear_to);
    996     }
    997   }
    998 }
    999 
   1000 
   1001 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
   1002                                    int max_register,
   1003                                    OutSet& affected_registers,
   1004                                    OutSet* registers_to_pop,
   1005                                    OutSet* registers_to_clear) {
   1006   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
   1007   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
   1008 
   1009   // Count pushes performed to force a stack limit check occasionally.
   1010   int pushes = 0;
   1011 
   1012   for (int reg = 0; reg <= max_register; reg++) {
   1013     if (!affected_registers.Get(reg)) {
   1014       continue;
   1015     }
   1016 
   1017     // The chronologically first deferred action in the trace
   1018     // is used to infer the action needed to restore a register
   1019     // to its previous state (or not, if it's safe to ignore it).
   1020     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
   1021     DeferredActionUndoType undo_action = IGNORE;
   1022 
   1023     int value = 0;
   1024     bool absolute = false;
   1025     bool clear = false;
   1026     int store_position = -1;
   1027     // This is a little tricky because we are scanning the actions in reverse
   1028     // historical order (newest first).
   1029     for (DeferredAction* action = actions_;
   1030          action != NULL;
   1031          action = action->next()) {
   1032       if (action->Mentions(reg)) {
   1033         switch (action->type()) {
   1034           case ActionNode::SET_REGISTER: {
   1035             Trace::DeferredSetRegister* psr =
   1036                 static_cast<Trace::DeferredSetRegister*>(action);
   1037             if (!absolute) {
   1038               value += psr->value();
   1039               absolute = true;
   1040             }
   1041             // SET_REGISTER is currently only used for newly introduced loop
   1042             // counters. They can have a significant previous value if they
   1043             // occour in a loop. TODO(lrn): Propagate this information, so
   1044             // we can set undo_action to IGNORE if we know there is no value to
   1045             // restore.
   1046             undo_action = RESTORE;
   1047             ASSERT_EQ(store_position, -1);
   1048             ASSERT(!clear);
   1049             break;
   1050           }
   1051           case ActionNode::INCREMENT_REGISTER:
   1052             if (!absolute) {
   1053               value++;
   1054             }
   1055             ASSERT_EQ(store_position, -1);
   1056             ASSERT(!clear);
   1057             undo_action = RESTORE;
   1058             break;
   1059           case ActionNode::STORE_POSITION: {
   1060             Trace::DeferredCapture* pc =
   1061                 static_cast<Trace::DeferredCapture*>(action);
   1062             if (!clear && store_position == -1) {
   1063               store_position = pc->cp_offset();
   1064             }
   1065 
   1066             // For captures we know that stores and clears alternate.
   1067             // Other register, are never cleared, and if the occur
   1068             // inside a loop, they might be assigned more than once.
   1069             if (reg <= 1) {
   1070               // Registers zero and one, aka "capture zero", is
   1071               // always set correctly if we succeed. There is no
   1072               // need to undo a setting on backtrack, because we
   1073               // will set it again or fail.
   1074               undo_action = IGNORE;
   1075             } else {
   1076               undo_action = pc->is_capture() ? CLEAR : RESTORE;
   1077             }
   1078             ASSERT(!absolute);
   1079             ASSERT_EQ(value, 0);
   1080             break;
   1081           }
   1082           case ActionNode::CLEAR_CAPTURES: {
   1083             // Since we're scanning in reverse order, if we've already
   1084             // set the position we have to ignore historically earlier
   1085             // clearing operations.
   1086             if (store_position == -1) {
   1087               clear = true;
   1088             }
   1089             undo_action = RESTORE;
   1090             ASSERT(!absolute);
   1091             ASSERT_EQ(value, 0);
   1092             break;
   1093           }
   1094           default:
   1095             UNREACHABLE();
   1096             break;
   1097         }
   1098       }
   1099     }
   1100     // Prepare for the undo-action (e.g., push if it's going to be popped).
   1101     if (undo_action == RESTORE) {
   1102       pushes++;
   1103       RegExpMacroAssembler::StackCheckFlag stack_check =
   1104           RegExpMacroAssembler::kNoStackLimitCheck;
   1105       if (pushes == push_limit) {
   1106         stack_check = RegExpMacroAssembler::kCheckStackLimit;
   1107         pushes = 0;
   1108       }
   1109 
   1110       assembler->PushRegister(reg, stack_check);
   1111       registers_to_pop->Set(reg);
   1112     } else if (undo_action == CLEAR) {
   1113       registers_to_clear->Set(reg);
   1114     }
   1115     // Perform the chronologically last action (or accumulated increment)
   1116     // for the register.
   1117     if (store_position != -1) {
   1118       assembler->WriteCurrentPositionToRegister(reg, store_position);
   1119     } else if (clear) {
   1120       assembler->ClearRegisters(reg, reg);
   1121     } else if (absolute) {
   1122       assembler->SetRegister(reg, value);
   1123     } else if (value != 0) {
   1124       assembler->AdvanceRegister(reg, value);
   1125     }
   1126   }
   1127 }
   1128 
   1129 
   1130 // This is called as we come into a loop choice node and some other tricky
   1131 // nodes.  It normalizes the state of the code generator to ensure we can
   1132 // generate generic code.
   1133 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
   1134   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1135 
   1136   ASSERT(!is_trivial());
   1137 
   1138   if (actions_ == NULL && backtrack() == NULL) {
   1139     // Here we just have some deferred cp advances to fix and we are back to
   1140     // a normal situation.  We may also have to forget some information gained
   1141     // through a quick check that was already performed.
   1142     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
   1143     // Create a new trivial state and generate the node with that.
   1144     Trace new_state;
   1145     successor->Emit(compiler, &new_state);
   1146     return;
   1147   }
   1148 
   1149   // Generate deferred actions here along with code to undo them again.
   1150   OutSet affected_registers;
   1151 
   1152   if (backtrack() != NULL) {
   1153     // Here we have a concrete backtrack location.  These are set up by choice
   1154     // nodes and so they indicate that we have a deferred save of the current
   1155     // position which we may need to emit here.
   1156     assembler->PushCurrentPosition();
   1157   }
   1158 
   1159   int max_register = FindAffectedRegisters(&affected_registers);
   1160   OutSet registers_to_pop;
   1161   OutSet registers_to_clear;
   1162   PerformDeferredActions(assembler,
   1163                          max_register,
   1164                          affected_registers,
   1165                          &registers_to_pop,
   1166                          &registers_to_clear);
   1167   if (cp_offset_ != 0) {
   1168     assembler->AdvanceCurrentPosition(cp_offset_);
   1169   }
   1170 
   1171   // Create a new trivial state and generate the node with that.
   1172   Label undo;
   1173   assembler->PushBacktrack(&undo);
   1174   Trace new_state;
   1175   successor->Emit(compiler, &new_state);
   1176 
   1177   // On backtrack we need to restore state.
   1178   assembler->Bind(&undo);
   1179   RestoreAffectedRegisters(assembler,
   1180                            max_register,
   1181                            registers_to_pop,
   1182                            registers_to_clear);
   1183   if (backtrack() == NULL) {
   1184     assembler->Backtrack();
   1185   } else {
   1186     assembler->PopCurrentPosition();
   1187     assembler->GoTo(backtrack());
   1188   }
   1189 }
   1190 
   1191 
   1192 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
   1193   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1194 
   1195   // Omit flushing the trace. We discard the entire stack frame anyway.
   1196 
   1197   if (!label()->is_bound()) {
   1198     // We are completely independent of the trace, since we ignore it,
   1199     // so this code can be used as the generic version.
   1200     assembler->Bind(label());
   1201   }
   1202 
   1203   // Throw away everything on the backtrack stack since the start
   1204   // of the negative submatch and restore the character position.
   1205   assembler->ReadCurrentPositionFromRegister(current_position_register_);
   1206   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
   1207   if (clear_capture_count_ > 0) {
   1208     // Clear any captures that might have been performed during the success
   1209     // of the body of the negative look-ahead.
   1210     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
   1211     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
   1212   }
   1213   // Now that we have unwound the stack we find at the top of the stack the
   1214   // backtrack that the BeginSubmatch node got.
   1215   assembler->Backtrack();
   1216 }
   1217 
   1218 
   1219 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   1220   if (!trace->is_trivial()) {
   1221     trace->Flush(compiler, this);
   1222     return;
   1223   }
   1224   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1225   if (!label()->is_bound()) {
   1226     assembler->Bind(label());
   1227   }
   1228   switch (action_) {
   1229     case ACCEPT:
   1230       assembler->Succeed();
   1231       return;
   1232     case BACKTRACK:
   1233       assembler->GoTo(trace->backtrack());
   1234       return;
   1235     case NEGATIVE_SUBMATCH_SUCCESS:
   1236       // This case is handled in a different virtual method.
   1237       UNREACHABLE();
   1238   }
   1239   UNIMPLEMENTED();
   1240 }
   1241 
   1242 
   1243 void GuardedAlternative::AddGuard(Guard* guard) {
   1244   if (guards_ == NULL)
   1245     guards_ = new ZoneList<Guard*>(1);
   1246   guards_->Add(guard);
   1247 }
   1248 
   1249 
   1250 ActionNode* ActionNode::SetRegister(int reg,
   1251                                     int val,
   1252                                     RegExpNode* on_success) {
   1253   ActionNode* result = new ActionNode(SET_REGISTER, on_success);
   1254   result->data_.u_store_register.reg = reg;
   1255   result->data_.u_store_register.value = val;
   1256   return result;
   1257 }
   1258 
   1259 
   1260 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
   1261   ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
   1262   result->data_.u_increment_register.reg = reg;
   1263   return result;
   1264 }
   1265 
   1266 
   1267 ActionNode* ActionNode::StorePosition(int reg,
   1268                                       bool is_capture,
   1269                                       RegExpNode* on_success) {
   1270   ActionNode* result = new ActionNode(STORE_POSITION, on_success);
   1271   result->data_.u_position_register.reg = reg;
   1272   result->data_.u_position_register.is_capture = is_capture;
   1273   return result;
   1274 }
   1275 
   1276 
   1277 ActionNode* ActionNode::ClearCaptures(Interval range,
   1278                                       RegExpNode* on_success) {
   1279   ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
   1280   result->data_.u_clear_captures.range_from = range.from();
   1281   result->data_.u_clear_captures.range_to = range.to();
   1282   return result;
   1283 }
   1284 
   1285 
   1286 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
   1287                                       int position_reg,
   1288                                       RegExpNode* on_success) {
   1289   ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
   1290   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1291   result->data_.u_submatch.current_position_register = position_reg;
   1292   return result;
   1293 }
   1294 
   1295 
   1296 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
   1297                                                 int position_reg,
   1298                                                 int clear_register_count,
   1299                                                 int clear_register_from,
   1300                                                 RegExpNode* on_success) {
   1301   ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
   1302   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1303   result->data_.u_submatch.current_position_register = position_reg;
   1304   result->data_.u_submatch.clear_register_count = clear_register_count;
   1305   result->data_.u_submatch.clear_register_from = clear_register_from;
   1306   return result;
   1307 }
   1308 
   1309 
   1310 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
   1311                                         int repetition_register,
   1312                                         int repetition_limit,
   1313                                         RegExpNode* on_success) {
   1314   ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
   1315   result->data_.u_empty_match_check.start_register = start_register;
   1316   result->data_.u_empty_match_check.repetition_register = repetition_register;
   1317   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
   1318   return result;
   1319 }
   1320 
   1321 
   1322 #define DEFINE_ACCEPT(Type)                                          \
   1323   void Type##Node::Accept(NodeVisitor* visitor) {                    \
   1324     visitor->Visit##Type(this);                                      \
   1325   }
   1326 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
   1327 #undef DEFINE_ACCEPT
   1328 
   1329 
   1330 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
   1331   visitor->VisitLoopChoice(this);
   1332 }
   1333 
   1334 
   1335 // -------------------------------------------------------------------
   1336 // Emit code.
   1337 
   1338 
   1339 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
   1340                                Guard* guard,
   1341                                Trace* trace) {
   1342   switch (guard->op()) {
   1343     case Guard::LT:
   1344       ASSERT(!trace->mentions_reg(guard->reg()));
   1345       macro_assembler->IfRegisterGE(guard->reg(),
   1346                                     guard->value(),
   1347                                     trace->backtrack());
   1348       break;
   1349     case Guard::GEQ:
   1350       ASSERT(!trace->mentions_reg(guard->reg()));
   1351       macro_assembler->IfRegisterLT(guard->reg(),
   1352                                     guard->value(),
   1353                                     trace->backtrack());
   1354       break;
   1355   }
   1356 }
   1357 
   1358 
   1359 // Returns the number of characters in the equivalence class, omitting those
   1360 // that cannot occur in the source string because it is ASCII.
   1361 static int GetCaseIndependentLetters(Isolate* isolate,
   1362                                      uc16 character,
   1363                                      bool ascii_subject,
   1364                                      unibrow::uchar* letters) {
   1365   int length =
   1366       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
   1367   // Unibrow returns 0 or 1 for characters where case independence is
   1368   // trivial.
   1369   if (length == 0) {
   1370     letters[0] = character;
   1371     length = 1;
   1372   }
   1373   if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
   1374     return length;
   1375   }
   1376   // The standard requires that non-ASCII characters cannot have ASCII
   1377   // character codes in their equivalence class.
   1378   return 0;
   1379 }
   1380 
   1381 
   1382 static inline bool EmitSimpleCharacter(Isolate* isolate,
   1383                                        RegExpCompiler* compiler,
   1384                                        uc16 c,
   1385                                        Label* on_failure,
   1386                                        int cp_offset,
   1387                                        bool check,
   1388                                        bool preloaded) {
   1389   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1390   bool bound_checked = false;
   1391   if (!preloaded) {
   1392     assembler->LoadCurrentCharacter(
   1393         cp_offset,
   1394         on_failure,
   1395         check);
   1396     bound_checked = true;
   1397   }
   1398   assembler->CheckNotCharacter(c, on_failure);
   1399   return bound_checked;
   1400 }
   1401 
   1402 
   1403 // Only emits non-letters (things that don't have case).  Only used for case
   1404 // independent matches.
   1405 static inline bool EmitAtomNonLetter(Isolate* isolate,
   1406                                      RegExpCompiler* compiler,
   1407                                      uc16 c,
   1408                                      Label* on_failure,
   1409                                      int cp_offset,
   1410                                      bool check,
   1411                                      bool preloaded) {
   1412   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1413   bool ascii = compiler->ascii();
   1414   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1415   int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
   1416   if (length < 1) {
   1417     // This can't match.  Must be an ASCII subject and a non-ASCII character.
   1418     // We do not need to do anything since the ASCII pass already handled this.
   1419     return false;  // Bounds not checked.
   1420   }
   1421   bool checked = false;
   1422   // We handle the length > 1 case in a later pass.
   1423   if (length == 1) {
   1424     if (ascii && c > String::kMaxAsciiCharCodeU) {
   1425       // Can't match - see above.
   1426       return false;  // Bounds not checked.
   1427     }
   1428     if (!preloaded) {
   1429       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1430       checked = check;
   1431     }
   1432     macro_assembler->CheckNotCharacter(c, on_failure);
   1433   }
   1434   return checked;
   1435 }
   1436 
   1437 
   1438 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
   1439                                       bool ascii,
   1440                                       uc16 c1,
   1441                                       uc16 c2,
   1442                                       Label* on_failure) {
   1443   uc16 char_mask;
   1444   if (ascii) {
   1445     char_mask = String::kMaxAsciiCharCode;
   1446   } else {
   1447     char_mask = String::kMaxUtf16CodeUnit;
   1448   }
   1449   uc16 exor = c1 ^ c2;
   1450   // Check whether exor has only one bit set.
   1451   if (((exor - 1) & exor) == 0) {
   1452     // If c1 and c2 differ only by one bit.
   1453     // Ecma262UnCanonicalize always gives the highest number last.
   1454     ASSERT(c2 > c1);
   1455     uc16 mask = char_mask ^ exor;
   1456     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
   1457     return true;
   1458   }
   1459   ASSERT(c2 > c1);
   1460   uc16 diff = c2 - c1;
   1461   if (((diff - 1) & diff) == 0 && c1 >= diff) {
   1462     // If the characters differ by 2^n but don't differ by one bit then
   1463     // subtract the difference from the found character, then do the or
   1464     // trick.  We avoid the theoretical case where negative numbers are
   1465     // involved in order to simplify code generation.
   1466     uc16 mask = char_mask ^ diff;
   1467     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
   1468                                                     diff,
   1469                                                     mask,
   1470                                                     on_failure);
   1471     return true;
   1472   }
   1473   return false;
   1474 }
   1475 
   1476 
   1477 typedef bool EmitCharacterFunction(Isolate* isolate,
   1478                                    RegExpCompiler* compiler,
   1479                                    uc16 c,
   1480                                    Label* on_failure,
   1481                                    int cp_offset,
   1482                                    bool check,
   1483                                    bool preloaded);
   1484 
   1485 // Only emits letters (things that have case).  Only used for case independent
   1486 // matches.
   1487 static inline bool EmitAtomLetter(Isolate* isolate,
   1488                                   RegExpCompiler* compiler,
   1489                                   uc16 c,
   1490                                   Label* on_failure,
   1491                                   int cp_offset,
   1492                                   bool check,
   1493                                   bool preloaded) {
   1494   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1495   bool ascii = compiler->ascii();
   1496   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1497   int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
   1498   if (length <= 1) return false;
   1499   // We may not need to check against the end of the input string
   1500   // if this character lies before a character that matched.
   1501   if (!preloaded) {
   1502     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1503   }
   1504   Label ok;
   1505   ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
   1506   switch (length) {
   1507     case 2: {
   1508       if (ShortCutEmitCharacterPair(macro_assembler,
   1509                                     ascii,
   1510                                     chars[0],
   1511                                     chars[1],
   1512                                     on_failure)) {
   1513       } else {
   1514         macro_assembler->CheckCharacter(chars[0], &ok);
   1515         macro_assembler->CheckNotCharacter(chars[1], on_failure);
   1516         macro_assembler->Bind(&ok);
   1517       }
   1518       break;
   1519     }
   1520     case 4:
   1521       macro_assembler->CheckCharacter(chars[3], &ok);
   1522       // Fall through!
   1523     case 3:
   1524       macro_assembler->CheckCharacter(chars[0], &ok);
   1525       macro_assembler->CheckCharacter(chars[1], &ok);
   1526       macro_assembler->CheckNotCharacter(chars[2], on_failure);
   1527       macro_assembler->Bind(&ok);
   1528       break;
   1529     default:
   1530       UNREACHABLE();
   1531       break;
   1532   }
   1533   return true;
   1534 }
   1535 
   1536 
   1537 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
   1538                           RegExpCharacterClass* cc,
   1539                           bool ascii,
   1540                           Label* on_failure,
   1541                           int cp_offset,
   1542                           bool check_offset,
   1543                           bool preloaded) {
   1544   ZoneList<CharacterRange>* ranges = cc->ranges();
   1545   int max_char;
   1546   if (ascii) {
   1547     max_char = String::kMaxAsciiCharCode;
   1548   } else {
   1549     max_char = String::kMaxUtf16CodeUnit;
   1550   }
   1551 
   1552   Label success;
   1553 
   1554   Label* char_is_in_class =
   1555       cc->is_negated() ? on_failure : &success;
   1556 
   1557   int range_count = ranges->length();
   1558 
   1559   int last_valid_range = range_count - 1;
   1560   while (last_valid_range >= 0) {
   1561     CharacterRange& range = ranges->at(last_valid_range);
   1562     if (range.from() <= max_char) {
   1563       break;
   1564     }
   1565     last_valid_range--;
   1566   }
   1567 
   1568   if (last_valid_range < 0) {
   1569     if (!cc->is_negated()) {
   1570       // TODO(plesner): We can remove this when the node level does our
   1571       // ASCII optimizations for us.
   1572       macro_assembler->GoTo(on_failure);
   1573     }
   1574     if (check_offset) {
   1575       macro_assembler->CheckPosition(cp_offset, on_failure);
   1576     }
   1577     return;
   1578   }
   1579 
   1580   if (last_valid_range == 0 &&
   1581       !cc->is_negated() &&
   1582       ranges->at(0).IsEverything(max_char)) {
   1583     // This is a common case hit by non-anchored expressions.
   1584     if (check_offset) {
   1585       macro_assembler->CheckPosition(cp_offset, on_failure);
   1586     }
   1587     return;
   1588   }
   1589 
   1590   if (!preloaded) {
   1591     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
   1592   }
   1593 
   1594   if (cc->is_standard() &&
   1595         macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
   1596                                                     on_failure)) {
   1597       return;
   1598   }
   1599 
   1600   for (int i = 0; i < last_valid_range; i++) {
   1601     CharacterRange& range = ranges->at(i);
   1602     Label next_range;
   1603     uc16 from = range.from();
   1604     uc16 to = range.to();
   1605     if (from > max_char) {
   1606       continue;
   1607     }
   1608     if (to > max_char) to = max_char;
   1609     if (to == from) {
   1610       macro_assembler->CheckCharacter(to, char_is_in_class);
   1611     } else {
   1612       if (from != 0) {
   1613         macro_assembler->CheckCharacterLT(from, &next_range);
   1614       }
   1615       if (to != max_char) {
   1616         macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
   1617       } else {
   1618         macro_assembler->GoTo(char_is_in_class);
   1619       }
   1620     }
   1621     macro_assembler->Bind(&next_range);
   1622   }
   1623 
   1624   CharacterRange& range = ranges->at(last_valid_range);
   1625   uc16 from = range.from();
   1626   uc16 to = range.to();
   1627 
   1628   if (to > max_char) to = max_char;
   1629   ASSERT(to >= from);
   1630 
   1631   if (to == from) {
   1632     if (cc->is_negated()) {
   1633       macro_assembler->CheckCharacter(to, on_failure);
   1634     } else {
   1635       macro_assembler->CheckNotCharacter(to, on_failure);
   1636     }
   1637   } else {
   1638     if (from != 0) {
   1639       if (cc->is_negated()) {
   1640         macro_assembler->CheckCharacterLT(from, &success);
   1641       } else {
   1642         macro_assembler->CheckCharacterLT(from, on_failure);
   1643       }
   1644     }
   1645     if (to != String::kMaxUtf16CodeUnit) {
   1646       if (cc->is_negated()) {
   1647         macro_assembler->CheckCharacterLT(to + 1, on_failure);
   1648       } else {
   1649         macro_assembler->CheckCharacterGT(to, on_failure);
   1650       }
   1651     } else {
   1652       if (cc->is_negated()) {
   1653         macro_assembler->GoTo(on_failure);
   1654       }
   1655     }
   1656   }
   1657   macro_assembler->Bind(&success);
   1658 }
   1659 
   1660 
   1661 RegExpNode::~RegExpNode() {
   1662 }
   1663 
   1664 
   1665 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
   1666                                                   Trace* trace) {
   1667   // If we are generating a greedy loop then don't stop and don't reuse code.
   1668   if (trace->stop_node() != NULL) {
   1669     return CONTINUE;
   1670   }
   1671 
   1672   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1673   if (trace->is_trivial()) {
   1674     if (label_.is_bound()) {
   1675       // We are being asked to generate a generic version, but that's already
   1676       // been done so just go to it.
   1677       macro_assembler->GoTo(&label_);
   1678       return DONE;
   1679     }
   1680     if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
   1681       // To avoid too deep recursion we push the node to the work queue and just
   1682       // generate a goto here.
   1683       compiler->AddWork(this);
   1684       macro_assembler->GoTo(&label_);
   1685       return DONE;
   1686     }
   1687     // Generate generic version of the node and bind the label for later use.
   1688     macro_assembler->Bind(&label_);
   1689     return CONTINUE;
   1690   }
   1691 
   1692   // We are being asked to make a non-generic version.  Keep track of how many
   1693   // non-generic versions we generate so as not to overdo it.
   1694   trace_count_++;
   1695   if (FLAG_regexp_optimization &&
   1696       trace_count_ < kMaxCopiesCodeGenerated &&
   1697       compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
   1698     return CONTINUE;
   1699   }
   1700 
   1701   // If we get here code has been generated for this node too many times or
   1702   // recursion is too deep.  Time to switch to a generic version.  The code for
   1703   // generic versions above can handle deep recursion properly.
   1704   trace->Flush(compiler, this);
   1705   return DONE;
   1706 }
   1707 
   1708 
   1709 int ActionNode::EatsAtLeast(int still_to_find,
   1710                             int recursion_depth,
   1711                             bool not_at_start) {
   1712   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1713   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
   1714   return on_success()->EatsAtLeast(still_to_find,
   1715                                    recursion_depth + 1,
   1716                                    not_at_start);
   1717 }
   1718 
   1719 
   1720 int AssertionNode::EatsAtLeast(int still_to_find,
   1721                                int recursion_depth,
   1722                                bool not_at_start) {
   1723   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1724   // If we know we are not at the start and we are asked "how many characters
   1725   // will you match if you succeed?" then we can answer anything since false
   1726   // implies false.  So lets just return the max answer (still_to_find) since
   1727   // that won't prevent us from preloading a lot of characters for the other
   1728   // branches in the node graph.
   1729   if (type() == AT_START && not_at_start) return still_to_find;
   1730   return on_success()->EatsAtLeast(still_to_find,
   1731                                    recursion_depth + 1,
   1732                                    not_at_start);
   1733 }
   1734 
   1735 
   1736 int BackReferenceNode::EatsAtLeast(int still_to_find,
   1737                                    int recursion_depth,
   1738                                    bool not_at_start) {
   1739   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1740   return on_success()->EatsAtLeast(still_to_find,
   1741                                    recursion_depth + 1,
   1742                                    not_at_start);
   1743 }
   1744 
   1745 
   1746 int TextNode::EatsAtLeast(int still_to_find,
   1747                           int recursion_depth,
   1748                           bool not_at_start) {
   1749   int answer = Length();
   1750   if (answer >= still_to_find) return answer;
   1751   if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
   1752   // We are not at start after this node so we set the last argument to 'true'.
   1753   return answer + on_success()->EatsAtLeast(still_to_find - answer,
   1754                                             recursion_depth + 1,
   1755                                             true);
   1756 }
   1757 
   1758 
   1759 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
   1760                                              int recursion_depth,
   1761                                              bool not_at_start) {
   1762   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1763   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   1764   // afterwards.
   1765   RegExpNode* node = alternatives_->at(1).node();
   1766   return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
   1767 }
   1768 
   1769 
   1770 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
   1771     QuickCheckDetails* details,
   1772     RegExpCompiler* compiler,
   1773     int filled_in,
   1774     bool not_at_start) {
   1775   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   1776   // afterwards.
   1777   RegExpNode* node = alternatives_->at(1).node();
   1778   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
   1779 }
   1780 
   1781 
   1782 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
   1783                                   int recursion_depth,
   1784                                   RegExpNode* ignore_this_node,
   1785                                   bool not_at_start) {
   1786   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1787   int min = 100;
   1788   int choice_count = alternatives_->length();
   1789   for (int i = 0; i < choice_count; i++) {
   1790     RegExpNode* node = alternatives_->at(i).node();
   1791     if (node == ignore_this_node) continue;
   1792     int node_eats_at_least = node->EatsAtLeast(still_to_find,
   1793                                                recursion_depth + 1,
   1794                                                not_at_start);
   1795     if (node_eats_at_least < min) min = node_eats_at_least;
   1796   }
   1797   return min;
   1798 }
   1799 
   1800 
   1801 int LoopChoiceNode::EatsAtLeast(int still_to_find,
   1802                                 int recursion_depth,
   1803                                 bool not_at_start) {
   1804   return EatsAtLeastHelper(still_to_find,
   1805                            recursion_depth,
   1806                            loop_node_,
   1807                            not_at_start);
   1808 }
   1809 
   1810 
   1811 int ChoiceNode::EatsAtLeast(int still_to_find,
   1812                             int recursion_depth,
   1813                             bool not_at_start) {
   1814   return EatsAtLeastHelper(still_to_find,
   1815                            recursion_depth,
   1816                            NULL,
   1817                            not_at_start);
   1818 }
   1819 
   1820 
   1821 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
   1822 static inline uint32_t SmearBitsRight(uint32_t v) {
   1823   v |= v >> 1;
   1824   v |= v >> 2;
   1825   v |= v >> 4;
   1826   v |= v >> 8;
   1827   v |= v >> 16;
   1828   return v;
   1829 }
   1830 
   1831 
   1832 bool QuickCheckDetails::Rationalize(bool asc) {
   1833   bool found_useful_op = false;
   1834   uint32_t char_mask;
   1835   if (asc) {
   1836     char_mask = String::kMaxAsciiCharCode;
   1837   } else {
   1838     char_mask = String::kMaxUtf16CodeUnit;
   1839   }
   1840   mask_ = 0;
   1841   value_ = 0;
   1842   int char_shift = 0;
   1843   for (int i = 0; i < characters_; i++) {
   1844     Position* pos = &positions_[i];
   1845     if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
   1846       found_useful_op = true;
   1847     }
   1848     mask_ |= (pos->mask & char_mask) << char_shift;
   1849     value_ |= (pos->value & char_mask) << char_shift;
   1850     char_shift += asc ? 8 : 16;
   1851   }
   1852   return found_useful_op;
   1853 }
   1854 
   1855 
   1856 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
   1857                                 Trace* trace,
   1858                                 bool preload_has_checked_bounds,
   1859                                 Label* on_possible_success,
   1860                                 QuickCheckDetails* details,
   1861                                 bool fall_through_on_failure) {
   1862   if (details->characters() == 0) return false;
   1863   GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
   1864   if (details->cannot_match()) return false;
   1865   if (!details->Rationalize(compiler->ascii())) return false;
   1866   ASSERT(details->characters() == 1 ||
   1867          compiler->macro_assembler()->CanReadUnaligned());
   1868   uint32_t mask = details->mask();
   1869   uint32_t value = details->value();
   1870 
   1871   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1872 
   1873   if (trace->characters_preloaded() != details->characters()) {
   1874     assembler->LoadCurrentCharacter(trace->cp_offset(),
   1875                                     trace->backtrack(),
   1876                                     !preload_has_checked_bounds,
   1877                                     details->characters());
   1878   }
   1879 
   1880 
   1881   bool need_mask = true;
   1882 
   1883   if (details->characters() == 1) {
   1884     // If number of characters preloaded is 1 then we used a byte or 16 bit
   1885     // load so the value is already masked down.
   1886     uint32_t char_mask;
   1887     if (compiler->ascii()) {
   1888       char_mask = String::kMaxAsciiCharCode;
   1889     } else {
   1890       char_mask = String::kMaxUtf16CodeUnit;
   1891     }
   1892     if ((mask & char_mask) == char_mask) need_mask = false;
   1893     mask &= char_mask;
   1894   } else {
   1895     // For 2-character preloads in ASCII mode or 1-character preloads in
   1896     // TWO_BYTE mode we also use a 16 bit load with zero extend.
   1897     if (details->characters() == 2 && compiler->ascii()) {
   1898       if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
   1899     } else if (details->characters() == 1 && !compiler->ascii()) {
   1900       if ((mask & 0xffff) == 0xffff) need_mask = false;
   1901     } else {
   1902       if (mask == 0xffffffff) need_mask = false;
   1903     }
   1904   }
   1905 
   1906   if (fall_through_on_failure) {
   1907     if (need_mask) {
   1908       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
   1909     } else {
   1910       assembler->CheckCharacter(value, on_possible_success);
   1911     }
   1912   } else {
   1913     if (need_mask) {
   1914       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
   1915     } else {
   1916       assembler->CheckNotCharacter(value, trace->backtrack());
   1917     }
   1918   }
   1919   return true;
   1920 }
   1921 
   1922 
   1923 // Here is the meat of GetQuickCheckDetails (see also the comment on the
   1924 // super-class in the .h file).
   1925 //
   1926 // We iterate along the text object, building up for each character a
   1927 // mask and value that can be used to test for a quick failure to match.
   1928 // The masks and values for the positions will be combined into a single
   1929 // machine word for the current character width in order to be used in
   1930 // generating a quick check.
   1931 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1932                                     RegExpCompiler* compiler,
   1933                                     int characters_filled_in,
   1934                                     bool not_at_start) {
   1935   Isolate* isolate = Isolate::Current();
   1936   ASSERT(characters_filled_in < details->characters());
   1937   int characters = details->characters();
   1938   int char_mask;
   1939   if (compiler->ascii()) {
   1940     char_mask = String::kMaxAsciiCharCode;
   1941   } else {
   1942     char_mask = String::kMaxUtf16CodeUnit;
   1943   }
   1944   for (int k = 0; k < elms_->length(); k++) {
   1945     TextElement elm = elms_->at(k);
   1946     if (elm.type == TextElement::ATOM) {
   1947       Vector<const uc16> quarks = elm.data.u_atom->data();
   1948       for (int i = 0; i < characters && i < quarks.length(); i++) {
   1949         QuickCheckDetails::Position* pos =
   1950             details->positions(characters_filled_in);
   1951         uc16 c = quarks[i];
   1952         if (c > char_mask) {
   1953           // If we expect a non-ASCII character from an ASCII string,
   1954           // there is no way we can match. Not even case independent
   1955           // matching can turn an ASCII character into non-ASCII or
   1956           // vice versa.
   1957           details->set_cannot_match();
   1958           pos->determines_perfectly = false;
   1959           return;
   1960         }
   1961         if (compiler->ignore_case()) {
   1962           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1963           int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
   1964                                                  chars);
   1965           ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
   1966           if (length == 1) {
   1967             // This letter has no case equivalents, so it's nice and simple
   1968             // and the mask-compare will determine definitely whether we have
   1969             // a match at this character position.
   1970             pos->mask = char_mask;
   1971             pos->value = c;
   1972             pos->determines_perfectly = true;
   1973           } else {
   1974             uint32_t common_bits = char_mask;
   1975             uint32_t bits = chars[0];
   1976             for (int j = 1; j < length; j++) {
   1977               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
   1978               common_bits ^= differing_bits;
   1979               bits &= common_bits;
   1980             }
   1981             // If length is 2 and common bits has only one zero in it then
   1982             // our mask and compare instruction will determine definitely
   1983             // whether we have a match at this character position.  Otherwise
   1984             // it can only be an approximate check.
   1985             uint32_t one_zero = (common_bits | ~char_mask);
   1986             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
   1987               pos->determines_perfectly = true;
   1988             }
   1989             pos->mask = common_bits;
   1990             pos->value = bits;
   1991           }
   1992         } else {
   1993           // Don't ignore case.  Nice simple case where the mask-compare will
   1994           // determine definitely whether we have a match at this character
   1995           // position.
   1996           pos->mask = char_mask;
   1997           pos->value = c;
   1998           pos->determines_perfectly = true;
   1999         }
   2000         characters_filled_in++;
   2001         ASSERT(characters_filled_in <= details->characters());
   2002         if (characters_filled_in == details->characters()) {
   2003           return;
   2004         }
   2005       }
   2006     } else {
   2007       QuickCheckDetails::Position* pos =
   2008           details->positions(characters_filled_in);
   2009       RegExpCharacterClass* tree = elm.data.u_char_class;
   2010       ZoneList<CharacterRange>* ranges = tree->ranges();
   2011       if (tree->is_negated()) {
   2012         // A quick check uses multi-character mask and compare.  There is no
   2013         // useful way to incorporate a negative char class into this scheme
   2014         // so we just conservatively create a mask and value that will always
   2015         // succeed.
   2016         pos->mask = 0;
   2017         pos->value = 0;
   2018       } else {
   2019         int first_range = 0;
   2020         while (ranges->at(first_range).from() > char_mask) {
   2021           first_range++;
   2022           if (first_range == ranges->length()) {
   2023             details->set_cannot_match();
   2024             pos->determines_perfectly = false;
   2025             return;
   2026           }
   2027         }
   2028         CharacterRange range = ranges->at(first_range);
   2029         uc16 from = range.from();
   2030         uc16 to = range.to();
   2031         if (to > char_mask) {
   2032           to = char_mask;
   2033         }
   2034         uint32_t differing_bits = (from ^ to);
   2035         // A mask and compare is only perfect if the differing bits form a
   2036         // number like 00011111 with one single block of trailing 1s.
   2037         if ((differing_bits & (differing_bits + 1)) == 0 &&
   2038              from + differing_bits == to) {
   2039           pos->determines_perfectly = true;
   2040         }
   2041         uint32_t common_bits = ~SmearBitsRight(differing_bits);
   2042         uint32_t bits = (from & common_bits);
   2043         for (int i = first_range + 1; i < ranges->length(); i++) {
   2044           CharacterRange range = ranges->at(i);
   2045           uc16 from = range.from();
   2046           uc16 to = range.to();
   2047           if (from > char_mask) continue;
   2048           if (to > char_mask) to = char_mask;
   2049           // Here we are combining more ranges into the mask and compare
   2050           // value.  With each new range the mask becomes more sparse and
   2051           // so the chances of a false positive rise.  A character class
   2052           // with multiple ranges is assumed never to be equivalent to a
   2053           // mask and compare operation.
   2054           pos->determines_perfectly = false;
   2055           uint32_t new_common_bits = (from ^ to);
   2056           new_common_bits = ~SmearBitsRight(new_common_bits);
   2057           common_bits &= new_common_bits;
   2058           bits &= new_common_bits;
   2059           uint32_t differing_bits = (from & common_bits) ^ bits;
   2060           common_bits ^= differing_bits;
   2061           bits &= common_bits;
   2062         }
   2063         pos->mask = common_bits;
   2064         pos->value = bits;
   2065       }
   2066       characters_filled_in++;
   2067       ASSERT(characters_filled_in <= details->characters());
   2068       if (characters_filled_in == details->characters()) {
   2069         return;
   2070       }
   2071     }
   2072   }
   2073   ASSERT(characters_filled_in != details->characters());
   2074   on_success()-> GetQuickCheckDetails(details,
   2075                                       compiler,
   2076                                       characters_filled_in,
   2077                                       true);
   2078 }
   2079 
   2080 
   2081 void QuickCheckDetails::Clear() {
   2082   for (int i = 0; i < characters_; i++) {
   2083     positions_[i].mask = 0;
   2084     positions_[i].value = 0;
   2085     positions_[i].determines_perfectly = false;
   2086   }
   2087   characters_ = 0;
   2088 }
   2089 
   2090 
   2091 void QuickCheckDetails::Advance(int by, bool ascii) {
   2092   ASSERT(by >= 0);
   2093   if (by >= characters_) {
   2094     Clear();
   2095     return;
   2096   }
   2097   for (int i = 0; i < characters_ - by; i++) {
   2098     positions_[i] = positions_[by + i];
   2099   }
   2100   for (int i = characters_ - by; i < characters_; i++) {
   2101     positions_[i].mask = 0;
   2102     positions_[i].value = 0;
   2103     positions_[i].determines_perfectly = false;
   2104   }
   2105   characters_ -= by;
   2106   // We could change mask_ and value_ here but we would never advance unless
   2107   // they had already been used in a check and they won't be used again because
   2108   // it would gain us nothing.  So there's no point.
   2109 }
   2110 
   2111 
   2112 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
   2113   ASSERT(characters_ == other->characters_);
   2114   if (other->cannot_match_) {
   2115     return;
   2116   }
   2117   if (cannot_match_) {
   2118     *this = *other;
   2119     return;
   2120   }
   2121   for (int i = from_index; i < characters_; i++) {
   2122     QuickCheckDetails::Position* pos = positions(i);
   2123     QuickCheckDetails::Position* other_pos = other->positions(i);
   2124     if (pos->mask != other_pos->mask ||
   2125         pos->value != other_pos->value ||
   2126         !other_pos->determines_perfectly) {
   2127       // Our mask-compare operation will be approximate unless we have the
   2128       // exact same operation on both sides of the alternation.
   2129       pos->determines_perfectly = false;
   2130     }
   2131     pos->mask &= other_pos->mask;
   2132     pos->value &= pos->mask;
   2133     other_pos->value &= pos->mask;
   2134     uc16 differing_bits = (pos->value ^ other_pos->value);
   2135     pos->mask &= ~differing_bits;
   2136     pos->value &= pos->mask;
   2137   }
   2138 }
   2139 
   2140 
   2141 class VisitMarker {
   2142  public:
   2143   explicit VisitMarker(NodeInfo* info) : info_(info) {
   2144     ASSERT(!info->visited);
   2145     info->visited = true;
   2146   }
   2147   ~VisitMarker() {
   2148     info_->visited = false;
   2149   }
   2150  private:
   2151   NodeInfo* info_;
   2152 };
   2153 
   2154 
   2155 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2156                                           RegExpCompiler* compiler,
   2157                                           int characters_filled_in,
   2158                                           bool not_at_start) {
   2159   if (body_can_be_zero_length_ || info()->visited) return;
   2160   VisitMarker marker(info());
   2161   return ChoiceNode::GetQuickCheckDetails(details,
   2162                                           compiler,
   2163                                           characters_filled_in,
   2164                                           not_at_start);
   2165 }
   2166 
   2167 
   2168 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2169                                       RegExpCompiler* compiler,
   2170                                       int characters_filled_in,
   2171                                       bool not_at_start) {
   2172   not_at_start = (not_at_start || not_at_start_);
   2173   int choice_count = alternatives_->length();
   2174   ASSERT(choice_count > 0);
   2175   alternatives_->at(0).node()->GetQuickCheckDetails(details,
   2176                                                     compiler,
   2177                                                     characters_filled_in,
   2178                                                     not_at_start);
   2179   for (int i = 1; i < choice_count; i++) {
   2180     QuickCheckDetails new_details(details->characters());
   2181     RegExpNode* node = alternatives_->at(i).node();
   2182     node->GetQuickCheckDetails(&new_details, compiler,
   2183                                characters_filled_in,
   2184                                not_at_start);
   2185     // Here we merge the quick match details of the two branches.
   2186     details->Merge(&new_details, characters_filled_in);
   2187   }
   2188 }
   2189 
   2190 
   2191 // Check for [0-9A-Z_a-z].
   2192 static void EmitWordCheck(RegExpMacroAssembler* assembler,
   2193                           Label* word,
   2194                           Label* non_word,
   2195                           bool fall_through_on_word) {
   2196   if (assembler->CheckSpecialCharacterClass(
   2197           fall_through_on_word ? 'w' : 'W',
   2198           fall_through_on_word ? non_word : word)) {
   2199     // Optimized implementation available.
   2200     return;
   2201   }
   2202   assembler->CheckCharacterGT('z', non_word);
   2203   assembler->CheckCharacterLT('0', non_word);
   2204   assembler->CheckCharacterGT('a' - 1, word);
   2205   assembler->CheckCharacterLT('9' + 1, word);
   2206   assembler->CheckCharacterLT('A', non_word);
   2207   assembler->CheckCharacterLT('Z' + 1, word);
   2208   if (fall_through_on_word) {
   2209     assembler->CheckNotCharacter('_', non_word);
   2210   } else {
   2211     assembler->CheckCharacter('_', word);
   2212   }
   2213 }
   2214 
   2215 
   2216 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
   2217 // that matches newline or the start of input).
   2218 static void EmitHat(RegExpCompiler* compiler,
   2219                     RegExpNode* on_success,
   2220                     Trace* trace) {
   2221   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2222   // We will be loading the previous character into the current character
   2223   // register.
   2224   Trace new_trace(*trace);
   2225   new_trace.InvalidateCurrentCharacter();
   2226 
   2227   Label ok;
   2228   if (new_trace.cp_offset() == 0) {
   2229     // The start of input counts as a newline in this context, so skip to
   2230     // ok if we are at the start.
   2231     assembler->CheckAtStart(&ok);
   2232   }
   2233   // We already checked that we are not at the start of input so it must be
   2234   // OK to load the previous character.
   2235   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
   2236                                   new_trace.backtrack(),
   2237                                   false);
   2238   if (!assembler->CheckSpecialCharacterClass('n',
   2239                                              new_trace.backtrack())) {
   2240     // Newline means \n, \r, 0x2028 or 0x2029.
   2241     if (!compiler->ascii()) {
   2242       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
   2243     }
   2244     assembler->CheckCharacter('\n', &ok);
   2245     assembler->CheckNotCharacter('\r', new_trace.backtrack());
   2246   }
   2247   assembler->Bind(&ok);
   2248   on_success->Emit(compiler, &new_trace);
   2249 }
   2250 
   2251 
   2252 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
   2253 // when we know whether the next character must be a word character or not.
   2254 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
   2255                                   RegExpCompiler* compiler,
   2256                                   RegExpNode* on_success,
   2257                                   Trace* trace) {
   2258   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2259   Label done;
   2260 
   2261   Trace new_trace(*trace);
   2262 
   2263   bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
   2264   Label* on_word = expect_word_character ? &done : new_trace.backtrack();
   2265   Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
   2266 
   2267   // Check whether previous character was a word character.
   2268   switch (trace->at_start()) {
   2269     case Trace::TRUE:
   2270       if (expect_word_character) {
   2271         assembler->GoTo(on_non_word);
   2272       }
   2273       break;
   2274     case Trace::UNKNOWN:
   2275       ASSERT_EQ(0, trace->cp_offset());
   2276       assembler->CheckAtStart(on_non_word);
   2277       // Fall through.
   2278     case Trace::FALSE:
   2279       int prev_char_offset = trace->cp_offset() - 1;
   2280       assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
   2281       EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
   2282       // We may or may not have loaded the previous character.
   2283       new_trace.InvalidateCurrentCharacter();
   2284   }
   2285 
   2286   assembler->Bind(&done);
   2287 
   2288   on_success->Emit(compiler, &new_trace);
   2289 }
   2290 
   2291 
   2292 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
   2293 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
   2294                               RegExpCompiler* compiler,
   2295                               RegExpNode* on_success,
   2296                               Trace* trace) {
   2297   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2298   Label before_non_word;
   2299   Label before_word;
   2300   if (trace->characters_preloaded() != 1) {
   2301     assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
   2302   }
   2303   // Fall through on non-word.
   2304   EmitWordCheck(assembler, &before_word, &before_non_word, false);
   2305 
   2306   // We will be loading the previous character into the current character
   2307   // register.
   2308   Trace new_trace(*trace);
   2309   new_trace.InvalidateCurrentCharacter();
   2310 
   2311   Label ok;
   2312   Label* boundary;
   2313   Label* not_boundary;
   2314   if (type == AssertionNode::AT_BOUNDARY) {
   2315     boundary = &ok;
   2316     not_boundary = new_trace.backtrack();
   2317   } else {
   2318     not_boundary = &ok;
   2319     boundary = new_trace.backtrack();
   2320   }
   2321 
   2322   // Next character is not a word character.
   2323   assembler->Bind(&before_non_word);
   2324   if (new_trace.cp_offset() == 0) {
   2325     // The start of input counts as a non-word character, so the question is
   2326     // decided if we are at the start.
   2327     assembler->CheckAtStart(not_boundary);
   2328   }
   2329   // We already checked that we are not at the start of input so it must be
   2330   // OK to load the previous character.
   2331   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2332                                   &ok,  // Unused dummy label in this call.
   2333                                   false);
   2334   // Fall through on non-word.
   2335   EmitWordCheck(assembler, boundary, not_boundary, false);
   2336   assembler->GoTo(not_boundary);
   2337 
   2338   // Next character is a word character.
   2339   assembler->Bind(&before_word);
   2340   if (new_trace.cp_offset() == 0) {
   2341     // The start of input counts as a non-word character, so the question is
   2342     // decided if we are at the start.
   2343     assembler->CheckAtStart(boundary);
   2344   }
   2345   // We already checked that we are not at the start of input so it must be
   2346   // OK to load the previous character.
   2347   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2348                                   &ok,  // Unused dummy label in this call.
   2349                                   false);
   2350   bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
   2351   EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
   2352 
   2353   assembler->Bind(&ok);
   2354 
   2355   on_success->Emit(compiler, &new_trace);
   2356 }
   2357 
   2358 
   2359 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2360                                          RegExpCompiler* compiler,
   2361                                          int filled_in,
   2362                                          bool not_at_start) {
   2363   if (type_ == AT_START && not_at_start) {
   2364     details->set_cannot_match();
   2365     return;
   2366   }
   2367   return on_success()->GetQuickCheckDetails(details,
   2368                                             compiler,
   2369                                             filled_in,
   2370                                             not_at_start);
   2371 }
   2372 
   2373 
   2374 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2375   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2376   switch (type_) {
   2377     case AT_END: {
   2378       Label ok;
   2379       assembler->CheckPosition(trace->cp_offset(), &ok);
   2380       assembler->GoTo(trace->backtrack());
   2381       assembler->Bind(&ok);
   2382       break;
   2383     }
   2384     case AT_START: {
   2385       if (trace->at_start() == Trace::FALSE) {
   2386         assembler->GoTo(trace->backtrack());
   2387         return;
   2388       }
   2389       if (trace->at_start() == Trace::UNKNOWN) {
   2390         assembler->CheckNotAtStart(trace->backtrack());
   2391         Trace at_start_trace = *trace;
   2392         at_start_trace.set_at_start(true);
   2393         on_success()->Emit(compiler, &at_start_trace);
   2394         return;
   2395       }
   2396     }
   2397     break;
   2398     case AFTER_NEWLINE:
   2399       EmitHat(compiler, on_success(), trace);
   2400       return;
   2401     case AT_BOUNDARY:
   2402     case AT_NON_BOUNDARY: {
   2403       EmitBoundaryCheck(type_, compiler, on_success(), trace);
   2404       return;
   2405     }
   2406     case AFTER_WORD_CHARACTER:
   2407     case AFTER_NONWORD_CHARACTER: {
   2408       EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
   2409     }
   2410   }
   2411   on_success()->Emit(compiler, trace);
   2412 }
   2413 
   2414 
   2415 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
   2416   if (quick_check == NULL) return false;
   2417   if (offset >= quick_check->characters()) return false;
   2418   return quick_check->positions(offset)->determines_perfectly;
   2419 }
   2420 
   2421 
   2422 static void UpdateBoundsCheck(int index, int* checked_up_to) {
   2423   if (index > *checked_up_to) {
   2424     *checked_up_to = index;
   2425   }
   2426 }
   2427 
   2428 
   2429 // We call this repeatedly to generate code for each pass over the text node.
   2430 // The passes are in increasing order of difficulty because we hope one
   2431 // of the first passes will fail in which case we are saved the work of the
   2432 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
   2433 // we will check the '%' in the first pass, the case independent 'a' in the
   2434 // second pass and the character class in the last pass.
   2435 //
   2436 // The passes are done from right to left, so for example to test for /bar/
   2437 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
   2438 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
   2439 // bounds check most of the time.  In the example we only need to check for
   2440 // end-of-input when loading the putative 'r'.
   2441 //
   2442 // A slight complication involves the fact that the first character may already
   2443 // be fetched into a register by the previous node.  In this case we want to
   2444 // do the test for that character first.  We do this in separate passes.  The
   2445 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
   2446 // pass has been performed then subsequent passes will have true in
   2447 // first_element_checked to indicate that that character does not need to be
   2448 // checked again.
   2449 //
   2450 // In addition to all this we are passed a Trace, which can
   2451 // contain an AlternativeGeneration object.  In this AlternativeGeneration
   2452 // object we can see details of any quick check that was already passed in
   2453 // order to get to the code we are now generating.  The quick check can involve
   2454 // loading characters, which means we do not need to recheck the bounds
   2455 // up to the limit the quick check already checked.  In addition the quick
   2456 // check can have involved a mask and compare operation which may simplify
   2457 // or obviate the need for further checks at some character positions.
   2458 void TextNode::TextEmitPass(RegExpCompiler* compiler,
   2459                             TextEmitPassType pass,
   2460                             bool preloaded,
   2461                             Trace* trace,
   2462                             bool first_element_checked,
   2463                             int* checked_up_to) {
   2464   Isolate* isolate = Isolate::Current();
   2465   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2466   bool ascii = compiler->ascii();
   2467   Label* backtrack = trace->backtrack();
   2468   QuickCheckDetails* quick_check = trace->quick_check_performed();
   2469   int element_count = elms_->length();
   2470   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
   2471     TextElement elm = elms_->at(i);
   2472     int cp_offset = trace->cp_offset() + elm.cp_offset;
   2473     if (elm.type == TextElement::ATOM) {
   2474       Vector<const uc16> quarks = elm.data.u_atom->data();
   2475       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
   2476         if (first_element_checked && i == 0 && j == 0) continue;
   2477         if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
   2478         EmitCharacterFunction* emit_function = NULL;
   2479         switch (pass) {
   2480           case NON_ASCII_MATCH:
   2481             ASSERT(ascii);
   2482             if (quarks[j] > String::kMaxAsciiCharCode) {
   2483               assembler->GoTo(backtrack);
   2484               return;
   2485             }
   2486             break;
   2487           case NON_LETTER_CHARACTER_MATCH:
   2488             emit_function = &EmitAtomNonLetter;
   2489             break;
   2490           case SIMPLE_CHARACTER_MATCH:
   2491             emit_function = &EmitSimpleCharacter;
   2492             break;
   2493           case CASE_CHARACTER_MATCH:
   2494             emit_function = &EmitAtomLetter;
   2495             break;
   2496           default:
   2497             break;
   2498         }
   2499         if (emit_function != NULL) {
   2500           bool bound_checked = emit_function(isolate,
   2501                                              compiler,
   2502                                              quarks[j],
   2503                                              backtrack,
   2504                                              cp_offset + j,
   2505                                              *checked_up_to < cp_offset + j,
   2506                                              preloaded);
   2507           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
   2508         }
   2509       }
   2510     } else {
   2511       ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
   2512       if (pass == CHARACTER_CLASS_MATCH) {
   2513         if (first_element_checked && i == 0) continue;
   2514         if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
   2515         RegExpCharacterClass* cc = elm.data.u_char_class;
   2516         EmitCharClass(assembler,
   2517                       cc,
   2518                       ascii,
   2519                       backtrack,
   2520                       cp_offset,
   2521                       *checked_up_to < cp_offset,
   2522                       preloaded);
   2523         UpdateBoundsCheck(cp_offset, checked_up_to);
   2524       }
   2525     }
   2526   }
   2527 }
   2528 
   2529 
   2530 int TextNode::Length() {
   2531   TextElement elm = elms_->last();
   2532   ASSERT(elm.cp_offset >= 0);
   2533   if (elm.type == TextElement::ATOM) {
   2534     return elm.cp_offset + elm.data.u_atom->data().length();
   2535   } else {
   2536     return elm.cp_offset + 1;
   2537   }
   2538 }
   2539 
   2540 
   2541 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
   2542   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
   2543   if (ignore_case) {
   2544     return pass == SIMPLE_CHARACTER_MATCH;
   2545   } else {
   2546     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
   2547   }
   2548 }
   2549 
   2550 
   2551 // This generates the code to match a text node.  A text node can contain
   2552 // straight character sequences (possibly to be matched in a case-independent
   2553 // way) and character classes.  For efficiency we do not do this in a single
   2554 // pass from left to right.  Instead we pass over the text node several times,
   2555 // emitting code for some character positions every time.  See the comment on
   2556 // TextEmitPass for details.
   2557 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2558   LimitResult limit_result = LimitVersions(compiler, trace);
   2559   if (limit_result == DONE) return;
   2560   ASSERT(limit_result == CONTINUE);
   2561 
   2562   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
   2563     compiler->SetRegExpTooBig();
   2564     return;
   2565   }
   2566 
   2567   if (compiler->ascii()) {
   2568     int dummy = 0;
   2569     TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
   2570   }
   2571 
   2572   bool first_elt_done = false;
   2573   int bound_checked_to = trace->cp_offset() - 1;
   2574   bound_checked_to += trace->bound_checked_up_to();
   2575 
   2576   // If a character is preloaded into the current character register then
   2577   // check that now.
   2578   if (trace->characters_preloaded() == 1) {
   2579     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   2580       if (!SkipPass(pass, compiler->ignore_case())) {
   2581         TextEmitPass(compiler,
   2582                      static_cast<TextEmitPassType>(pass),
   2583                      true,
   2584                      trace,
   2585                      false,
   2586                      &bound_checked_to);
   2587       }
   2588     }
   2589     first_elt_done = true;
   2590   }
   2591 
   2592   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   2593     if (!SkipPass(pass, compiler->ignore_case())) {
   2594       TextEmitPass(compiler,
   2595                    static_cast<TextEmitPassType>(pass),
   2596                    false,
   2597                    trace,
   2598                    first_elt_done,
   2599                    &bound_checked_to);
   2600     }
   2601   }
   2602 
   2603   Trace successor_trace(*trace);
   2604   successor_trace.set_at_start(false);
   2605   successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
   2606   RecursionCheck rc(compiler);
   2607   on_success()->Emit(compiler, &successor_trace);
   2608 }
   2609 
   2610 
   2611 void Trace::InvalidateCurrentCharacter() {
   2612   characters_preloaded_ = 0;
   2613 }
   2614 
   2615 
   2616 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
   2617   ASSERT(by > 0);
   2618   // We don't have an instruction for shifting the current character register
   2619   // down or for using a shifted value for anything so lets just forget that
   2620   // we preloaded any characters into it.
   2621   characters_preloaded_ = 0;
   2622   // Adjust the offsets of the quick check performed information.  This
   2623   // information is used to find out what we already determined about the
   2624   // characters by means of mask and compare.
   2625   quick_check_performed_.Advance(by, compiler->ascii());
   2626   cp_offset_ += by;
   2627   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
   2628     compiler->SetRegExpTooBig();
   2629     cp_offset_ = 0;
   2630   }
   2631   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
   2632 }
   2633 
   2634 
   2635 void TextNode::MakeCaseIndependent(bool is_ascii) {
   2636   int element_count = elms_->length();
   2637   for (int i = 0; i < element_count; i++) {
   2638     TextElement elm = elms_->at(i);
   2639     if (elm.type == TextElement::CHAR_CLASS) {
   2640       RegExpCharacterClass* cc = elm.data.u_char_class;
   2641       // None of the standard character classes is different in the case
   2642       // independent case and it slows us down if we don't know that.
   2643       if (cc->is_standard()) continue;
   2644       ZoneList<CharacterRange>* ranges = cc->ranges();
   2645       int range_count = ranges->length();
   2646       for (int j = 0; j < range_count; j++) {
   2647         ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
   2648       }
   2649     }
   2650   }
   2651 }
   2652 
   2653 
   2654 int TextNode::GreedyLoopTextLength() {
   2655   TextElement elm = elms_->at(elms_->length() - 1);
   2656   if (elm.type == TextElement::CHAR_CLASS) {
   2657     return elm.cp_offset + 1;
   2658   } else {
   2659     return elm.cp_offset + elm.data.u_atom->data().length();
   2660   }
   2661 }
   2662 
   2663 
   2664 // Finds the fixed match length of a sequence of nodes that goes from
   2665 // this alternative and back to this choice node.  If there are variable
   2666 // length nodes or other complications in the way then return a sentinel
   2667 // value indicating that a greedy loop cannot be constructed.
   2668 int ChoiceNode::GreedyLoopTextLengthForAlternative(
   2669     GuardedAlternative* alternative) {
   2670   int length = 0;
   2671   RegExpNode* node = alternative->node();
   2672   // Later we will generate code for all these text nodes using recursion
   2673   // so we have to limit the max number.
   2674   int recursion_depth = 0;
   2675   while (node != this) {
   2676     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
   2677       return kNodeIsTooComplexForGreedyLoops;
   2678     }
   2679     int node_length = node->GreedyLoopTextLength();
   2680     if (node_length == kNodeIsTooComplexForGreedyLoops) {
   2681       return kNodeIsTooComplexForGreedyLoops;
   2682     }
   2683     length += node_length;
   2684     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
   2685     node = seq_node->on_success();
   2686   }
   2687   return length;
   2688 }
   2689 
   2690 
   2691 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
   2692   ASSERT_EQ(loop_node_, NULL);
   2693   AddAlternative(alt);
   2694   loop_node_ = alt.node();
   2695 }
   2696 
   2697 
   2698 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
   2699   ASSERT_EQ(continue_node_, NULL);
   2700   AddAlternative(alt);
   2701   continue_node_ = alt.node();
   2702 }
   2703 
   2704 
   2705 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2706   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2707   if (trace->stop_node() == this) {
   2708     int text_length =
   2709         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
   2710     ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
   2711     // Update the counter-based backtracking info on the stack.  This is an
   2712     // optimization for greedy loops (see below).
   2713     ASSERT(trace->cp_offset() == text_length);
   2714     macro_assembler->AdvanceCurrentPosition(text_length);
   2715     macro_assembler->GoTo(trace->loop_label());
   2716     return;
   2717   }
   2718   ASSERT(trace->stop_node() == NULL);
   2719   if (!trace->is_trivial()) {
   2720     trace->Flush(compiler, this);
   2721     return;
   2722   }
   2723   ChoiceNode::Emit(compiler, trace);
   2724 }
   2725 
   2726 
   2727 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
   2728                                            bool not_at_start) {
   2729   int preload_characters = EatsAtLeast(4, 0, not_at_start);
   2730   if (compiler->macro_assembler()->CanReadUnaligned()) {
   2731     bool ascii = compiler->ascii();
   2732     if (ascii) {
   2733       if (preload_characters > 4) preload_characters = 4;
   2734       // We can't preload 3 characters because there is no machine instruction
   2735       // to do that.  We can't just load 4 because we could be reading
   2736       // beyond the end of the string, which could cause a memory fault.
   2737       if (preload_characters == 3) preload_characters = 2;
   2738     } else {
   2739       if (preload_characters > 2) preload_characters = 2;
   2740     }
   2741   } else {
   2742     if (preload_characters > 1) preload_characters = 1;
   2743   }
   2744   return preload_characters;
   2745 }
   2746 
   2747 
   2748 // This class is used when generating the alternatives in a choice node.  It
   2749 // records the way the alternative is being code generated.
   2750 class AlternativeGeneration: public Malloced {
   2751  public:
   2752   AlternativeGeneration()
   2753       : possible_success(),
   2754         expects_preload(false),
   2755         after(),
   2756         quick_check_details() { }
   2757   Label possible_success;
   2758   bool expects_preload;
   2759   Label after;
   2760   QuickCheckDetails quick_check_details;
   2761 };
   2762 
   2763 
   2764 // Creates a list of AlternativeGenerations.  If the list has a reasonable
   2765 // size then it is on the stack, otherwise the excess is on the heap.
   2766 class AlternativeGenerationList {
   2767  public:
   2768   explicit AlternativeGenerationList(int count)
   2769       : alt_gens_(count) {
   2770     for (int i = 0; i < count && i < kAFew; i++) {
   2771       alt_gens_.Add(a_few_alt_gens_ + i);
   2772     }
   2773     for (int i = kAFew; i < count; i++) {
   2774       alt_gens_.Add(new AlternativeGeneration());
   2775     }
   2776   }
   2777   ~AlternativeGenerationList() {
   2778     for (int i = kAFew; i < alt_gens_.length(); i++) {
   2779       delete alt_gens_[i];
   2780       alt_gens_[i] = NULL;
   2781     }
   2782   }
   2783 
   2784   AlternativeGeneration* at(int i) {
   2785     return alt_gens_[i];
   2786   }
   2787 
   2788  private:
   2789   static const int kAFew = 10;
   2790   ZoneList<AlternativeGeneration*> alt_gens_;
   2791   AlternativeGeneration a_few_alt_gens_[kAFew];
   2792 };
   2793 
   2794 
   2795 /* Code generation for choice nodes.
   2796  *
   2797  * We generate quick checks that do a mask and compare to eliminate a
   2798  * choice.  If the quick check succeeds then it jumps to the continuation to
   2799  * do slow checks and check subsequent nodes.  If it fails (the common case)
   2800  * it falls through to the next choice.
   2801  *
   2802  * Here is the desired flow graph.  Nodes directly below each other imply
   2803  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
   2804  * 3 doesn't have a quick check so we have to call the slow check.
   2805  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
   2806  * regexp continuation is generated directly after the Sn node, up to the
   2807  * next GoTo if we decide to reuse some already generated code.  Some
   2808  * nodes expect preload_characters to be preloaded into the current
   2809  * character register.  R nodes do this preloading.  Vertices are marked
   2810  * F for failures and S for success (possible success in the case of quick
   2811  * nodes).  L, V, < and > are used as arrow heads.
   2812  *
   2813  * ----------> R
   2814  *             |
   2815  *             V
   2816  *            Q1 -----> S1
   2817  *             |   S   /
   2818  *            F|      /
   2819  *             |    F/
   2820  *             |    /
   2821  *             |   R
   2822  *             |  /
   2823  *             V L
   2824  *            Q2 -----> S2
   2825  *             |   S   /
   2826  *            F|      /
   2827  *             |    F/
   2828  *             |    /
   2829  *             |   R
   2830  *             |  /
   2831  *             V L
   2832  *            S3
   2833  *             |
   2834  *            F|
   2835  *             |
   2836  *             R
   2837  *             |
   2838  * backtrack   V
   2839  * <----------Q4
   2840  *   \    F    |
   2841  *    \        |S
   2842  *     \   F   V
   2843  *      \-----S4
   2844  *
   2845  * For greedy loops we reverse our expectation and expect to match rather
   2846  * than fail. Therefore we want the loop code to look like this (U is the
   2847  * unwind code that steps back in the greedy loop).  The following alternatives
   2848  * look the same as above.
   2849  *              _____
   2850  *             /     \
   2851  *             V     |
   2852  * ----------> S1    |
   2853  *            /|     |
   2854  *           / |S    |
   2855  *         F/  \_____/
   2856  *         /
   2857  *        |<-----------
   2858  *        |            \
   2859  *        V             \
   2860  *        Q2 ---> S2     \
   2861  *        |  S   /       |
   2862  *       F|     /        |
   2863  *        |   F/         |
   2864  *        |   /          |
   2865  *        |  R           |
   2866  *        | /            |
   2867  *   F    VL             |
   2868  * <------U              |
   2869  * back   |S             |
   2870  *        \______________/
   2871  */
   2872 
   2873 
   2874 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2875   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2876   int choice_count = alternatives_->length();
   2877 #ifdef DEBUG
   2878   for (int i = 0; i < choice_count - 1; i++) {
   2879     GuardedAlternative alternative = alternatives_->at(i);
   2880     ZoneList<Guard*>* guards = alternative.guards();
   2881     int guard_count = (guards == NULL) ? 0 : guards->length();
   2882     for (int j = 0; j < guard_count; j++) {
   2883       ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
   2884     }
   2885   }
   2886 #endif
   2887 
   2888   LimitResult limit_result = LimitVersions(compiler, trace);
   2889   if (limit_result == DONE) return;
   2890   ASSERT(limit_result == CONTINUE);
   2891 
   2892   int new_flush_budget = trace->flush_budget() / choice_count;
   2893   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
   2894     trace->Flush(compiler, this);
   2895     return;
   2896   }
   2897 
   2898   RecursionCheck rc(compiler);
   2899 
   2900   Trace* current_trace = trace;
   2901 
   2902   int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
   2903   bool greedy_loop = false;
   2904   Label greedy_loop_label;
   2905   Trace counter_backtrack_trace;
   2906   counter_backtrack_trace.set_backtrack(&greedy_loop_label);
   2907   if (not_at_start()) counter_backtrack_trace.set_at_start(false);
   2908 
   2909   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
   2910     // Here we have special handling for greedy loops containing only text nodes
   2911     // and other simple nodes.  These are handled by pushing the current
   2912     // position on the stack and then incrementing the current position each
   2913     // time around the switch.  On backtrack we decrement the current position
   2914     // and check it against the pushed value.  This avoids pushing backtrack
   2915     // information for each iteration of the loop, which could take up a lot of
   2916     // space.
   2917     greedy_loop = true;
   2918     ASSERT(trace->stop_node() == NULL);
   2919     macro_assembler->PushCurrentPosition();
   2920     current_trace = &counter_backtrack_trace;
   2921     Label greedy_match_failed;
   2922     Trace greedy_match_trace;
   2923     if (not_at_start()) greedy_match_trace.set_at_start(false);
   2924     greedy_match_trace.set_backtrack(&greedy_match_failed);
   2925     Label loop_label;
   2926     macro_assembler->Bind(&loop_label);
   2927     greedy_match_trace.set_stop_node(this);
   2928     greedy_match_trace.set_loop_label(&loop_label);
   2929     alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
   2930     macro_assembler->Bind(&greedy_match_failed);
   2931   }
   2932 
   2933   Label second_choice;  // For use in greedy matches.
   2934   macro_assembler->Bind(&second_choice);
   2935 
   2936   int first_normal_choice =