Home | History | Annotate | Download | only in src
      1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #include "v8.h"
     29 
     30 #include "ast.h"
     31 #include "compiler.h"
     32 #include "execution.h"
     33 #include "factory.h"
     34 #include "jsregexp.h"
     35 #include "platform.h"
     36 #include "runtime.h"
     37 #include "top.h"
     38 #include "compilation-cache.h"
     39 #include "string-stream.h"
     40 #include "parser.h"
     41 #include "regexp-macro-assembler.h"
     42 #include "regexp-macro-assembler-tracer.h"
     43 #include "regexp-macro-assembler-irregexp.h"
     44 #include "regexp-stack.h"
     45 
     46 #ifdef V8_NATIVE_REGEXP
     47 #if V8_TARGET_ARCH_IA32
     48 #include "ia32/regexp-macro-assembler-ia32.h"
     49 #elif V8_TARGET_ARCH_X64
     50 #include "x64/regexp-macro-assembler-x64.h"
     51 #elif V8_TARGET_ARCH_ARM
     52 #include "arm/regexp-macro-assembler-arm.h"
     53 #else
     54 #error Unsupported target architecture.
     55 #endif
     56 #endif
     57 
     58 #include "interpreter-irregexp.h"
     59 
     60 
     61 namespace v8 {
     62 namespace internal {
     63 
     64 
     65 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
     66                                                Handle<String> pattern,
     67                                                Handle<String> flags,
     68                                                bool* has_pending_exception) {
     69   // Call the construct code with 2 arguments.
     70   Object** argv[2] = { Handle<Object>::cast(pattern).location(),
     71                        Handle<Object>::cast(flags).location() };
     72   return Execution::New(constructor, 2, argv, has_pending_exception);
     73 }
     74 
     75 
     76 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
     77   int flags = JSRegExp::NONE;
     78   for (int i = 0; i < str->length(); i++) {
     79     switch (str->Get(i)) {
     80       case 'i':
     81         flags |= JSRegExp::IGNORE_CASE;
     82         break;
     83       case 'g':
     84         flags |= JSRegExp::GLOBAL;
     85         break;
     86       case 'm':
     87         flags |= JSRegExp::MULTILINE;
     88         break;
     89     }
     90   }
     91   return JSRegExp::Flags(flags);
     92 }
     93 
     94 
     95 static inline void ThrowRegExpException(Handle<JSRegExp> re,
     96                                         Handle<String> pattern,
     97                                         Handle<String> error_text,
     98                                         const char* message) {
     99   Handle<JSArray> array = Factory::NewJSArray(2);
    100   SetElement(array, 0, pattern);
    101   SetElement(array, 1, error_text);
    102   Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
    103   Top::Throw(*regexp_err);
    104 }
    105 
    106 
    107 // Generic RegExp methods. Dispatches to implementation specific methods.
    108 
    109 
    110 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
    111                                    Handle<String> pattern,
    112                                    Handle<String> flag_str) {
    113   JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
    114   Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
    115   bool in_cache = !cached.is_null();
    116   LOG(RegExpCompileEvent(re, in_cache));
    117 
    118   Handle<Object> result;
    119   if (in_cache) {
    120     re->set_data(*cached);
    121     return re;
    122   }
    123   FlattenString(pattern);
    124   CompilationZoneScope zone_scope(DELETE_ON_EXIT);
    125   RegExpCompileData parse_result;
    126   FlatStringReader reader(pattern);
    127   if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
    128     // Throw an exception if we fail to parse the pattern.
    129     ThrowRegExpException(re,
    130                          pattern,
    131                          parse_result.error,
    132                          "malformed_regexp");
    133     return Handle<Object>::null();
    134   }
    135 
    136   if (parse_result.simple && !flags.is_ignore_case()) {
    137     // Parse-tree is a single atom that is equal to the pattern.
    138     AtomCompile(re, pattern, flags, pattern);
    139   } else if (parse_result.tree->IsAtom() &&
    140       !flags.is_ignore_case() &&
    141       parse_result.capture_count == 0) {
    142     RegExpAtom* atom = parse_result.tree->AsAtom();
    143     Vector<const uc16> atom_pattern = atom->data();
    144     Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
    145     AtomCompile(re, pattern, flags, atom_string);
    146   } else {
    147     IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
    148   }
    149   ASSERT(re->data()->IsFixedArray());
    150   // Compilation succeeded so the data is set on the regexp
    151   // and we can store it in the cache.
    152   Handle<FixedArray> data(FixedArray::cast(re->data()));
    153   CompilationCache::PutRegExp(pattern, flags, data);
    154 
    155   return re;
    156 }
    157 
    158 
    159 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
    160                                 Handle<String> subject,
    161                                 int index,
    162                                 Handle<JSArray> last_match_info) {
    163   switch (regexp->TypeTag()) {
    164     case JSRegExp::ATOM:
    165       return AtomExec(regexp, subject, index, last_match_info);
    166     case JSRegExp::IRREGEXP: {
    167       Handle<Object> result =
    168           IrregexpExec(regexp, subject, index, last_match_info);
    169       ASSERT(!result.is_null() || Top::has_pending_exception());
    170       return result;
    171     }
    172     default:
    173       UNREACHABLE();
    174       return Handle<Object>::null();
    175   }
    176 }
    177 
    178 
    179 // RegExp Atom implementation: Simple string search using indexOf.
    180 
    181 
    182 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
    183                              Handle<String> pattern,
    184                              JSRegExp::Flags flags,
    185                              Handle<String> match_pattern) {
    186   Factory::SetRegExpAtomData(re,
    187                              JSRegExp::ATOM,
    188                              pattern,
    189                              flags,
    190                              match_pattern);
    191 }
    192 
    193 
    194 static void SetAtomLastCapture(FixedArray* array,
    195                                String* subject,
    196                                int from,
    197                                int to) {
    198   NoHandleAllocation no_handles;
    199   RegExpImpl::SetLastCaptureCount(array, 2);
    200   RegExpImpl::SetLastSubject(array, subject);
    201   RegExpImpl::SetLastInput(array, subject);
    202   RegExpImpl::SetCapture(array, 0, from);
    203   RegExpImpl::SetCapture(array, 1, to);
    204 }
    205 
    206 
    207 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
    208                                     Handle<String> subject,
    209                                     int index,
    210                                     Handle<JSArray> last_match_info) {
    211   Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
    212 
    213   uint32_t start_index = index;
    214 
    215   int value = Runtime::StringMatch(subject, needle, start_index);
    216   if (value == -1) return Factory::null_value();
    217   ASSERT(last_match_info->HasFastElements());
    218 
    219   {
    220     NoHandleAllocation no_handles;
    221     FixedArray* array = FixedArray::cast(last_match_info->elements());
    222     SetAtomLastCapture(array, *subject, value, value + needle->length());
    223   }
    224   return last_match_info;
    225 }
    226 
    227 
    228 // Irregexp implementation.
    229 
    230 // Ensures that the regexp object contains a compiled version of the
    231 // source for either ASCII or non-ASCII strings.
    232 // If the compiled version doesn't already exist, it is compiled
    233 // from the source pattern.
    234 // If compilation fails, an exception is thrown and this function
    235 // returns false.
    236 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
    237   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
    238 #ifdef V8_NATIVE_REGEXP
    239   if (compiled_code->IsCode()) return true;
    240 #else  // ! V8_NATIVE_REGEXP (RegExp interpreter code)
    241   if (compiled_code->IsByteArray()) return true;
    242 #endif
    243   return CompileIrregexp(re, is_ascii);
    244 }
    245 
    246 
    247 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
    248   // Compile the RegExp.
    249   CompilationZoneScope zone_scope(DELETE_ON_EXIT);
    250   Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
    251   if (entry->IsJSObject()) {
    252     // If it's a JSObject, a previous compilation failed and threw this object.
    253     // Re-throw the object without trying again.
    254     Top::Throw(entry);
    255     return false;
    256   }
    257   ASSERT(entry->IsTheHole());
    258 
    259   JSRegExp::Flags flags = re->GetFlags();
    260 
    261   Handle<String> pattern(re->Pattern());
    262   if (!pattern->IsFlat()) {
    263     FlattenString(pattern);
    264   }
    265 
    266   RegExpCompileData compile_data;
    267   FlatStringReader reader(pattern);
    268   if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
    269     // Throw an exception if we fail to parse the pattern.
    270     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
    271     ThrowRegExpException(re,
    272                          pattern,
    273                          compile_data.error,
    274                          "malformed_regexp");
    275     return false;
    276   }
    277   RegExpEngine::CompilationResult result =
    278       RegExpEngine::Compile(&compile_data,
    279                             flags.is_ignore_case(),
    280                             flags.is_multiline(),
    281                             pattern,
    282                             is_ascii);
    283   if (result.error_message != NULL) {
    284     // Unable to compile regexp.
    285     Handle<JSArray> array = Factory::NewJSArray(2);
    286     SetElement(array, 0, pattern);
    287     SetElement(array,
    288                1,
    289                Factory::NewStringFromUtf8(CStrVector(result.error_message)));
    290     Handle<Object> regexp_err =
    291         Factory::NewSyntaxError("malformed_regexp", array);
    292     Top::Throw(*regexp_err);
    293     re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
    294     return false;
    295   }
    296 
    297   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
    298   data->set(JSRegExp::code_index(is_ascii), result.code);
    299   int register_max = IrregexpMaxRegisterCount(*data);
    300   if (result.num_registers > register_max) {
    301     SetIrregexpMaxRegisterCount(*data, result.num_registers);
    302   }
    303 
    304   return true;
    305 }
    306 
    307 
    308 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
    309   return Smi::cast(
    310       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    311 }
    312 
    313 
    314 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
    315   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
    316 }
    317 
    318 
    319 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
    320   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
    321 }
    322 
    323 
    324 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
    325   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
    326 }
    327 
    328 
    329 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
    330   return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
    331 }
    332 
    333 
    334 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
    335   return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
    336 }
    337 
    338 
    339 void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
    340                                  Handle<String> pattern,
    341                                  JSRegExp::Flags flags,
    342                                  int capture_count) {
    343   // Initialize compiled code entries to null.
    344   Factory::SetRegExpIrregexpData(re,
    345                                  JSRegExp::IRREGEXP,
    346                                  pattern,
    347                                  flags,
    348                                  capture_count);
    349 }
    350 
    351 
    352 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
    353                                         Handle<String> subject,
    354                                         int previous_index,
    355                                         Handle<JSArray> last_match_info) {
    356   ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
    357 
    358   // Prepare space for the return values.
    359   int number_of_capture_registers =
    360       (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
    361 
    362 #ifndef V8_NATIVE_REGEXP
    363 #ifdef DEBUG
    364   if (FLAG_trace_regexp_bytecodes) {
    365     String* pattern = jsregexp->Pattern();
    366     PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
    367     PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
    368   }
    369 #endif
    370 #endif
    371 
    372   if (!subject->IsFlat()) {
    373     FlattenString(subject);
    374   }
    375 
    376   last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
    377 
    378   Handle<FixedArray> array;
    379 
    380   // Dispatch to the correct RegExp implementation.
    381   Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
    382 
    383 #ifdef V8_NATIVE_REGEXP
    384 
    385   OffsetsVector captures(number_of_capture_registers);
    386   int* captures_vector = captures.vector();
    387   NativeRegExpMacroAssembler::Result res;
    388   do {
    389     bool is_ascii = subject->IsAsciiRepresentation();
    390     if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
    391       return Handle<Object>::null();
    392     }
    393     Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
    394     res = NativeRegExpMacroAssembler::Match(code,
    395                                             subject,
    396                                             captures_vector,
    397                                             captures.length(),
    398                                             previous_index);
    399     // If result is RETRY, the string have changed representation, and we
    400     // must restart from scratch.
    401   } while (res == NativeRegExpMacroAssembler::RETRY);
    402   if (res == NativeRegExpMacroAssembler::EXCEPTION) {
    403     ASSERT(Top::has_pending_exception());
    404     return Handle<Object>::null();
    405   }
    406   ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
    407       || res == NativeRegExpMacroAssembler::FAILURE);
    408 
    409   if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
    410 
    411   array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
    412   ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
    413   // The captures come in (start, end+1) pairs.
    414   for (int i = 0; i < number_of_capture_registers; i += 2) {
    415     // Capture values are relative to start_offset only.
    416     // Convert them to be relative to start of string.
    417     if (captures_vector[i] >= 0) {
    418       captures_vector[i] += previous_index;
    419     }
    420     if (captures_vector[i + 1] >= 0) {
    421       captures_vector[i + 1] += previous_index;
    422     }
    423     SetCapture(*array, i, captures_vector[i]);
    424     SetCapture(*array, i + 1, captures_vector[i + 1]);
    425   }
    426 
    427 #else  // ! V8_NATIVE_REGEXP
    428 
    429   bool is_ascii = subject->IsAsciiRepresentation();
    430   if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
    431     return Handle<Object>::null();
    432   }
    433   // Now that we have done EnsureCompiledIrregexp we can get the number of
    434   // registers.
    435   int number_of_registers =
    436       IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
    437   OffsetsVector registers(number_of_registers);
    438   int* register_vector = registers.vector();
    439   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
    440     register_vector[i] = -1;
    441   }
    442   Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
    443 
    444   if (!IrregexpInterpreter::Match(byte_codes,
    445                                   subject,
    446                                   register_vector,
    447                                   previous_index)) {
    448     return Factory::null_value();
    449   }
    450 
    451   array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
    452   ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
    453   // The captures come in (start, end+1) pairs.
    454   for (int i = 0; i < number_of_capture_registers; i += 2) {
    455     SetCapture(*array, i, register_vector[i]);
    456     SetCapture(*array, i + 1, register_vector[i + 1]);
    457   }
    458 
    459 #endif  // V8_NATIVE_REGEXP
    460 
    461   SetLastCaptureCount(*array, number_of_capture_registers);
    462   SetLastSubject(*array, *subject);
    463   SetLastInput(*array, *subject);
    464 
    465   return last_match_info;
    466 }
    467 
    468 
    469 // -------------------------------------------------------------------
    470 // Implementation of the Irregexp regular expression engine.
    471 //
    472 // The Irregexp regular expression engine is intended to be a complete
    473 // implementation of ECMAScript regular expressions.  It generates either
    474 // bytecodes or native code.
    475 
    476 //   The Irregexp regexp engine is structured in three steps.
    477 //   1) The parser generates an abstract syntax tree.  See ast.cc.
    478 //   2) From the AST a node network is created.  The nodes are all
    479 //      subclasses of RegExpNode.  The nodes represent states when
    480 //      executing a regular expression.  Several optimizations are
    481 //      performed on the node network.
    482 //   3) From the nodes we generate either byte codes or native code
    483 //      that can actually execute the regular expression (perform
    484 //      the search).  The code generation step is described in more
    485 //      detail below.
    486 
    487 // Code generation.
    488 //
    489 //   The nodes are divided into four main categories.
    490 //   * Choice nodes
    491 //        These represent places where the regular expression can
    492 //        match in more than one way.  For example on entry to an
    493 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
    494 //   * Action nodes
    495 //        These represent places where some action should be
    496 //        performed.  Examples include recording the current position
    497 //        in the input string to a register (in order to implement
    498 //        captures) or other actions on register for example in order
    499 //        to implement the counters needed for {} repetitions.
    500 //   * Matching nodes
    501 //        These attempt to match some element part of the input string.
    502 //        Examples of elements include character classes, plain strings
    503 //        or back references.
    504 //   * End nodes
    505 //        These are used to implement the actions required on finding
    506 //        a successful match or failing to find a match.
    507 //
    508 //   The code generated (whether as byte codes or native code) maintains
    509 //   some state as it runs.  This consists of the following elements:
    510 //
    511 //   * The capture registers.  Used for string captures.
    512 //   * Other registers.  Used for counters etc.
    513 //   * The current position.
    514 //   * The stack of backtracking information.  Used when a matching node
    515 //     fails to find a match and needs to try an alternative.
    516 //
    517 // Conceptual regular expression execution model:
    518 //
    519 //   There is a simple conceptual model of regular expression execution
    520 //   which will be presented first.  The actual code generated is a more
    521 //   efficient simulation of the simple conceptual model:
    522 //
    523 //   * Choice nodes are implemented as follows:
    524 //     For each choice except the last {
    525 //       push current position
    526 //       push backtrack code location
    527 //       <generate code to test for choice>
    528 //       backtrack code location:
    529 //       pop current position
    530 //     }
    531 //     <generate code to test for last choice>
    532 //
    533 //   * Actions nodes are generated as follows
    534 //     <push affected registers on backtrack stack>
    535 //     <generate code to perform action>
    536 //     push backtrack code location
    537 //     <generate code to test for following nodes>
    538 //     backtrack code location:
    539 //     <pop affected registers to restore their state>
    540 //     <pop backtrack location from stack and go to it>
    541 //
    542 //   * Matching nodes are generated as follows:
    543 //     if input string matches at current position
    544 //       update current position
    545 //       <generate code to test for following nodes>
    546 //     else
    547 //       <pop backtrack location from stack and go to it>
    548 //
    549 //   Thus it can be seen that the current position is saved and restored
    550 //   by the choice nodes, whereas the registers are saved and restored by
    551 //   by the action nodes that manipulate them.
    552 //
    553 //   The other interesting aspect of this model is that nodes are generated
    554 //   at the point where they are needed by a recursive call to Emit().  If
    555 //   the node has already been code generated then the Emit() call will
    556 //   generate a jump to the previously generated code instead.  In order to
    557 //   limit recursion it is possible for the Emit() function to put the node
    558 //   on a work list for later generation and instead generate a jump.  The
    559 //   destination of the jump is resolved later when the code is generated.
    560 //
    561 // Actual regular expression code generation.
    562 //
    563 //   Code generation is actually more complicated than the above.  In order
    564 //   to improve the efficiency of the generated code some optimizations are
    565 //   performed
    566 //
    567 //   * Choice nodes have 1-character lookahead.
    568 //     A choice node looks at the following character and eliminates some of
    569 //     the choices immediately based on that character.  This is not yet
    570 //     implemented.
    571 //   * Simple greedy loops store reduced backtracking information.
    572 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
    573 //     then need to backtrack to a point where it can match "foo".  The naive
    574 //     implementation of this would push each character position onto the
    575 //     backtracking stack, then pop them off one by one.  This would use space
    576 //     proportional to the length of the input string.  However since the "."
    577 //     can only match in one way and always has a constant length (in this case
    578 //     of 1) it suffices to store the current position on the top of the stack
    579 //     once.  Matching now becomes merely incrementing the current position and
    580 //     backtracking becomes decrementing the current position and checking the
    581 //     result against the stored current position.  This is faster and saves
    582 //     space.
    583 //   * The current state is virtualized.
    584 //     This is used to defer expensive operations until it is clear that they
    585 //     are needed and to generate code for a node more than once, allowing
    586 //     specialized an efficient versions of the code to be created. This is
    587 //     explained in the section below.
    588 //
    589 // Execution state virtualization.
    590 //
    591 //   Instead of emitting code, nodes that manipulate the state can record their
    592 //   manipulation in an object called the Trace.  The Trace object can record a
    593 //   current position offset, an optional backtrack code location on the top of
    594 //   the virtualized backtrack stack and some register changes.  When a node is
    595 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
    596 //   will emit code to bring the actual state into line with the virtual state.
    597 //   Avoiding flushing the state can postpone some work (eg updates of capture
    598 //   registers).  Postponing work can save time when executing the regular
    599 //   expression since it may be found that the work never has to be done as a
    600 //   failure to match can occur.  In addition it is much faster to jump to a
    601 //   known backtrack code location than it is to pop an unknown backtrack
    602 //   location from the stack and jump there.
    603 //
    604 //   The virtual state found in the Trace affects code generation.  For example
    605 //   the virtual state contains the difference between the actual current
    606 //   position and the virtual current position, and matching code needs to use
    607 //   this offset to attempt a match in the correct location of the input
    608 //   string.  Therefore code generated for a non-trivial trace is specialized
    609 //   to that trace.  The code generator therefore has the ability to generate
    610 //   code for each node several times.  In order to limit the size of the
    611 //   generated code there is an arbitrary limit on how many specialized sets of
    612 //   code may be generated for a given node.  If the limit is reached, the
    613 //   trace is flushed and a generic version of the code for a node is emitted.
    614 //   This is subsequently used for that node.  The code emitted for non-generic
    615 //   trace is not recorded in the node and so it cannot currently be reused in
    616 //   the event that code generation is requested for an identical trace.
    617 
    618 
    619 void RegExpTree::AppendToText(RegExpText* text) {
    620   UNREACHABLE();
    621 }
    622 
    623 
    624 void RegExpAtom::AppendToText(RegExpText* text) {
    625   text->AddElement(TextElement::Atom(this));
    626 }
    627 
    628 
    629 void RegExpCharacterClass::AppendToText(RegExpText* text) {
    630   text->AddElement(TextElement::CharClass(this));
    631 }
    632 
    633 
    634 void RegExpText::AppendToText(RegExpText* text) {
    635   for (int i = 0; i < elements()->length(); i++)
    636     text->AddElement(elements()->at(i));
    637 }
    638 
    639 
    640 TextElement TextElement::Atom(RegExpAtom* atom) {
    641   TextElement result = TextElement(ATOM);
    642   result.data.u_atom = atom;
    643   return result;
    644 }
    645 
    646 
    647 TextElement TextElement::CharClass(
    648       RegExpCharacterClass* char_class) {
    649   TextElement result = TextElement(CHAR_CLASS);
    650   result.data.u_char_class = char_class;
    651   return result;
    652 }
    653 
    654 
    655 int TextElement::length() {
    656   if (type == ATOM) {
    657     return data.u_atom->length();
    658   } else {
    659     ASSERT(type == CHAR_CLASS);
    660     return 1;
    661   }
    662 }
    663 
    664 
    665 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
    666   if (table_ == NULL) {
    667     table_ = new DispatchTable();
    668     DispatchTableConstructor cons(table_, ignore_case);
    669     cons.BuildTable(this);
    670   }
    671   return table_;
    672 }
    673 
    674 
    675 class RegExpCompiler {
    676  public:
    677   RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
    678 
    679   int AllocateRegister() {
    680     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
    681       reg_exp_too_big_ = true;
    682       return next_register_;
    683     }
    684     return next_register_++;
    685   }
    686 
    687   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
    688                                            RegExpNode* start,
    689                                            int capture_count,
    690                                            Handle<String> pattern);
    691 
    692   inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
    693 
    694   static const int kImplementationOffset = 0;
    695   static const int kNumberOfRegistersOffset = 0;
    696   static const int kCodeOffset = 1;
    697 
    698   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
    699   EndNode* accept() { return accept_; }
    700 
    701   static const int kMaxRecursion = 100;
    702   inline int recursion_depth() { return recursion_depth_; }
    703   inline void IncrementRecursionDepth() { recursion_depth_++; }
    704   inline void DecrementRecursionDepth() { recursion_depth_--; }
    705 
    706   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
    707 
    708   inline bool ignore_case() { return ignore_case_; }
    709   inline bool ascii() { return ascii_; }
    710 
    711   static const int kNoRegister = -1;
    712  private:
    713   EndNode* accept_;
    714   int next_register_;
    715   List<RegExpNode*>* work_list_;
    716   int recursion_depth_;
    717   RegExpMacroAssembler* macro_assembler_;
    718   bool ignore_case_;
    719   bool ascii_;
    720   bool reg_exp_too_big_;
    721 };
    722 
    723 
    724 class RecursionCheck {
    725  public:
    726   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
    727     compiler->IncrementRecursionDepth();
    728   }
    729   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
    730  private:
    731   RegExpCompiler* compiler_;
    732 };
    733 
    734 
    735 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
    736   return RegExpEngine::CompilationResult("RegExp too big");
    737 }
    738 
    739 
    740 // Attempts to compile the regexp using an Irregexp code generator.  Returns
    741 // a fixed array or a null handle depending on whether it succeeded.
    742 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
    743     : next_register_(2 * (capture_count + 1)),
    744       work_list_(NULL),
    745       recursion_depth_(0),
    746       ignore_case_(ignore_case),
    747       ascii_(ascii),
    748       reg_exp_too_big_(false) {
    749   accept_ = new EndNode(EndNode::ACCEPT);
    750   ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
    751 }
    752 
    753 
    754 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
    755     RegExpMacroAssembler* macro_assembler,
    756     RegExpNode* start,
    757     int capture_count,
    758     Handle<String> pattern) {
    759 #ifdef DEBUG
    760   if (FLAG_trace_regexp_assembler)
    761     macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
    762   else
    763 #endif
    764     macro_assembler_ = macro_assembler;
    765   List <RegExpNode*> work_list(0);
    766   work_list_ = &work_list;
    767   Label fail;
    768   macro_assembler_->PushBacktrack(&fail);
    769   Trace new_trace;
    770   start->Emit(this, &new_trace);
    771   macro_assembler_->Bind(&fail);
    772   macro_assembler_->Fail();
    773   while (!work_list.is_empty()) {
    774     work_list.RemoveLast()->Emit(this, &new_trace);
    775   }
    776   if (reg_exp_too_big_) return IrregexpRegExpTooBig();
    777 
    778   Handle<Object> code = macro_assembler_->GetCode(pattern);
    779 
    780   work_list_ = NULL;
    781 #ifdef DEBUG
    782   if (FLAG_trace_regexp_assembler) {
    783     delete macro_assembler_;
    784   }
    785 #endif
    786   return RegExpEngine::CompilationResult(*code, next_register_);
    787 }
    788 
    789 
    790 bool Trace::DeferredAction::Mentions(int that) {
    791   if (type() == ActionNode::CLEAR_CAPTURES) {
    792     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
    793     return range.Contains(that);
    794   } else {
    795     return reg() == that;
    796   }
    797 }
    798 
    799 
    800 bool Trace::mentions_reg(int reg) {
    801   for (DeferredAction* action = actions_;
    802        action != NULL;
    803        action = action->next()) {
    804     if (action->Mentions(reg))
    805       return true;
    806   }
    807   return false;
    808 }
    809 
    810 
    811 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
    812   ASSERT_EQ(0, *cp_offset);
    813   for (DeferredAction* action = actions_;
    814        action != NULL;
    815        action = action->next()) {
    816     if (action->Mentions(reg)) {
    817       if (action->type() == ActionNode::STORE_POSITION) {
    818         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
    819         return true;
    820       } else {
    821         return false;
    822       }
    823     }
    824   }
    825   return false;
    826 }
    827 
    828 
    829 int Trace::FindAffectedRegisters(OutSet* affected_registers) {
    830   int max_register = RegExpCompiler::kNoRegister;
    831   for (DeferredAction* action = actions_;
    832        action != NULL;
    833        action = action->next()) {
    834     if (action->type() == ActionNode::CLEAR_CAPTURES) {
    835       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
    836       for (int i = range.from(); i <= range.to(); i++)
    837         affected_registers->Set(i);
    838       if (range.to() > max_register) max_register = range.to();
    839     } else {
    840       affected_registers->Set(action->reg());
    841       if (action->reg() > max_register) max_register = action->reg();
    842     }
    843   }
    844   return max_register;
    845 }
    846 
    847 
    848 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
    849                                      int max_register,
    850                                      OutSet& registers_to_pop,
    851                                      OutSet& registers_to_clear) {
    852   for (int reg = max_register; reg >= 0; reg--) {
    853     if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
    854     else if (registers_to_clear.Get(reg)) {
    855       int clear_to = reg;
    856       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
    857         reg--;
    858       }
    859       assembler->ClearRegisters(reg, clear_to);
    860     }
    861   }
    862 }
    863 
    864 
    865 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
    866                                    int max_register,
    867                                    OutSet& affected_registers,
    868                                    OutSet* registers_to_pop,
    869                                    OutSet* registers_to_clear) {
    870   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
    871   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
    872 
    873   // Count pushes performed to force a stack limit check occasionally.
    874   int pushes = 0;
    875 
    876   for (int reg = 0; reg <= max_register; reg++) {
    877     if (!affected_registers.Get(reg)) {
    878       continue;
    879     }
    880 
    881     // The chronologically first deferred action in the trace
    882     // is used to infer the action needed to restore a register
    883     // to its previous state (or not, if it's safe to ignore it).
    884     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
    885     DeferredActionUndoType undo_action = IGNORE;
    886 
    887     int value = 0;
    888     bool absolute = false;
    889     bool clear = false;
    890     int store_position = -1;
    891     // This is a little tricky because we are scanning the actions in reverse
    892     // historical order (newest first).
    893     for (DeferredAction* action = actions_;
    894          action != NULL;
    895          action = action->next()) {
    896       if (action->Mentions(reg)) {
    897         switch (action->type()) {
    898           case ActionNode::SET_REGISTER: {
    899             Trace::DeferredSetRegister* psr =
    900                 static_cast<Trace::DeferredSetRegister*>(action);
    901             if (!absolute) {
    902               value += psr->value();
    903               absolute = true;
    904             }
    905             // SET_REGISTER is currently only used for newly introduced loop
    906             // counters. They can have a significant previous value if they
    907             // occour in a loop. TODO(lrn): Propagate this information, so
    908             // we can set undo_action to IGNORE if we know there is no value to
    909             // restore.
    910             undo_action = RESTORE;
    911             ASSERT_EQ(store_position, -1);
    912             ASSERT(!clear);
    913             break;
    914           }
    915           case ActionNode::INCREMENT_REGISTER:
    916             if (!absolute) {
    917               value++;
    918             }
    919             ASSERT_EQ(store_position, -1);
    920             ASSERT(!clear);
    921             undo_action = RESTORE;
    922             break;
    923           case ActionNode::STORE_POSITION: {
    924             Trace::DeferredCapture* pc =
    925                 static_cast<Trace::DeferredCapture*>(action);
    926             if (!clear && store_position == -1) {
    927               store_position = pc->cp_offset();
    928             }
    929 
    930             // For captures we know that stores and clears alternate.
    931             // Other register, are never cleared, and if the occur
    932             // inside a loop, they might be assigned more than once.
    933             if (reg <= 1) {
    934               // Registers zero and one, aka "capture zero", is
    935               // always set correctly if we succeed. There is no
    936               // need to undo a setting on backtrack, because we
    937               // will set it again or fail.
    938               undo_action = IGNORE;
    939             } else {
    940               undo_action = pc->is_capture() ? CLEAR : RESTORE;
    941             }
    942             ASSERT(!absolute);
    943             ASSERT_EQ(value, 0);
    944             break;
    945           }
    946           case ActionNode::CLEAR_CAPTURES: {
    947             // Since we're scanning in reverse order, if we've already
    948             // set the position we have to ignore historically earlier
    949             // clearing operations.
    950             if (store_position == -1) {
    951               clear = true;
    952             }
    953             undo_action = RESTORE;
    954             ASSERT(!absolute);
    955             ASSERT_EQ(value, 0);
    956             break;
    957           }
    958           default:
    959             UNREACHABLE();
    960             break;
    961         }
    962       }
    963     }
    964     // Prepare for the undo-action (e.g., push if it's going to be popped).
    965     if (undo_action == RESTORE) {
    966       pushes++;
    967       RegExpMacroAssembler::StackCheckFlag stack_check =
    968           RegExpMacroAssembler::kNoStackLimitCheck;
    969       if (pushes == push_limit) {
    970         stack_check = RegExpMacroAssembler::kCheckStackLimit;
    971         pushes = 0;
    972       }
    973 
    974       assembler->PushRegister(reg, stack_check);
    975       registers_to_pop->Set(reg);
    976     } else if (undo_action == CLEAR) {
    977       registers_to_clear->Set(reg);
    978     }
    979     // Perform the chronologically last action (or accumulated increment)
    980     // for the register.
    981     if (store_position != -1) {
    982       assembler->WriteCurrentPositionToRegister(reg, store_position);
    983     } else if (clear) {
    984       assembler->ClearRegisters(reg, reg);
    985     } else if (absolute) {
    986       assembler->SetRegister(reg, value);
    987     } else if (value != 0) {
    988       assembler->AdvanceRegister(reg, value);
    989     }
    990   }
    991 }
    992 
    993 
    994 // This is called as we come into a loop choice node and some other tricky
    995 // nodes.  It normalizes the state of the code generator to ensure we can
    996 // generate generic code.
    997 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
    998   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    999 
   1000   ASSERT(!is_trivial());
   1001 
   1002   if (actions_ == NULL && backtrack() == NULL) {
   1003     // Here we just have some deferred cp advances to fix and we are back to
   1004     // a normal situation.  We may also have to forget some information gained
   1005     // through a quick check that was already performed.
   1006     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
   1007     // Create a new trivial state and generate the node with that.
   1008     Trace new_state;
   1009     successor->Emit(compiler, &new_state);
   1010     return;
   1011   }
   1012 
   1013   // Generate deferred actions here along with code to undo them again.
   1014   OutSet affected_registers;
   1015 
   1016   if (backtrack() != NULL) {
   1017     // Here we have a concrete backtrack location.  These are set up by choice
   1018     // nodes and so they indicate that we have a deferred save of the current
   1019     // position which we may need to emit here.
   1020     assembler->PushCurrentPosition();
   1021   }
   1022 
   1023   int max_register = FindAffectedRegisters(&affected_registers);
   1024   OutSet registers_to_pop;
   1025   OutSet registers_to_clear;
   1026   PerformDeferredActions(assembler,
   1027                          max_register,
   1028                          affected_registers,
   1029                          &registers_to_pop,
   1030                          &registers_to_clear);
   1031   if (cp_offset_ != 0) {
   1032     assembler->AdvanceCurrentPosition(cp_offset_);
   1033   }
   1034 
   1035   // Create a new trivial state and generate the node with that.
   1036   Label undo;
   1037   assembler->PushBacktrack(&undo);
   1038   Trace new_state;
   1039   successor->Emit(compiler, &new_state);
   1040 
   1041   // On backtrack we need to restore state.
   1042   assembler->Bind(&undo);
   1043   RestoreAffectedRegisters(assembler,
   1044                            max_register,
   1045                            registers_to_pop,
   1046                            registers_to_clear);
   1047   if (backtrack() == NULL) {
   1048     assembler->Backtrack();
   1049   } else {
   1050     assembler->PopCurrentPosition();
   1051     assembler->GoTo(backtrack());
   1052   }
   1053 }
   1054 
   1055 
   1056 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
   1057   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1058 
   1059   // Omit flushing the trace. We discard the entire stack frame anyway.
   1060 
   1061   if (!label()->is_bound()) {
   1062     // We are completely independent of the trace, since we ignore it,
   1063     // so this code can be used as the generic version.
   1064     assembler->Bind(label());
   1065   }
   1066 
   1067   // Throw away everything on the backtrack stack since the start
   1068   // of the negative submatch and restore the character position.
   1069   assembler->ReadCurrentPositionFromRegister(current_position_register_);
   1070   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
   1071   if (clear_capture_count_ > 0) {
   1072     // Clear any captures that might have been performed during the success
   1073     // of the body of the negative look-ahead.
   1074     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
   1075     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
   1076   }
   1077   // Now that we have unwound the stack we find at the top of the stack the
   1078   // backtrack that the BeginSubmatch node got.
   1079   assembler->Backtrack();
   1080 }
   1081 
   1082 
   1083 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   1084   if (!trace->is_trivial()) {
   1085     trace->Flush(compiler, this);
   1086     return;
   1087   }
   1088   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1089   if (!label()->is_bound()) {
   1090     assembler->Bind(label());
   1091   }
   1092   switch (action_) {
   1093     case ACCEPT:
   1094       assembler->Succeed();
   1095       return;
   1096     case BACKTRACK:
   1097       assembler->GoTo(trace->backtrack());
   1098       return;
   1099     case NEGATIVE_SUBMATCH_SUCCESS:
   1100       // This case is handled in a different virtual method.
   1101       UNREACHABLE();
   1102   }
   1103   UNIMPLEMENTED();
   1104 }
   1105 
   1106 
   1107 void GuardedAlternative::AddGuard(Guard* guard) {
   1108   if (guards_ == NULL)
   1109     guards_ = new ZoneList<Guard*>(1);
   1110   guards_->Add(guard);
   1111 }
   1112 
   1113 
   1114 ActionNode* ActionNode::SetRegister(int reg,
   1115                                     int val,
   1116                                     RegExpNode* on_success) {
   1117   ActionNode* result = new ActionNode(SET_REGISTER, on_success);
   1118   result->data_.u_store_register.reg = reg;
   1119   result->data_.u_store_register.value = val;
   1120   return result;
   1121 }
   1122 
   1123 
   1124 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
   1125   ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
   1126   result->data_.u_increment_register.reg = reg;
   1127   return result;
   1128 }
   1129 
   1130 
   1131 ActionNode* ActionNode::StorePosition(int reg,
   1132                                       bool is_capture,
   1133                                       RegExpNode* on_success) {
   1134   ActionNode* result = new ActionNode(STORE_POSITION, on_success);
   1135   result->data_.u_position_register.reg = reg;
   1136   result->data_.u_position_register.is_capture = is_capture;
   1137   return result;
   1138 }
   1139 
   1140 
   1141 ActionNode* ActionNode::ClearCaptures(Interval range,
   1142                                       RegExpNode* on_success) {
   1143   ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
   1144   result->data_.u_clear_captures.range_from = range.from();
   1145   result->data_.u_clear_captures.range_to = range.to();
   1146   return result;
   1147 }
   1148 
   1149 
   1150 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
   1151                                       int position_reg,
   1152                                       RegExpNode* on_success) {
   1153   ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
   1154   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1155   result->data_.u_submatch.current_position_register = position_reg;
   1156   return result;
   1157 }
   1158 
   1159 
   1160 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
   1161                                                 int position_reg,
   1162                                                 int clear_register_count,
   1163                                                 int clear_register_from,
   1164                                                 RegExpNode* on_success) {
   1165   ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
   1166   result->data_.u_submatch.stack_pointer_register = stack_reg;
   1167   result->data_.u_submatch.current_position_register = position_reg;
   1168   result->data_.u_submatch.clear_register_count = clear_register_count;
   1169   result->data_.u_submatch.clear_register_from = clear_register_from;
   1170   return result;
   1171 }
   1172 
   1173 
   1174 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
   1175                                         int repetition_register,
   1176                                         int repetition_limit,
   1177                                         RegExpNode* on_success) {
   1178   ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
   1179   result->data_.u_empty_match_check.start_register = start_register;
   1180   result->data_.u_empty_match_check.repetition_register = repetition_register;
   1181   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
   1182   return result;
   1183 }
   1184 
   1185 
   1186 #define DEFINE_ACCEPT(Type)                                          \
   1187   void Type##Node::Accept(NodeVisitor* visitor) {                    \
   1188     visitor->Visit##Type(this);                                      \
   1189   }
   1190 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
   1191 #undef DEFINE_ACCEPT
   1192 
   1193 
   1194 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
   1195   visitor->VisitLoopChoice(this);
   1196 }
   1197 
   1198 
   1199 // -------------------------------------------------------------------
   1200 // Emit code.
   1201 
   1202 
   1203 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
   1204                                Guard* guard,
   1205                                Trace* trace) {
   1206   switch (guard->op()) {
   1207     case Guard::LT:
   1208       ASSERT(!trace->mentions_reg(guard->reg()));
   1209       macro_assembler->IfRegisterGE(guard->reg(),
   1210                                     guard->value(),
   1211                                     trace->backtrack());
   1212       break;
   1213     case Guard::GEQ:
   1214       ASSERT(!trace->mentions_reg(guard->reg()));
   1215       macro_assembler->IfRegisterLT(guard->reg(),
   1216                                     guard->value(),
   1217                                     trace->backtrack());
   1218       break;
   1219   }
   1220 }
   1221 
   1222 
   1223 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
   1224 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
   1225 
   1226 
   1227 // Returns the number of characters in the equivalence class, omitting those
   1228 // that cannot occur in the source string because it is ASCII.
   1229 static int GetCaseIndependentLetters(uc16 character,
   1230                                      bool ascii_subject,
   1231                                      unibrow::uchar* letters) {
   1232   int length = uncanonicalize.get(character, '\0', letters);
   1233   // Unibrow returns 0 or 1 for characters where case independependence is
   1234   // trivial.
   1235   if (length == 0) {
   1236     letters[0] = character;
   1237     length = 1;
   1238   }
   1239   if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
   1240     return length;
   1241   }
   1242   // The standard requires that non-ASCII characters cannot have ASCII
   1243   // character codes in their equivalence class.
   1244   return 0;
   1245 }
   1246 
   1247 
   1248 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
   1249                                        uc16 c,
   1250                                        Label* on_failure,
   1251                                        int cp_offset,
   1252                                        bool check,
   1253                                        bool preloaded) {
   1254   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1255   bool bound_checked = false;
   1256   if (!preloaded) {
   1257     assembler->LoadCurrentCharacter(
   1258         cp_offset,
   1259         on_failure,
   1260         check);
   1261     bound_checked = true;
   1262   }
   1263   assembler->CheckNotCharacter(c, on_failure);
   1264   return bound_checked;
   1265 }
   1266 
   1267 
   1268 // Only emits non-letters (things that don't have case).  Only used for case
   1269 // independent matches.
   1270 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
   1271                                      uc16 c,
   1272                                      Label* on_failure,
   1273                                      int cp_offset,
   1274                                      bool check,
   1275                                      bool preloaded) {
   1276   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1277   bool ascii = compiler->ascii();
   1278   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1279   int length = GetCaseIndependentLetters(c, ascii, chars);
   1280   if (length < 1) {
   1281     // This can't match.  Must be an ASCII subject and a non-ASCII character.
   1282     // We do not need to do anything since the ASCII pass already handled this.
   1283     return false;  // Bounds not checked.
   1284   }
   1285   bool checked = false;
   1286   // We handle the length > 1 case in a later pass.
   1287   if (length == 1) {
   1288     if (ascii && c > String::kMaxAsciiCharCodeU) {
   1289       // Can't match - see above.
   1290       return false;  // Bounds not checked.
   1291     }
   1292     if (!preloaded) {
   1293       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1294       checked = check;
   1295     }
   1296     macro_assembler->CheckNotCharacter(c, on_failure);
   1297   }
   1298   return checked;
   1299 }
   1300 
   1301 
   1302 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
   1303                                       bool ascii,
   1304                                       uc16 c1,
   1305                                       uc16 c2,
   1306                                       Label* on_failure) {
   1307   uc16 char_mask;
   1308   if (ascii) {
   1309     char_mask = String::kMaxAsciiCharCode;
   1310   } else {
   1311     char_mask = String::kMaxUC16CharCode;
   1312   }
   1313   uc16 exor = c1 ^ c2;
   1314   // Check whether exor has only one bit set.
   1315   if (((exor - 1) & exor) == 0) {
   1316     // If c1 and c2 differ only by one bit.
   1317     // Ecma262UnCanonicalize always gives the highest number last.
   1318     ASSERT(c2 > c1);
   1319     uc16 mask = char_mask ^ exor;
   1320     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
   1321     return true;
   1322   }
   1323   ASSERT(c2 > c1);
   1324   uc16 diff = c2 - c1;
   1325   if (((diff - 1) & diff) == 0 && c1 >= diff) {
   1326     // If the characters differ by 2^n but don't differ by one bit then
   1327     // subtract the difference from the found character, then do the or
   1328     // trick.  We avoid the theoretical case where negative numbers are
   1329     // involved in order to simplify code generation.
   1330     uc16 mask = char_mask ^ diff;
   1331     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
   1332                                                     diff,
   1333                                                     mask,
   1334                                                     on_failure);
   1335     return true;
   1336   }
   1337   return false;
   1338 }
   1339 
   1340 
   1341 typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
   1342                                    uc16 c,
   1343                                    Label* on_failure,
   1344                                    int cp_offset,
   1345                                    bool check,
   1346                                    bool preloaded);
   1347 
   1348 // Only emits letters (things that have case).  Only used for case independent
   1349 // matches.
   1350 static inline bool EmitAtomLetter(RegExpCompiler* compiler,
   1351                                   uc16 c,
   1352                                   Label* on_failure,
   1353                                   int cp_offset,
   1354                                   bool check,
   1355                                   bool preloaded) {
   1356   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1357   bool ascii = compiler->ascii();
   1358   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1359   int length = GetCaseIndependentLetters(c, ascii, chars);
   1360   if (length <= 1) return false;
   1361   // We may not need to check against the end of the input string
   1362   // if this character lies before a character that matched.
   1363   if (!preloaded) {
   1364     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
   1365   }
   1366   Label ok;
   1367   ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
   1368   switch (length) {
   1369     case 2: {
   1370       if (ShortCutEmitCharacterPair(macro_assembler,
   1371                                     ascii,
   1372                                     chars[0],
   1373                                     chars[1],
   1374                                     on_failure)) {
   1375       } else {
   1376         macro_assembler->CheckCharacter(chars[0], &ok);
   1377         macro_assembler->CheckNotCharacter(chars[1], on_failure);
   1378         macro_assembler->Bind(&ok);
   1379       }
   1380       break;
   1381     }
   1382     case 4:
   1383       macro_assembler->CheckCharacter(chars[3], &ok);
   1384       // Fall through!
   1385     case 3:
   1386       macro_assembler->CheckCharacter(chars[0], &ok);
   1387       macro_assembler->CheckCharacter(chars[1], &ok);
   1388       macro_assembler->CheckNotCharacter(chars[2], on_failure);
   1389       macro_assembler->Bind(&ok);
   1390       break;
   1391     default:
   1392       UNREACHABLE();
   1393       break;
   1394   }
   1395   return true;
   1396 }
   1397 
   1398 
   1399 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
   1400                           RegExpCharacterClass* cc,
   1401                           bool ascii,
   1402                           Label* on_failure,
   1403                           int cp_offset,
   1404                           bool check_offset,
   1405                           bool preloaded) {
   1406   ZoneList<CharacterRange>* ranges = cc->ranges();
   1407   int max_char;
   1408   if (ascii) {
   1409     max_char = String::kMaxAsciiCharCode;
   1410   } else {
   1411     max_char = String::kMaxUC16CharCode;
   1412   }
   1413 
   1414   Label success;
   1415 
   1416   Label* char_is_in_class =
   1417       cc->is_negated() ? on_failure : &success;
   1418 
   1419   int range_count = ranges->length();
   1420 
   1421   int last_valid_range = range_count - 1;
   1422   while (last_valid_range >= 0) {
   1423     CharacterRange& range = ranges->at(last_valid_range);
   1424     if (range.from() <= max_char) {
   1425       break;
   1426     }
   1427     last_valid_range--;
   1428   }
   1429 
   1430   if (last_valid_range < 0) {
   1431     if (!cc->is_negated()) {
   1432       // TODO(plesner): We can remove this when the node level does our
   1433       // ASCII optimizations for us.
   1434       macro_assembler->GoTo(on_failure);
   1435     }
   1436     if (check_offset) {
   1437       macro_assembler->CheckPosition(cp_offset, on_failure);
   1438     }
   1439     return;
   1440   }
   1441 
   1442   if (last_valid_range == 0 &&
   1443       !cc->is_negated() &&
   1444       ranges->at(0).IsEverything(max_char)) {
   1445     // This is a common case hit by non-anchored expressions.
   1446     if (check_offset) {
   1447       macro_assembler->CheckPosition(cp_offset, on_failure);
   1448     }
   1449     return;
   1450   }
   1451 
   1452   if (!preloaded) {
   1453     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
   1454   }
   1455 
   1456   if (cc->is_standard() &&
   1457         macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
   1458                                                     on_failure)) {
   1459       return;
   1460   }
   1461 
   1462   for (int i = 0; i < last_valid_range; i++) {
   1463     CharacterRange& range = ranges->at(i);
   1464     Label next_range;
   1465     uc16 from = range.from();
   1466     uc16 to = range.to();
   1467     if (from > max_char) {
   1468       continue;
   1469     }
   1470     if (to > max_char) to = max_char;
   1471     if (to == from) {
   1472       macro_assembler->CheckCharacter(to, char_is_in_class);
   1473     } else {
   1474       if (from != 0) {
   1475         macro_assembler->CheckCharacterLT(from, &next_range);
   1476       }
   1477       if (to != max_char) {
   1478         macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
   1479       } else {
   1480         macro_assembler->GoTo(char_is_in_class);
   1481       }
   1482     }
   1483     macro_assembler->Bind(&next_range);
   1484   }
   1485 
   1486   CharacterRange& range = ranges->at(last_valid_range);
   1487   uc16 from = range.from();
   1488   uc16 to = range.to();
   1489 
   1490   if (to > max_char) to = max_char;
   1491   ASSERT(to >= from);
   1492 
   1493   if (to == from) {
   1494     if (cc->is_negated()) {
   1495       macro_assembler->CheckCharacter(to, on_failure);
   1496     } else {
   1497       macro_assembler->CheckNotCharacter(to, on_failure);
   1498     }
   1499   } else {
   1500     if (from != 0) {
   1501       if (cc->is_negated()) {
   1502         macro_assembler->CheckCharacterLT(from, &success);
   1503       } else {
   1504         macro_assembler->CheckCharacterLT(from, on_failure);
   1505       }
   1506     }
   1507     if (to != String::kMaxUC16CharCode) {
   1508       if (cc->is_negated()) {
   1509         macro_assembler->CheckCharacterLT(to + 1, on_failure);
   1510       } else {
   1511         macro_assembler->CheckCharacterGT(to, on_failure);
   1512       }
   1513     } else {
   1514       if (cc->is_negated()) {
   1515         macro_assembler->GoTo(on_failure);
   1516       }
   1517     }
   1518   }
   1519   macro_assembler->Bind(&success);
   1520 }
   1521 
   1522 
   1523 RegExpNode::~RegExpNode() {
   1524 }
   1525 
   1526 
   1527 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
   1528                                                   Trace* trace) {
   1529   // If we are generating a greedy loop then don't stop and don't reuse code.
   1530   if (trace->stop_node() != NULL) {
   1531     return CONTINUE;
   1532   }
   1533 
   1534   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   1535   if (trace->is_trivial()) {
   1536     if (label_.is_bound()) {
   1537       // We are being asked to generate a generic version, but that's already
   1538       // been done so just go to it.
   1539       macro_assembler->GoTo(&label_);
   1540       return DONE;
   1541     }
   1542     if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
   1543       // To avoid too deep recursion we push the node to the work queue and just
   1544       // generate a goto here.
   1545       compiler->AddWork(this);
   1546       macro_assembler->GoTo(&label_);
   1547       return DONE;
   1548     }
   1549     // Generate generic version of the node and bind the label for later use.
   1550     macro_assembler->Bind(&label_);
   1551     return CONTINUE;
   1552   }
   1553 
   1554   // We are being asked to make a non-generic version.  Keep track of how many
   1555   // non-generic versions we generate so as not to overdo it.
   1556   trace_count_++;
   1557   if (FLAG_regexp_optimization &&
   1558       trace_count_ < kMaxCopiesCodeGenerated &&
   1559       compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
   1560     return CONTINUE;
   1561   }
   1562 
   1563   // If we get here code has been generated for this node too many times or
   1564   // recursion is too deep.  Time to switch to a generic version.  The code for
   1565   // generic versions above can handle deep recursion properly.
   1566   trace->Flush(compiler, this);
   1567   return DONE;
   1568 }
   1569 
   1570 
   1571 int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
   1572   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1573   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
   1574   return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
   1575 }
   1576 
   1577 
   1578 int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
   1579   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1580   return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
   1581 }
   1582 
   1583 
   1584 int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
   1585   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1586   return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
   1587 }
   1588 
   1589 
   1590 int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
   1591   int answer = Length();
   1592   if (answer >= still_to_find) return answer;
   1593   if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
   1594   return answer + on_success()->EatsAtLeast(still_to_find - answer,
   1595                                             recursion_depth + 1);
   1596 }
   1597 
   1598 
   1599 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
   1600                                              int recursion_depth) {
   1601   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1602   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   1603   // afterwards.
   1604   RegExpNode* node = alternatives_->at(1).node();
   1605   return node->EatsAtLeast(still_to_find, recursion_depth + 1);
   1606 }
   1607 
   1608 
   1609 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
   1610     QuickCheckDetails* details,
   1611     RegExpCompiler* compiler,
   1612     int filled_in,
   1613     bool not_at_start) {
   1614   // Alternative 0 is the negative lookahead, alternative 1 is what comes
   1615   // afterwards.
   1616   RegExpNode* node = alternatives_->at(1).node();
   1617   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
   1618 }
   1619 
   1620 
   1621 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
   1622                                   int recursion_depth,
   1623                                   RegExpNode* ignore_this_node) {
   1624   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
   1625   int min = 100;
   1626   int choice_count = alternatives_->length();
   1627   for (int i = 0; i < choice_count; i++) {
   1628     RegExpNode* node = alternatives_->at(i).node();
   1629     if (node == ignore_this_node) continue;
   1630     int node_eats_at_least = node->EatsAtLeast(still_to_find,
   1631                                                recursion_depth + 1);
   1632     if (node_eats_at_least < min) min = node_eats_at_least;
   1633   }
   1634   return min;
   1635 }
   1636 
   1637 
   1638 int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
   1639   return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
   1640 }
   1641 
   1642 
   1643 int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
   1644   return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
   1645 }
   1646 
   1647 
   1648 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
   1649 static inline uint32_t SmearBitsRight(uint32_t v) {
   1650   v |= v >> 1;
   1651   v |= v >> 2;
   1652   v |= v >> 4;
   1653   v |= v >> 8;
   1654   v |= v >> 16;
   1655   return v;
   1656 }
   1657 
   1658 
   1659 bool QuickCheckDetails::Rationalize(bool asc) {
   1660   bool found_useful_op = false;
   1661   uint32_t char_mask;
   1662   if (asc) {
   1663     char_mask = String::kMaxAsciiCharCode;
   1664   } else {
   1665     char_mask = String::kMaxUC16CharCode;
   1666   }
   1667   mask_ = 0;
   1668   value_ = 0;
   1669   int char_shift = 0;
   1670   for (int i = 0; i < characters_; i++) {
   1671     Position* pos = &positions_[i];
   1672     if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
   1673       found_useful_op = true;
   1674     }
   1675     mask_ |= (pos->mask & char_mask) << char_shift;
   1676     value_ |= (pos->value & char_mask) << char_shift;
   1677     char_shift += asc ? 8 : 16;
   1678   }
   1679   return found_useful_op;
   1680 }
   1681 
   1682 
   1683 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
   1684                                 Trace* trace,
   1685                                 bool preload_has_checked_bounds,
   1686                                 Label* on_possible_success,
   1687                                 QuickCheckDetails* details,
   1688                                 bool fall_through_on_failure) {
   1689   if (details->characters() == 0) return false;
   1690   GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
   1691   if (details->cannot_match()) return false;
   1692   if (!details->Rationalize(compiler->ascii())) return false;
   1693   ASSERT(details->characters() == 1 ||
   1694          compiler->macro_assembler()->CanReadUnaligned());
   1695   uint32_t mask = details->mask();
   1696   uint32_t value = details->value();
   1697 
   1698   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   1699 
   1700   if (trace->characters_preloaded() != details->characters()) {
   1701     assembler->LoadCurrentCharacter(trace->cp_offset(),
   1702                                     trace->backtrack(),
   1703                                     !preload_has_checked_bounds,
   1704                                     details->characters());
   1705   }
   1706 
   1707 
   1708   bool need_mask = true;
   1709 
   1710   if (details->characters() == 1) {
   1711     // If number of characters preloaded is 1 then we used a byte or 16 bit
   1712     // load so the value is already masked down.
   1713     uint32_t char_mask;
   1714     if (compiler->ascii()) {
   1715       char_mask = String::kMaxAsciiCharCode;
   1716     } else {
   1717       char_mask = String::kMaxUC16CharCode;
   1718     }
   1719     if ((mask & char_mask) == char_mask) need_mask = false;
   1720     mask &= char_mask;
   1721   } else {
   1722     // For 2-character preloads in ASCII mode we also use a 16 bit load with
   1723     // zero extend.
   1724     if (details->characters() == 2 && compiler->ascii()) {
   1725       if ((mask & 0xffff) == 0xffff) need_mask = false;
   1726     } else {
   1727       if (mask == 0xffffffff) need_mask = false;
   1728     }
   1729   }
   1730 
   1731   if (fall_through_on_failure) {
   1732     if (need_mask) {
   1733       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
   1734     } else {
   1735       assembler->CheckCharacter(value, on_possible_success);
   1736     }
   1737   } else {
   1738     if (need_mask) {
   1739       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
   1740     } else {
   1741       assembler->CheckNotCharacter(value, trace->backtrack());
   1742     }
   1743   }
   1744   return true;
   1745 }
   1746 
   1747 
   1748 // Here is the meat of GetQuickCheckDetails (see also the comment on the
   1749 // super-class in the .h file).
   1750 //
   1751 // We iterate along the text object, building up for each character a
   1752 // mask and value that can be used to test for a quick failure to match.
   1753 // The masks and values for the positions will be combined into a single
   1754 // machine word for the current character width in order to be used in
   1755 // generating a quick check.
   1756 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1757                                     RegExpCompiler* compiler,
   1758                                     int characters_filled_in,
   1759                                     bool not_at_start) {
   1760   ASSERT(characters_filled_in < details->characters());
   1761   int characters = details->characters();
   1762   int char_mask;
   1763   int char_shift;
   1764   if (compiler->ascii()) {
   1765     char_mask = String::kMaxAsciiCharCode;
   1766     char_shift = 8;
   1767   } else {
   1768     char_mask = String::kMaxUC16CharCode;
   1769     char_shift = 16;
   1770   }
   1771   for (int k = 0; k < elms_->length(); k++) {
   1772     TextElement elm = elms_->at(k);
   1773     if (elm.type == TextElement::ATOM) {
   1774       Vector<const uc16> quarks = elm.data.u_atom->data();
   1775       for (int i = 0; i < characters && i < quarks.length(); i++) {
   1776         QuickCheckDetails::Position* pos =
   1777             details->positions(characters_filled_in);
   1778         uc16 c = quarks[i];
   1779         if (c > char_mask) {
   1780           // If we expect a non-ASCII character from an ASCII string,
   1781           // there is no way we can match. Not even case independent
   1782           // matching can turn an ASCII character into non-ASCII or
   1783           // vice versa.
   1784           details->set_cannot_match();
   1785           pos->determines_perfectly = false;
   1786           return;
   1787         }
   1788         if (compiler->ignore_case()) {
   1789           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   1790           int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
   1791           ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
   1792           if (length == 1) {
   1793             // This letter has no case equivalents, so it's nice and simple
   1794             // and the mask-compare will determine definitely whether we have
   1795             // a match at this character position.
   1796             pos->mask = char_mask;
   1797             pos->value = c;
   1798             pos->determines_perfectly = true;
   1799           } else {
   1800             uint32_t common_bits = char_mask;
   1801             uint32_t bits = chars[0];
   1802             for (int j = 1; j < length; j++) {
   1803               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
   1804               common_bits ^= differing_bits;
   1805               bits &= common_bits;
   1806             }
   1807             // If length is 2 and common bits has only one zero in it then
   1808             // our mask and compare instruction will determine definitely
   1809             // whether we have a match at this character position.  Otherwise
   1810             // it can only be an approximate check.
   1811             uint32_t one_zero = (common_bits | ~char_mask);
   1812             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
   1813               pos->determines_perfectly = true;
   1814             }
   1815             pos->mask = common_bits;
   1816             pos->value = bits;
   1817           }
   1818         } else {
   1819           // Don't ignore case.  Nice simple case where the mask-compare will
   1820           // determine definitely whether we have a match at this character
   1821           // position.
   1822           pos->mask = char_mask;
   1823           pos->value = c;
   1824           pos->determines_perfectly = true;
   1825         }
   1826         characters_filled_in++;
   1827         ASSERT(characters_filled_in <= details->characters());
   1828         if (characters_filled_in == details->characters()) {
   1829           return;
   1830         }
   1831       }
   1832     } else {
   1833       QuickCheckDetails::Position* pos =
   1834           details->positions(characters_filled_in);
   1835       RegExpCharacterClass* tree = elm.data.u_char_class;
   1836       ZoneList<CharacterRange>* ranges = tree->ranges();
   1837       if (tree->is_negated()) {
   1838         // A quick check uses multi-character mask and compare.  There is no
   1839         // useful way to incorporate a negative char class into this scheme
   1840         // so we just conservatively create a mask and value that will always
   1841         // succeed.
   1842         pos->mask = 0;
   1843         pos->value = 0;
   1844       } else {
   1845         int first_range = 0;
   1846         while (ranges->at(first_range).from() > char_mask) {
   1847           first_range++;
   1848           if (first_range == ranges->length()) {
   1849             details->set_cannot_match();
   1850             pos->determines_perfectly = false;
   1851             return;
   1852           }
   1853         }
   1854         CharacterRange range = ranges->at(first_range);
   1855         uc16 from = range.from();
   1856         uc16 to = range.to();
   1857         if (to > char_mask) {
   1858           to = char_mask;
   1859         }
   1860         uint32_t differing_bits = (from ^ to);
   1861         // A mask and compare is only perfect if the differing bits form a
   1862         // number like 00011111 with one single block of trailing 1s.
   1863         if ((differing_bits & (differing_bits + 1)) == 0 &&
   1864              from + differing_bits == to) {
   1865           pos->determines_perfectly = true;
   1866         }
   1867         uint32_t common_bits = ~SmearBitsRight(differing_bits);
   1868         uint32_t bits = (from & common_bits);
   1869         for (int i = first_range + 1; i < ranges->length(); i++) {
   1870           CharacterRange range = ranges->at(i);
   1871           uc16 from = range.from();
   1872           uc16 to = range.to();
   1873           if (from > char_mask) continue;
   1874           if (to > char_mask) to = char_mask;
   1875           // Here we are combining more ranges into the mask and compare
   1876           // value.  With each new range the mask becomes more sparse and
   1877           // so the chances of a false positive rise.  A character class
   1878           // with multiple ranges is assumed never to be equivalent to a
   1879           // mask and compare operation.
   1880           pos->determines_perfectly = false;
   1881           uint32_t new_common_bits = (from ^ to);
   1882           new_common_bits = ~SmearBitsRight(new_common_bits);
   1883           common_bits &= new_common_bits;
   1884           bits &= new_common_bits;
   1885           uint32_t differing_bits = (from & common_bits) ^ bits;
   1886           common_bits ^= differing_bits;
   1887           bits &= common_bits;
   1888         }
   1889         pos->mask = common_bits;
   1890         pos->value = bits;
   1891       }
   1892       characters_filled_in++;
   1893       ASSERT(characters_filled_in <= details->characters());
   1894       if (characters_filled_in == details->characters()) {
   1895         return;
   1896       }
   1897     }
   1898   }
   1899   ASSERT(characters_filled_in != details->characters());
   1900   on_success()-> GetQuickCheckDetails(details,
   1901                                       compiler,
   1902                                       characters_filled_in,
   1903                                       true);
   1904 }
   1905 
   1906 
   1907 void QuickCheckDetails::Clear() {
   1908   for (int i = 0; i < characters_; i++) {
   1909     positions_[i].mask = 0;
   1910     positions_[i].value = 0;
   1911     positions_[i].determines_perfectly = false;
   1912   }
   1913   characters_ = 0;
   1914 }
   1915 
   1916 
   1917 void QuickCheckDetails::Advance(int by, bool ascii) {
   1918   ASSERT(by >= 0);
   1919   if (by >= characters_) {
   1920     Clear();
   1921     return;
   1922   }
   1923   for (int i = 0; i < characters_ - by; i++) {
   1924     positions_[i] = positions_[by + i];
   1925   }
   1926   for (int i = characters_ - by; i < characters_; i++) {
   1927     positions_[i].mask = 0;
   1928     positions_[i].value = 0;
   1929     positions_[i].determines_perfectly = false;
   1930   }
   1931   characters_ -= by;
   1932   // We could change mask_ and value_ here but we would never advance unless
   1933   // they had already been used in a check and they won't be used again because
   1934   // it would gain us nothing.  So there's no point.
   1935 }
   1936 
   1937 
   1938 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
   1939   ASSERT(characters_ == other->characters_);
   1940   if (other->cannot_match_) {
   1941     return;
   1942   }
   1943   if (cannot_match_) {
   1944     *this = *other;
   1945     return;
   1946   }
   1947   for (int i = from_index; i < characters_; i++) {
   1948     QuickCheckDetails::Position* pos = positions(i);
   1949     QuickCheckDetails::Position* other_pos = other->positions(i);
   1950     if (pos->mask != other_pos->mask ||
   1951         pos->value != other_pos->value ||
   1952         !other_pos->determines_perfectly) {
   1953       // Our mask-compare operation will be approximate unless we have the
   1954       // exact same operation on both sides of the alternation.
   1955       pos->determines_perfectly = false;
   1956     }
   1957     pos->mask &= other_pos->mask;
   1958     pos->value &= pos->mask;
   1959     other_pos->value &= pos->mask;
   1960     uc16 differing_bits = (pos->value ^ other_pos->value);
   1961     pos->mask &= ~differing_bits;
   1962     pos->value &= pos->mask;
   1963   }
   1964 }
   1965 
   1966 
   1967 class VisitMarker {
   1968  public:
   1969   explicit VisitMarker(NodeInfo* info) : info_(info) {
   1970     ASSERT(!info->visited);
   1971     info->visited = true;
   1972   }
   1973   ~VisitMarker() {
   1974     info_->visited = false;
   1975   }
   1976  private:
   1977   NodeInfo* info_;
   1978 };
   1979 
   1980 
   1981 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1982                                           RegExpCompiler* compiler,
   1983                                           int characters_filled_in,
   1984                                           bool not_at_start) {
   1985   if (body_can_be_zero_length_ || info()->visited) return;
   1986   VisitMarker marker(info());
   1987   return ChoiceNode::GetQuickCheckDetails(details,
   1988                                           compiler,
   1989                                           characters_filled_in,
   1990                                           not_at_start);
   1991 }
   1992 
   1993 
   1994 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
   1995                                       RegExpCompiler* compiler,
   1996                                       int characters_filled_in,
   1997                                       bool not_at_start) {
   1998   not_at_start = (not_at_start || not_at_start_);
   1999   int choice_count = alternatives_->length();
   2000   ASSERT(choice_count > 0);
   2001   alternatives_->at(0).node()->GetQuickCheckDetails(details,
   2002                                                     compiler,
   2003                                                     characters_filled_in,
   2004                                                     not_at_start);
   2005   for (int i = 1; i < choice_count; i++) {
   2006     QuickCheckDetails new_details(details->characters());
   2007     RegExpNode* node = alternatives_->at(i).node();
   2008     node->GetQuickCheckDetails(&new_details, compiler,
   2009                                characters_filled_in,
   2010                                not_at_start);
   2011     // Here we merge the quick match details of the two branches.
   2012     details->Merge(&new_details, characters_filled_in);
   2013   }
   2014 }
   2015 
   2016 
   2017 // Check for [0-9A-Z_a-z].
   2018 static void EmitWordCheck(RegExpMacroAssembler* assembler,
   2019                           Label* word,
   2020                           Label* non_word,
   2021                           bool fall_through_on_word) {
   2022   if (assembler->CheckSpecialCharacterClass(
   2023           fall_through_on_word ? 'w' : 'W',
   2024           fall_through_on_word ? non_word : word)) {
   2025     // Optimized implementation available.
   2026     return;
   2027   }
   2028   assembler->CheckCharacterGT('z', non_word);
   2029   assembler->CheckCharacterLT('0', non_word);
   2030   assembler->CheckCharacterGT('a' - 1, word);
   2031   assembler->CheckCharacterLT('9' + 1, word);
   2032   assembler->CheckCharacterLT('A', non_word);
   2033   assembler->CheckCharacterLT('Z' + 1, word);
   2034   if (fall_through_on_word) {
   2035     assembler->CheckNotCharacter('_', non_word);
   2036   } else {
   2037     assembler->CheckCharacter('_', word);
   2038   }
   2039 }
   2040 
   2041 
   2042 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
   2043 // that matches newline or the start of input).
   2044 static void EmitHat(RegExpCompiler* compiler,
   2045                     RegExpNode* on_success,
   2046                     Trace* trace) {
   2047   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2048   // We will be loading the previous character into the current character
   2049   // register.
   2050   Trace new_trace(*trace);
   2051   new_trace.InvalidateCurrentCharacter();
   2052 
   2053   Label ok;
   2054   if (new_trace.cp_offset() == 0) {
   2055     // The start of input counts as a newline in this context, so skip to
   2056     // ok if we are at the start.
   2057     assembler->CheckAtStart(&ok);
   2058   }
   2059   // We already checked that we are not at the start of input so it must be
   2060   // OK to load the previous character.
   2061   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
   2062                                   new_trace.backtrack(),
   2063                                   false);
   2064   if (!assembler->CheckSpecialCharacterClass('n',
   2065                                              new_trace.backtrack())) {
   2066     // Newline means \n, \r, 0x2028 or 0x2029.
   2067     if (!compiler->ascii()) {
   2068       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
   2069     }
   2070     assembler->CheckCharacter('\n', &ok);
   2071     assembler->CheckNotCharacter('\r', new_trace.backtrack());
   2072   }
   2073   assembler->Bind(&ok);
   2074   on_success->Emit(compiler, &new_trace);
   2075 }
   2076 
   2077 
   2078 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
   2079 // when we know whether the next character must be a word character or not.
   2080 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
   2081                                   RegExpCompiler* compiler,
   2082                                   RegExpNode* on_success,
   2083                                   Trace* trace) {
   2084   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2085   Label done;
   2086 
   2087   Trace new_trace(*trace);
   2088 
   2089   bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
   2090   Label* on_word = expect_word_character ? &done : new_trace.backtrack();
   2091   Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
   2092 
   2093   // Check whether previous character was a word character.
   2094   switch (trace->at_start()) {
   2095     case Trace::TRUE:
   2096       if (expect_word_character) {
   2097         assembler->GoTo(on_non_word);
   2098       }
   2099       break;
   2100     case Trace::UNKNOWN:
   2101       ASSERT_EQ(0, trace->cp_offset());
   2102       assembler->CheckAtStart(on_non_word);
   2103       // Fall through.
   2104     case Trace::FALSE:
   2105       int prev_char_offset = trace->cp_offset() - 1;
   2106       assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
   2107       EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
   2108       // We may or may not have loaded the previous character.
   2109       new_trace.InvalidateCurrentCharacter();
   2110   }
   2111 
   2112   assembler->Bind(&done);
   2113 
   2114   on_success->Emit(compiler, &new_trace);
   2115 }
   2116 
   2117 
   2118 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
   2119 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
   2120                               RegExpCompiler* compiler,
   2121                               RegExpNode* on_success,
   2122                               Trace* trace) {
   2123   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2124   Label before_non_word;
   2125   Label before_word;
   2126   if (trace->characters_preloaded() != 1) {
   2127     assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
   2128   }
   2129   // Fall through on non-word.
   2130   EmitWordCheck(assembler, &before_word, &before_non_word, false);
   2131 
   2132   // We will be loading the previous character into the current character
   2133   // register.
   2134   Trace new_trace(*trace);
   2135   new_trace.InvalidateCurrentCharacter();
   2136 
   2137   Label ok;
   2138   Label* boundary;
   2139   Label* not_boundary;
   2140   if (type == AssertionNode::AT_BOUNDARY) {
   2141     boundary = &ok;
   2142     not_boundary = new_trace.backtrack();
   2143   } else {
   2144     not_boundary = &ok;
   2145     boundary = new_trace.backtrack();
   2146   }
   2147 
   2148   // Next character is not a word character.
   2149   assembler->Bind(&before_non_word);
   2150   if (new_trace.cp_offset() == 0) {
   2151     // The start of input counts as a non-word character, so the question is
   2152     // decided if we are at the start.
   2153     assembler->CheckAtStart(not_boundary);
   2154   }
   2155   // We already checked that we are not at the start of input so it must be
   2156   // OK to load the previous character.
   2157   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2158                                   &ok,  // Unused dummy label in this call.
   2159                                   false);
   2160   // Fall through on non-word.
   2161   EmitWordCheck(assembler, boundary, not_boundary, false);
   2162   assembler->GoTo(not_boundary);
   2163 
   2164   // Next character is a word character.
   2165   assembler->Bind(&before_word);
   2166   if (new_trace.cp_offset() == 0) {
   2167     // The start of input counts as a non-word character, so the question is
   2168     // decided if we are at the start.
   2169     assembler->CheckAtStart(boundary);
   2170   }
   2171   // We already checked that we are not at the start of input so it must be
   2172   // OK to load the previous character.
   2173   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
   2174                                   &ok,  // Unused dummy label in this call.
   2175                                   false);
   2176   bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
   2177   EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
   2178 
   2179   assembler->Bind(&ok);
   2180 
   2181   on_success->Emit(compiler, &new_trace);
   2182 }
   2183 
   2184 
   2185 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
   2186                                          RegExpCompiler* compiler,
   2187                                          int filled_in,
   2188                                          bool not_at_start) {
   2189   if (type_ == AT_START && not_at_start) {
   2190     details->set_cannot_match();
   2191     return;
   2192   }
   2193   return on_success()->GetQuickCheckDetails(details,
   2194                                             compiler,
   2195                                             filled_in,
   2196                                             not_at_start);
   2197 }
   2198 
   2199 
   2200 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2201   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2202   switch (type_) {
   2203     case AT_END: {
   2204       Label ok;
   2205       assembler->CheckPosition(trace->cp_offset(), &ok);
   2206       assembler->GoTo(trace->backtrack());
   2207       assembler->Bind(&ok);
   2208       break;
   2209     }
   2210     case AT_START: {
   2211       if (trace->at_start() == Trace::FALSE) {
   2212         assembler->GoTo(trace->backtrack());
   2213         return;
   2214       }
   2215       if (trace->at_start() == Trace::UNKNOWN) {
   2216         assembler->CheckNotAtStart(trace->backtrack());
   2217         Trace at_start_trace = *trace;
   2218         at_start_trace.set_at_start(true);
   2219         on_success()->Emit(compiler, &at_start_trace);
   2220         return;
   2221       }
   2222     }
   2223     break;
   2224     case AFTER_NEWLINE:
   2225       EmitHat(compiler, on_success(), trace);
   2226       return;
   2227     case AT_BOUNDARY:
   2228     case AT_NON_BOUNDARY: {
   2229       EmitBoundaryCheck(type_, compiler, on_success(), trace);
   2230       return;
   2231     }
   2232     case AFTER_WORD_CHARACTER:
   2233     case AFTER_NONWORD_CHARACTER: {
   2234       EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
   2235     }
   2236   }
   2237   on_success()->Emit(compiler, trace);
   2238 }
   2239 
   2240 
   2241 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
   2242   if (quick_check == NULL) return false;
   2243   if (offset >= quick_check->characters()) return false;
   2244   return quick_check->positions(offset)->determines_perfectly;
   2245 }
   2246 
   2247 
   2248 static void UpdateBoundsCheck(int index, int* checked_up_to) {
   2249   if (index > *checked_up_to) {
   2250     *checked_up_to = index;
   2251   }
   2252 }
   2253 
   2254 
   2255 // We call this repeatedly to generate code for each pass over the text node.
   2256 // The passes are in increasing order of difficulty because we hope one
   2257 // of the first passes will fail in which case we are saved the work of the
   2258 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
   2259 // we will check the '%' in the first pass, the case independent 'a' in the
   2260 // second pass and the character class in the last pass.
   2261 //
   2262 // The passes are done from right to left, so for example to test for /bar/
   2263 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
   2264 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
   2265 // bounds check most of the time.  In the example we only need to check for
   2266 // end-of-input when loading the putative 'r'.
   2267 //
   2268 // A slight complication involves the fact that the first character may already
   2269 // be fetched into a register by the previous node.  In this case we want to
   2270 // do the test for that character first.  We do this in separate passes.  The
   2271 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
   2272 // pass has been performed then subsequent passes will have true in
   2273 // first_element_checked to indicate that that character does not need to be
   2274 // checked again.
   2275 //
   2276 // In addition to all this we are passed a Trace, which can
   2277 // contain an AlternativeGeneration object.  In this AlternativeGeneration
   2278 // object we can see details of any quick check that was already passed in
   2279 // order to get to the code we are now generating.  The quick check can involve
   2280 // loading characters, which means we do not need to recheck the bounds
   2281 // up to the limit the quick check already checked.  In addition the quick
   2282 // check can have involved a mask and compare operation which may simplify
   2283 // or obviate the need for further checks at some character positions.
   2284 void TextNode::TextEmitPass(RegExpCompiler* compiler,
   2285                             TextEmitPassType pass,
   2286                             bool preloaded,
   2287                             Trace* trace,
   2288                             bool first_element_checked,
   2289                             int* checked_up_to) {
   2290   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2291   bool ascii = compiler->ascii();
   2292   Label* backtrack = trace->backtrack();
   2293   QuickCheckDetails* quick_check = trace->quick_check_performed();
   2294   int element_count = elms_->length();
   2295   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
   2296     TextElement elm = elms_->at(i);
   2297     int cp_offset = trace->cp_offset() + elm.cp_offset;
   2298     if (elm.type == TextElement::ATOM) {
   2299       Vector<const uc16> quarks = elm.data.u_atom->data();
   2300       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
   2301         if (first_element_checked && i == 0 && j == 0) continue;
   2302         if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
   2303         EmitCharacterFunction* emit_function = NULL;
   2304         switch (pass) {
   2305           case NON_ASCII_MATCH:
   2306             ASSERT(ascii);
   2307             if (quarks[j] > String::kMaxAsciiCharCode) {
   2308               assembler->GoTo(backtrack);
   2309               return;
   2310             }
   2311             break;
   2312           case NON_LETTER_CHARACTER_MATCH:
   2313             emit_function = &EmitAtomNonLetter;
   2314             break;
   2315           case SIMPLE_CHARACTER_MATCH:
   2316             emit_function = &EmitSimpleCharacter;
   2317             break;
   2318           case CASE_CHARACTER_MATCH:
   2319             emit_function = &EmitAtomLetter;
   2320             break;
   2321           default:
   2322             break;
   2323         }
   2324         if (emit_function != NULL) {
   2325           bool bound_checked = emit_function(compiler,
   2326                                              quarks[j],
   2327                                              backtrack,
   2328                                              cp_offset + j,
   2329                                              *checked_up_to < cp_offset + j,
   2330                                              preloaded);
   2331           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
   2332         }
   2333       }
   2334     } else {
   2335       ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
   2336       if (pass == CHARACTER_CLASS_MATCH) {
   2337         if (first_element_checked && i == 0) continue;
   2338         if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
   2339         RegExpCharacterClass* cc = elm.data.u_char_class;
   2340         EmitCharClass(assembler,
   2341                       cc,
   2342                       ascii,
   2343                       backtrack,
   2344                       cp_offset,
   2345                       *checked_up_to < cp_offset,
   2346                       preloaded);
   2347         UpdateBoundsCheck(cp_offset, checked_up_to);
   2348       }
   2349     }
   2350   }
   2351 }
   2352 
   2353 
   2354 int TextNode::Length() {
   2355   TextElement elm = elms_->last();
   2356   ASSERT(elm.cp_offset >= 0);
   2357   if (elm.type == TextElement::ATOM) {
   2358     return elm.cp_offset + elm.data.u_atom->data().length();
   2359   } else {
   2360     return elm.cp_offset + 1;
   2361   }
   2362 }
   2363 
   2364 
   2365 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
   2366   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
   2367   if (ignore_case) {
   2368     return pass == SIMPLE_CHARACTER_MATCH;
   2369   } else {
   2370     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
   2371   }
   2372 }
   2373 
   2374 
   2375 // This generates the code to match a text node.  A text node can contain
   2376 // straight character sequences (possibly to be matched in a case-independent
   2377 // way) and character classes.  For efficiency we do not do this in a single
   2378 // pass from left to right.  Instead we pass over the text node several times,
   2379 // emitting code for some character positions every time.  See the comment on
   2380 // TextEmitPass for details.
   2381 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2382   LimitResult limit_result = LimitVersions(compiler, trace);
   2383   if (limit_result == DONE) return;
   2384   ASSERT(limit_result == CONTINUE);
   2385 
   2386   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
   2387     compiler->SetRegExpTooBig();
   2388     return;
   2389   }
   2390 
   2391   if (compiler->ascii()) {
   2392     int dummy = 0;
   2393     TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
   2394   }
   2395 
   2396   bool first_elt_done = false;
   2397   int bound_checked_to = trace->cp_offset() - 1;
   2398   bound_checked_to += trace->bound_checked_up_to();
   2399 
   2400   // If a character is preloaded into the current character register then
   2401   // check that now.
   2402   if (trace->characters_preloaded() == 1) {
   2403     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   2404       if (!SkipPass(pass, compiler->ignore_case())) {
   2405         TextEmitPass(compiler,
   2406                      static_cast<TextEmitPassType>(pass),
   2407                      true,
   2408                      trace,
   2409                      false,
   2410                      &bound_checked_to);
   2411       }
   2412     }
   2413     first_elt_done = true;
   2414   }
   2415 
   2416   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
   2417     if (!SkipPass(pass, compiler->ignore_case())) {
   2418       TextEmitPass(compiler,
   2419                    static_cast<TextEmitPassType>(pass),
   2420                    false,
   2421                    trace,
   2422                    first_elt_done,
   2423                    &bound_checked_to);
   2424     }
   2425   }
   2426 
   2427   Trace successor_trace(*trace);
   2428   successor_trace.set_at_start(false);
   2429   successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
   2430   RecursionCheck rc(compiler);
   2431   on_success()->Emit(compiler, &successor_trace);
   2432 }
   2433 
   2434 
   2435 void Trace::InvalidateCurrentCharacter() {
   2436   characters_preloaded_ = 0;
   2437 }
   2438 
   2439 
   2440 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
   2441   ASSERT(by > 0);
   2442   // We don't have an instruction for shifting the current character register
   2443   // down or for using a shifted value for anything so lets just forget that
   2444   // we preloaded any characters into it.
   2445   characters_preloaded_ = 0;
   2446   // Adjust the offsets of the quick check performed information.  This
   2447   // information is used to find out what we already determined about the
   2448   // characters by means of mask and compare.
   2449   quick_check_performed_.Advance(by, compiler->ascii());
   2450   cp_offset_ += by;
   2451   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
   2452     compiler->SetRegExpTooBig();
   2453     cp_offset_ = 0;
   2454   }
   2455   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
   2456 }
   2457 
   2458 
   2459 void TextNode::MakeCaseIndependent(bool is_ascii) {
   2460   int element_count = elms_->length();
   2461   for (int i = 0; i < element_count; i++) {
   2462     TextElement elm = elms_->at(i);
   2463     if (elm.type == TextElement::CHAR_CLASS) {
   2464       RegExpCharacterClass* cc = elm.data.u_char_class;
   2465       // None of the standard character classses is different in the case
   2466       // independent case and it slows us down if we don't know that.
   2467       if (cc->is_standard()) continue;
   2468       ZoneList<CharacterRange>* ranges = cc->ranges();
   2469       int range_count = ranges->length();
   2470       for (int j = 0; j < range_count; j++) {
   2471         ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
   2472       }
   2473     }
   2474   }
   2475 }
   2476 
   2477 
   2478 int TextNode::GreedyLoopTextLength() {
   2479   TextElement elm = elms_->at(elms_->length() - 1);
   2480   if (elm.type == TextElement::CHAR_CLASS) {
   2481     return elm.cp_offset + 1;
   2482   } else {
   2483     return elm.cp_offset + elm.data.u_atom->data().length();
   2484   }
   2485 }
   2486 
   2487 
   2488 // Finds the fixed match length of a sequence of nodes that goes from
   2489 // this alternative and back to this choice node.  If there are variable
   2490 // length nodes or other complications in the way then return a sentinel
   2491 // value indicating that a greedy loop cannot be constructed.
   2492 int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
   2493   int length = 0;
   2494   RegExpNode* node = alternative->node();
   2495   // Later we will generate code for all these text nodes using recursion
   2496   // so we have to limit the max number.
   2497   int recursion_depth = 0;
   2498   while (node != this) {
   2499     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
   2500       return kNodeIsTooComplexForGreedyLoops;
   2501     }
   2502     int node_length = node->GreedyLoopTextLength();
   2503     if (node_length == kNodeIsTooComplexForGreedyLoops) {
   2504       return kNodeIsTooComplexForGreedyLoops;
   2505     }
   2506     length += node_length;
   2507     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
   2508     node = seq_node->on_success();
   2509   }
   2510   return length;
   2511 }
   2512 
   2513 
   2514 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
   2515   ASSERT_EQ(loop_node_, NULL);
   2516   AddAlternative(alt);
   2517   loop_node_ = alt.node();
   2518 }
   2519 
   2520 
   2521 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
   2522   ASSERT_EQ(continue_node_, NULL);
   2523   AddAlternative(alt);
   2524   continue_node_ = alt.node();
   2525 }
   2526 
   2527 
   2528 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2529   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2530   if (trace->stop_node() == this) {
   2531     int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
   2532     ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
   2533     // Update the counter-based backtracking info on the stack.  This is an
   2534     // optimization for greedy loops (see below).
   2535     ASSERT(trace->cp_offset() == text_length);
   2536     macro_assembler->AdvanceCurrentPosition(text_length);
   2537     macro_assembler->GoTo(trace->loop_label());
   2538     return;
   2539   }
   2540   ASSERT(trace->stop_node() == NULL);
   2541   if (!trace->is_trivial()) {
   2542     trace->Flush(compiler, this);
   2543     return;
   2544   }
   2545   ChoiceNode::Emit(compiler, trace);
   2546 }
   2547 
   2548 
   2549 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
   2550   int preload_characters = EatsAtLeast(4, 0);
   2551   if (compiler->macro_assembler()->CanReadUnaligned()) {
   2552     bool ascii = compiler->ascii();
   2553     if (ascii) {
   2554       if (preload_characters > 4) preload_characters = 4;
   2555       // We can't preload 3 characters because there is no machine instruction
   2556       // to do that.  We can't just load 4 because we could be reading
   2557       // beyond the end of the string, which could cause a memory fault.
   2558       if (preload_characters == 3) preload_characters = 2;
   2559     } else {
   2560       if (preload_characters > 2) preload_characters = 2;
   2561     }
   2562   } else {
   2563     if (preload_characters > 1) preload_characters = 1;
   2564   }
   2565   return preload_characters;
   2566 }
   2567 
   2568 
   2569 // This class is used when generating the alternatives in a choice node.  It
   2570 // records the way the alternative is being code generated.
   2571 class AlternativeGeneration: public Malloced {
   2572  public:
   2573   AlternativeGeneration()
   2574       : possible_success(),
   2575         expects_preload(false),
   2576         after(),
   2577         quick_check_details() { }
   2578   Label possible_success;
   2579   bool expects_preload;
   2580   Label after;
   2581   QuickCheckDetails quick_check_details;
   2582 };
   2583 
   2584 
   2585 // Creates a list of AlternativeGenerations.  If the list has a reasonable
   2586 // size then it is on the stack, otherwise the excess is on the heap.
   2587 class AlternativeGenerationList {
   2588  public:
   2589   explicit AlternativeGenerationList(int count)
   2590       : alt_gens_(count) {
   2591     for (int i = 0; i < count && i < kAFew; i++) {
   2592       alt_gens_.Add(a_few_alt_gens_ + i);
   2593     }
   2594     for (int i = kAFew; i < count; i++) {
   2595       alt_gens_.Add(new AlternativeGeneration());
   2596     }
   2597   }
   2598   ~AlternativeGenerationList() {
   2599     for (int i = kAFew; i < alt_gens_.length(); i++) {
   2600       delete alt_gens_[i];
   2601       alt_gens_[i] = NULL;
   2602     }
   2603   }
   2604 
   2605   AlternativeGeneration* at(int i) {
   2606     return alt_gens_[i];
   2607   }
   2608  private:
   2609   static const int kAFew = 10;
   2610   ZoneList<AlternativeGeneration*> alt_gens_;
   2611   AlternativeGeneration a_few_alt_gens_[kAFew];
   2612 };
   2613 
   2614 
   2615 /* Code generation for choice nodes.
   2616  *
   2617  * We generate quick checks that do a mask and compare to eliminate a
   2618  * choice.  If the quick check succeeds then it jumps to the continuation to
   2619  * do slow checks and check subsequent nodes.  If it fails (the common case)
   2620  * it falls through to the next choice.
   2621  *
   2622  * Here is the desired flow graph.  Nodes directly below each other imply
   2623  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
   2624  * 3 doesn't have a quick check so we have to call the slow check.
   2625  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
   2626  * regexp continuation is generated directly after the Sn node, up to the
   2627  * next GoTo if we decide to reuse some already generated code.  Some
   2628  * nodes expect preload_characters to be preloaded into the current
   2629  * character register.  R nodes do this preloading.  Vertices are marked
   2630  * F for failures and S for success (possible success in the case of quick
   2631  * nodes).  L, V, < and > are used as arrow heads.
   2632  *
   2633  * ----------> R
   2634  *             |
   2635  *             V
   2636  *            Q1 -----> S1
   2637  *             |   S   /
   2638  *            F|      /
   2639  *             |    F/
   2640  *             |    /
   2641  *             |   R
   2642  *             |  /
   2643  *             V L
   2644  *            Q2 -----> S2
   2645  *             |   S   /
   2646  *            F|      /
   2647  *             |    F/
   2648  *             |    /
   2649  *             |   R
   2650  *             |  /
   2651  *             V L
   2652  *            S3
   2653  *             |
   2654  *            F|
   2655  *             |
   2656  *             R
   2657  *             |
   2658  * backtrack   V
   2659  * <----------Q4
   2660  *   \    F    |
   2661  *    \        |S
   2662  *     \   F   V
   2663  *      \-----S4
   2664  *
   2665  * For greedy loops we reverse our expectation and expect to match rather
   2666  * than fail. Therefore we want the loop code to look like this (U is the
   2667  * unwind code that steps back in the greedy loop).  The following alternatives
   2668  * look the same as above.
   2669  *              _____
   2670  *             /     \
   2671  *             V     |
   2672  * ----------> S1    |
   2673  *            /|     |
   2674  *           / |S    |
   2675  *         F/  \_____/
   2676  *         /
   2677  *        |<-----------
   2678  *        |            \
   2679  *        V             \
   2680  *        Q2 ---> S2     \
   2681  *        |  S   /       |
   2682  *       F|     /        |
   2683  *        |   F/         |
   2684  *        |   /          |
   2685  *        |  R           |
   2686  *        | /            |
   2687  *   F    VL             |
   2688  * <------U              |
   2689  * back   |S             |
   2690  *        \______________/
   2691  */
   2692 
   2693 
   2694 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2695   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2696   int choice_count = alternatives_->length();
   2697 #ifdef DEBUG
   2698   for (int i = 0; i < choice_count - 1; i++) {
   2699     GuardedAlternative alternative = alternatives_->at(i);
   2700     ZoneList<Guard*>* guards = alternative.guards();
   2701     int guard_count = (guards == NULL) ? 0 : guards->length();
   2702     for (int j = 0; j < guard_count; j++) {
   2703       ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
   2704     }
   2705   }
   2706 #endif
   2707 
   2708   LimitResult limit_result = LimitVersions(compiler, trace);
   2709   if (limit_result == DONE) return;
   2710   ASSERT(limit_result == CONTINUE);
   2711 
   2712   int new_flush_budget = trace->flush_budget() / choice_count;
   2713   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
   2714     trace->Flush(compiler, this);
   2715     return;
   2716   }
   2717 
   2718   RecursionCheck rc(compiler);
   2719 
   2720   Trace* current_trace = trace;
   2721 
   2722   int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
   2723   bool greedy_loop = false;
   2724   Label greedy_loop_label;
   2725   Trace counter_backtrack_trace;
   2726   counter_backtrack_trace.set_backtrack(&greedy_loop_label);
   2727   if (not_at_start()) counter_backtrack_trace.set_at_start(false);
   2728 
   2729   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
   2730     // Here we have special handling for greedy loops containing only text nodes
   2731     // and other simple nodes.  These are handled by pushing the current
   2732     // position on the stack and then incrementing the current position each
   2733     // time around the switch.  On backtrack we decrement the current position
   2734     // and check it against the pushed value.  This avoids pushing backtrack
   2735     // information for each iteration of the loop, which could take up a lot of
   2736     // space.
   2737     greedy_loop = true;
   2738     ASSERT(trace->stop_node() == NULL);
   2739     macro_assembler->PushCurrentPosition();
   2740     current_trace = &counter_backtrack_trace;
   2741     Label greedy_match_failed;
   2742     Trace greedy_match_trace;
   2743     if (not_at_start()) greedy_match_trace.set_at_start(false);
   2744     greedy_match_trace.set_backtrack(&greedy_match_failed);
   2745     Label loop_label;
   2746     macro_assembler->Bind(&loop_label);
   2747     greedy_match_trace.set_stop_node(this);
   2748     greedy_match_trace.set_loop_label(&loop_label);
   2749     alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
   2750     macro_assembler->Bind(&greedy_match_failed);
   2751   }
   2752 
   2753   Label second_choice;  // For use in greedy matches.
   2754   macro_assembler->Bind(&second_choice);
   2755 
   2756   int first_normal_choice = greedy_loop ? 1 : 0;
   2757 
   2758   int preload_characters = CalculatePreloadCharacters(compiler);
   2759   bool preload_is_current =
   2760       (current_trace->characters_preloaded() == preload_characters);
   2761   bool preload_has_checked_bounds = preload_is_current;
   2762 
   2763   AlternativeGenerationList alt_gens(choice_count);
   2764 
   2765   // For now we just call all choices one after the other.  The idea ultimately
   2766   // is to use the Dispatch table to try only the relevant ones.
   2767   for (int i = first_normal_choice; i < choice_count; i++) {
   2768     GuardedAlternative alternative = alternatives_->at(i);
   2769     AlternativeGeneration* alt_gen = alt_gens.at(i);
   2770     alt_gen->quick_check_details.set_characters(preload_characters);
   2771     ZoneList<Guard*>* guards = alternative.guards();
   2772     int guard_count = (guards == NULL) ? 0 : guards->length();
   2773     Trace new_trace(*current_trace);
   2774     new_trace.set_characters_preloaded(preload_is_current ?
   2775                                          preload_characters :
   2776                                          0);
   2777     if (preload_has_checked_bounds) {
   2778       new_trace.set_bound_checked_up_to(preload_characters);
   2779     }
   2780     new_trace.quick_check_performed()->Clear();
   2781     if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
   2782     alt_gen->expects_preload = preload_is_current;
   2783     bool generate_full_check_inline = false;
   2784     if (FLAG_regexp_optimization &&
   2785         try_to_emit_quick_check_for_alternative(i) &&
   2786         alternative.node()->EmitQuickCheck(compiler,
   2787                                            &new_trace,
   2788                                            preload_has_checked_bounds,
   2789                                            &alt_gen->possible_success,
   2790                                            &alt_gen->quick_check_details,
   2791                                            i < choice_count - 1)) {
   2792       // Quick check was generated for this choice.
   2793       preload_is_current = true;
   2794       preload_has_checked_bounds = true;
   2795       // On the last choice in the ChoiceNode we generated the quick
   2796       // check to fall through on possible success.  So now we need to
   2797       // generate the full check inline.
   2798       if (i == choice_count - 1) {
   2799         macro_assembler->Bind(&alt_gen->possible_success);
   2800         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   2801         new_trace.set_characters_preloaded(preload_characters);
   2802         new_trace.set_bound_checked_up_to(preload_characters);
   2803         generate_full_check_inline = true;
   2804       }
   2805     } else if (alt_gen->quick_check_details.cannot_match()) {
   2806       if (i == choice_count - 1 && !greedy_loop) {
   2807         macro_assembler->GoTo(trace->backtrack());
   2808       }
   2809       continue;
   2810     } else {
   2811       // No quick check was generated.  Put the full code here.
   2812       // If this is not the first choice then there could be slow checks from
   2813       // previous cases that go here when they fail.  There's no reason to
   2814       // insist that they preload characters since the slow check we are about
   2815       // to generate probably can't use it.
   2816       if (i != first_normal_choice) {
   2817         alt_gen->expects_preload = false;
   2818         new_trace.InvalidateCurrentCharacter();
   2819       }
   2820       if (i < choice_count - 1) {
   2821         new_trace.set_backtrack(&alt_gen->after);
   2822       }
   2823       generate_full_check_inline = true;
   2824     }
   2825     if (generate_full_check_inline) {
   2826       if (new_trace.actions() != NULL) {
   2827         new_trace.set_flush_budget(new_flush_budget);
   2828       }
   2829       for (int j = 0; j < guard_count; j++) {
   2830         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
   2831       }
   2832       alternative.node()->Emit(compiler, &new_trace);
   2833       preload_is_current = false;
   2834     }
   2835     macro_assembler->Bind(&alt_gen->after);
   2836   }
   2837   if (greedy_loop) {
   2838     macro_assembler->Bind(&greedy_loop_label);
   2839     // If we have unwound to the bottom then backtrack.
   2840     macro_assembler->CheckGreedyLoop(trace->backtrack());
   2841     // Otherwise try the second priority at an earlier position.
   2842     macro_assembler->AdvanceCurrentPosition(-text_length);
   2843     macro_assembler->GoTo(&second_choice);
   2844   }
   2845 
   2846   // At this point we need to generate slow checks for the alternatives where
   2847   // the quick check was inlined.  We can recognize these because the associated
   2848   // label was bound.
   2849   for (int i = first_normal_choice; i < choice_count - 1; i++) {
   2850     AlternativeGeneration* alt_gen = alt_gens.at(i);
   2851     Trace new_trace(*current_trace);
   2852     // If there are actions to be flushed we have to limit how many times
   2853     // they are flushed.  Take the budget of the parent trace and distribute
   2854     // it fairly amongst the children.
   2855     if (new_trace.actions() != NULL) {
   2856       new_trace.set_flush_budget(new_flush_budget);
   2857     }
   2858     EmitOutOfLineContinuation(compiler,
   2859                               &new_trace,
   2860                               alternatives_->at(i),
   2861                               alt_gen,
   2862                               preload_characters,
   2863                               alt_gens.at(i + 1)->expects_preload);
   2864   }
   2865 }
   2866 
   2867 
   2868 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
   2869                                            Trace* trace,
   2870                                            GuardedAlternative alternative,
   2871                                            AlternativeGeneration* alt_gen,
   2872                                            int preload_characters,
   2873                                            bool next_expects_preload) {
   2874   if (!alt_gen->possible_success.is_linked()) return;
   2875 
   2876   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   2877   macro_assembler->Bind(&alt_gen->possible_success);
   2878   Trace out_of_line_trace(*trace);
   2879   out_of_line_trace.set_characters_preloaded(preload_characters);
   2880   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
   2881   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
   2882   ZoneList<Guard*>* guards = alternative.guards();
   2883   int guard_count = (guards == NULL) ? 0 : guards->length();
   2884   if (next_expects_preload) {
   2885     Label reload_current_char;
   2886     out_of_line_trace.set_backtrack(&reload_current_char);
   2887     for (int j = 0; j < guard_count; j++) {
   2888       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   2889     }
   2890     alternative.node()->Emit(compiler, &out_of_line_trace);
   2891     macro_assembler->Bind(&reload_current_char);
   2892     // Reload the current character, since the next quick check expects that.
   2893     // We don't need to check bounds here because we only get into this
   2894     // code through a quick check which already did the checked load.
   2895     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
   2896                                           NULL,
   2897                                           false,
   2898                                           preload_characters);
   2899     macro_assembler->GoTo(&(alt_gen->after));
   2900   } else {
   2901     out_of_line_trace.set_backtrack(&(alt_gen->after));
   2902     for (int j = 0; j < guard_count; j++) {
   2903       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
   2904     }
   2905     alternative.node()->Emit(compiler, &out_of_line_trace);
   2906   }
   2907 }
   2908 
   2909 
   2910 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   2911   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   2912   LimitResult limit_result = LimitVersions(compiler, trace);
   2913   if (limit_result == DONE) return;
   2914   ASSERT(limit_result == CONTINUE);
   2915 
   2916   RecursionCheck rc(compiler);
   2917 
   2918   switch (type_) {
   2919     case STORE_POSITION: {
   2920       Trace::DeferredCapture
   2921           new_capture(data_.u_position_register.reg,
   2922                       data_.u_position_register.is_capture,
   2923                       trace);
   2924       Trace new_trace = *trace;
   2925       new_trace.add_action(&new_capture);
   2926       on_success()->Emit(compiler, &new_trace);
   2927       break;
   2928     }
   2929     case INCREMENT_REGISTER: {
   2930       Trace::DeferredIncrementRegister
   2931           new_increment(data_.u_increment_register.reg);
   2932       Trace new_trace = *trace;
   2933       new_trace.add_action(&new_increment);
   2934       on_success()->Emit(compiler, &new_trace);
   2935       break;
   2936     }
   2937     case SET_REGISTER: {
   2938       Trace::DeferredSetRegister
   2939           new_set(data_.u_store_register.reg, data_.u_store_register.value);
   2940       Trace new_trace = *trace;
   2941       new_trace.add_action(&new_set);
   2942       on_success()->Emit(compiler, &new_trace);
   2943       break;
   2944     }
   2945     case CLEAR_CAPTURES: {
   2946       Trace::DeferredClearCaptures
   2947         new_capture(Interval(data_.u_clear_captures.range_from,
   2948                              data_.u_clear_captures.range_to));
   2949       Trace new_trace = *trace;
   2950       new_trace.add_action(&new_capture);
   2951       on_success()->Emit(compiler, &new_trace);
   2952       break;
   2953     }
   2954     case BEGIN_SUBMATCH:
   2955       if (!trace->is_trivial()) {
   2956         trace->Flush(compiler, this);
   2957       } else {
   2958         assembler->WriteCurrentPositionToRegister(
   2959             data_.u_submatch.current_position_register, 0);
   2960         assembler->WriteStackPointerToRegister(
   2961             data_.u_submatch.stack_pointer_register);
   2962         on_success()->Emit(compiler, trace);
   2963       }
   2964       break;
   2965     case EMPTY_MATCH_CHECK: {
   2966       int start_pos_reg = data_.u_empty_match_check.start_register;
   2967       int stored_pos = 0;
   2968       int rep_reg = data_.u_empty_match_check.repetition_register;
   2969       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
   2970       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
   2971       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
   2972         // If we know we haven't advanced and there is no minimum we
   2973         // can just backtrack immediately.
   2974         assembler->GoTo(trace->backtrack());
   2975       } else if (know_dist && stored_pos < trace->cp_offset()) {
   2976         // If we know we've advanced we can generate the continuation
   2977         // immediately.
   2978         on_success()->Emit(compiler, trace);
   2979       } else if (!trace->is_trivial()) {
   2980         trace->Flush(compiler, this);
   2981       } else {
   2982         Label skip_empty_check;
   2983         // If we have a minimum number of repetitions we check the current
   2984         // number first and skip the empty check if it's not enough.
   2985         if (has_minimum) {
   2986           int limit = data_.u_empty_match_check.repetition_limit;
   2987           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
   2988         }
   2989         // If the match is empty we bail out, otherwise we fall through
   2990         // to the on-success continuation.
   2991         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
   2992                                    trace->backtrack());
   2993         assembler->Bind(&skip_empty_check);
   2994         on_success()->Emit(compiler, trace);
   2995       }
   2996       break;
   2997     }
   2998     case POSITIVE_SUBMATCH_SUCCESS: {
   2999       if (!trace->is_trivial()) {
   3000         trace->Flush(compiler, this);
   3001         return;
   3002       }
   3003       assembler->ReadCurrentPositionFromRegister(
   3004           data_.u_submatch.current_position_register);
   3005       assembler->ReadStackPointerFromRegister(
   3006           data_.u_submatch.stack_pointer_register);
   3007       int clear_register_count = data_.u_submatch.clear_register_count;
   3008       if (clear_register_count == 0) {
   3009         on_success()->Emit(compiler, trace);
   3010         return;
   3011       }
   3012       int clear_registers_from = data_.u_submatch.clear_register_from;
   3013       Label clear_registers_backtrack;
   3014       Trace new_trace = *trace;
   3015       new_trace.set_backtrack(&clear_registers_backtrack);
   3016       on_success()->Emit(compiler, &new_trace);
   3017 
   3018       assembler->Bind(&clear_registers_backtrack);
   3019       int clear_registers_to = clear_registers_from + clear_register_count - 1;
   3020       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
   3021 
   3022       ASSERT(trace->backtrack() == NULL);
   3023       assembler->Backtrack();
   3024       return;
   3025     }
   3026     default:
   3027       UNREACHABLE();
   3028   }
   3029 }
   3030 
   3031 
   3032 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   3033   RegExpMacroAssembler* assembler = compiler->macro_assembler();
   3034   if (!trace->is_trivial()) {
   3035     trace->Flush(compiler, this);
   3036     return;
   3037   }
   3038 
   3039   LimitResult limit_result = LimitVersions(compiler, trace);
   3040   if (limit_result == DONE) return;
   3041   ASSERT(limit_result == CONTINUE);
   3042 
   3043   RecursionCheck rc(compiler);
   3044 
   3045   ASSERT_EQ(start_reg_ + 1, end_reg_);
   3046   if (compiler->ignore_case()) {
   3047     assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
   3048                                                trace->backtrack());
   3049   } else {
   3050     assembler->CheckNotBackReference(start_reg_, trace->backtrack());
   3051   }
   3052   on_success()->Emit(compiler, trace);
   3053 }
   3054 
   3055 
   3056 // -------------------------------------------------------------------
   3057 // Dot/dotty output
   3058 
   3059 
   3060 #ifdef DEBUG
   3061 
   3062 
   3063 class DotPrinter: public NodeVisitor {
   3064  public:
   3065   explicit DotPrinter(bool ignore_case)
   3066       : ignore_case_(ignore_case),
   3067         stream_(&alloc_) { }
   3068   void PrintNode(const char* label, RegExpNode* node);
   3069   void Visit(RegExpNode* node);
   3070   void PrintAttributes(RegExpNode* from);
   3071   StringStream* stream() { return &stream_; }
   3072   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
   3073 #define DECLARE_VISIT(Type)                                          \
   3074   virtual void Visit##Type(Type##Node* that);
   3075 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   3076 #undef DECLARE_VISIT
   3077  private:
   3078   bool ignore_case_;
   3079   HeapStringAllocator alloc_;
   3080   StringStream stream_;
   3081 };
   3082 
   3083 
   3084 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
   3085   stream()->Add("digraph G {\n  graph [label=\"");
   3086   for (int i = 0; label[i]; i++) {
   3087     switch (label[i]) {
   3088       case '\\':
   3089         stream()->Add("\\\\");
   3090         break;
   3091       case '"':
   3092         stream()->Add("\"");
   3093         break;
   3094       default:
   3095         stream()->Put(label[i]);
   3096         break;
   3097     }
   3098   }
   3099   stream()->Add("\"];\n");
   3100   Visit(node);
   3101   stream()->Add("}\n");
   3102   printf("%s", *(stream()->ToCString()));
   3103 }
   3104 
   3105 
   3106 void DotPrinter::Visit(RegExpNode* node) {
   3107   if (node->info()->visited) return;
   3108   node->info()->visited = true;
   3109   node->Accept(this);
   3110 }
   3111 
   3112 
   3113 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
   3114   stream()->Add("  n%p -> n%p [style=dotted];\n", from, on_failure);
   3115   Visit(on_failure);
   3116 }
   3117 
   3118 
   3119 class TableEntryBodyPrinter {
   3120  public:
   3121   TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
   3122       : stream_(stream), choice_(choice) { }
   3123   void Call(uc16 from, DispatchTable::Entry entry) {
   3124     OutSet* out_set = entry.out_set();
   3125     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   3126       if (out_set->Get(i)) {
   3127         stream()->Add("    n%p:s%io%i -> n%p;\n",
   3128                       choice(),
   3129                       from,
   3130                       i,
   3131                       choice()->alternatives()->at(i).node());
   3132       }
   3133     }
   3134   }
   3135  private:
   3136   StringStream* stream() { return stream_; }
   3137   ChoiceNode* choice() { return choice_; }
   3138   StringStream* stream_;
   3139   ChoiceNode* choice_;
   3140 };
   3141 
   3142 
   3143 class TableEntryHeaderPrinter {
   3144  public:
   3145   explicit TableEntryHeaderPrinter(StringStream* stream)
   3146       : first_(true), stream_(stream) { }
   3147   void Call(uc16 from, DispatchTable::Entry entry) {
   3148     if (first_) {
   3149       first_ = false;
   3150     } else {
   3151       stream()->Add("|");
   3152     }
   3153     stream()->Add("{\\%k-\\%k|{", from, entry.to());
   3154     OutSet* out_set = entry.out_set();
   3155     int priority = 0;
   3156     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   3157       if (out_set->Get(i)) {
   3158         if (priority > 0) stream()->Add("|");
   3159         stream()->Add("<s%io%i> %i", from, i, priority);
   3160         priority++;
   3161       }
   3162     }
   3163     stream()->Add("}}");
   3164   }
   3165  private:
   3166   bool first_;
   3167   StringStream* stream() { return stream_; }
   3168   StringStream* stream_;
   3169 };
   3170 
   3171 
   3172 class AttributePrinter {
   3173  public:
   3174   explicit AttributePrinter(DotPrinter* out)
   3175       : out_(out), first_(true) { }
   3176   void PrintSeparator() {
   3177     if (first_) {
   3178       first_ = false;
   3179     } else {
   3180       out_->stream()->Add("|");
   3181     }
   3182   }
   3183   void PrintBit(const char* name, bool value) {
   3184     if (!value) return;
   3185     PrintSeparator();
   3186     out_->stream()->Add("{%s}", name);
   3187   }
   3188   void PrintPositive(const char* name, int value) {
   3189     if (value < 0) return;
   3190     PrintSeparator();
   3191     out_->stream()->Add("{%s|%x}", name, value);
   3192   }
   3193  private:
   3194   DotPrinter* out_;
   3195   bool first_;
   3196 };
   3197 
   3198 
   3199 void DotPrinter::PrintAttributes(RegExpNode* that) {
   3200   stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
   3201                 "margin=0.1, fontsize=10, label=\"{",
   3202                 that);
   3203   AttributePrinter printer(this);
   3204   NodeInfo* info = that->info();
   3205   printer.PrintBit("NI", info->follows_newline_interest);
   3206   printer.PrintBit("WI", info->follows_word_interest);
   3207   printer.PrintBit("SI", info->follows_start_interest);
   3208   Label* label = that->label();
   3209   if (label->is_bound())
   3210     printer.PrintPositive("@", label->pos());
   3211   stream()->Add("}\"];\n");
   3212   stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
   3213                 "arrowhead=none];\n", that, that);
   3214 }
   3215 
   3216 
   3217 static const bool kPrintDispatchTable = false;
   3218 void DotPrinter::VisitChoice(ChoiceNode* that) {
   3219   if (kPrintDispatchTable) {
   3220     stream()->Add("  n%p [shape=Mrecord, label=\"", that);
   3221     TableEntryHeaderPrinter header_printer(stream());
   3222     that->GetTable(ignore_case_)->ForEach(&header_printer);
   3223     stream()->Add("\"]\n", that);
   3224     PrintAttributes(that);
   3225     TableEntryBodyPrinter body_printer(stream(), that);
   3226     that->GetTable(ignore_case_)->ForEach(&body_printer);
   3227   } else {
   3228     stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
   3229     for (int i = 0; i < that->alternatives()->length(); i++) {
   3230       GuardedAlternative alt = that->alternatives()->at(i);
   3231       stream()->Add("  n%p -> n%p;\n", that, alt.node());
   3232     }
   3233   }
   3234   for (int i = 0; i < that->alternatives()->length(); i++) {
   3235     GuardedAlternative alt = that->alternatives()->at(i);
   3236     alt.node()->Accept(this);
   3237   }
   3238 }
   3239 
   3240 
   3241 void DotPrinter::VisitText(TextNode* that) {
   3242   stream()->Add("  n%p [label=\"", that);
   3243   for (int i = 0; i < that->elements()->length(); i++) {
   3244     if (i > 0) stream()->Add(" ");
   3245     TextElement elm = that->elements()->at(i);
   3246     switch (elm.type) {
   3247       case TextElement::ATOM: {
   3248         stream()->Add("'%w'", elm.data.u_atom->data());
   3249         break;
   3250       }
   3251       case TextElement::CHAR_CLASS: {
   3252         RegExpCharacterClass* node = elm.data.u_char_class;
   3253         stream()->Add("[");
   3254         if (node->is_negated())
   3255           stream()->Add("^");
   3256         for (int j = 0; j < node->ranges()->length(); j++) {
   3257           CharacterRange range = node->ranges()->at(j);
   3258           stream()->Add("%k-%k", range.from(), range.to());
   3259         }
   3260         stream()->Add("]");
   3261         break;
   3262       }
   3263       default:
   3264         UNREACHABLE();
   3265     }
   3266   }
   3267   stream()->Add("\", shape=box, peripheries=2];\n");
   3268   PrintAttributes(that);
   3269   stream()->Add("  n%p -> n%p;\n", that, that->on_success());
   3270   Visit(that->on_success());
   3271 }
   3272 
   3273 
   3274 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
   3275   stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
   3276                 that,
   3277                 that->start_register(),
   3278                 that->end_register());
   3279   PrintAttributes(that);
   3280   stream()->Add("  n%p -> n%p;\n", that, that->on_success());
   3281   Visit(that->on_success());
   3282 }
   3283 
   3284 
   3285 void DotPrinter::VisitEnd(EndNode* that) {
   3286   stream()->Add("  n%p [style=bold, shape=point];\n", that);
   3287   PrintAttributes(that);
   3288 }
   3289 
   3290 
   3291 void DotPrinter::VisitAssertion(AssertionNode* that) {
   3292   stream()->Add("  n%p [", that);
   3293   switch (that->type()) {
   3294     case AssertionNode::AT_END:
   3295       stream()->Add("label=\"$\", shape=septagon");
   3296       break;
   3297     case AssertionNode::AT_START:
   3298       stream()->Add("label=\"^\", shape=septagon");
   3299       break;
   3300     case AssertionNode::AT_BOUNDARY:
   3301       stream()->Add("label=\"\\b\", shape=septagon");
   3302       break;
   3303     case AssertionNode::AT_NON_BOUNDARY:
   3304       stream()->Add("label=\"\\B\", shape=septagon");
   3305       break;
   3306     case AssertionNode::AFTER_NEWLINE:
   3307       stream()->Add("label=\"(?<=\\n)\", shape=septagon");
   3308       break;
   3309     case AssertionNode::AFTER_WORD_CHARACTER:
   3310       stream()->Add("label=\"(?<=\\w)\", shape=septagon");
   3311       break;
   3312     case AssertionNode::AFTER_NONWORD_CHARACTER:
   3313       stream()->Add("label=\"(?<=\\W)\", shape=septagon");
   3314       break;
   3315   }
   3316   stream()->Add("];\n");
   3317   PrintAttributes(that);
   3318   RegExpNode* successor = that->on_success();
   3319   stream()->Add("  n%p -> n%p;\n", that, successor);
   3320   Visit(successor);
   3321 }
   3322 
   3323 
   3324 void DotPrinter::VisitAction(ActionNode* that) {
   3325   stream()->Add("  n%p [", that);
   3326   switch (that->type_) {
   3327     case ActionNode::SET_REGISTER:
   3328       stream()->Add("label=\"$%i:=%i\", shape=octagon",
   3329                     that->data_.u_store_register.reg,
   3330                     that->data_.u_store_register.value);
   3331       break;
   3332     case ActionNode::INCREMENT_REGISTER:
   3333       stream()->Add("label=\"$%i++\", shape=octagon",
   3334                     that->data_.u_increment_register.reg);
   3335       break;
   3336     case ActionNode::STORE_POSITION:
   3337       stream()->Add("label=\"$%i:=$pos\", shape=octagon",
   3338                     that->data_.u_position_register.reg);
   3339       break;
   3340     case ActionNode::BEGIN_SUBMATCH:
   3341       stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
   3342                     that->data_.u_submatch.current_position_register);
   3343       break;
   3344     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
   3345       stream()->Add("label=\"escape\", shape=septagon");
   3346       break;
   3347     case ActionNode::EMPTY_MATCH_CHECK:
   3348       stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
   3349                     that->data_.u_empty_match_check.start_register,
   3350                     that->data_.u_empty_match_check.repetition_register,
   3351                     that->data_.u_empty_match_check.repetition_limit);
   3352       break;
   3353     case ActionNode::CLEAR_CAPTURES: {
   3354       stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
   3355                     that->data_.u_clear_captures.range_from,
   3356                     that->data_.u_clear_captures.range_to);
   3357       break;
   3358     }
   3359   }
   3360   stream()->Add("];\n");
   3361   PrintAttributes(that);
   3362   RegExpNode* successor = that->on_success();
   3363   stream()->Add("  n%p -> n%p;\n", that, successor);
   3364   Visit(successor);
   3365 }
   3366 
   3367 
   3368 class DispatchTableDumper {
   3369  public:
   3370   explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
   3371   void Call(uc16 key, DispatchTable::Entry entry);
   3372   StringStream* stream() { return stream_; }
   3373  private:
   3374   StringStream* stream_;
   3375 };
   3376 
   3377 
   3378 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
   3379   stream()->Add("[%k-%k]: {", key, entry.to());
   3380   OutSet* set = entry.out_set();
   3381   bool first = true;
   3382   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
   3383     if (set->Get(i)) {
   3384       if (first) {
   3385         first = false;
   3386       } else {
   3387         stream()->Add(", ");
   3388       }
   3389       stream()->Add("%i", i);
   3390     }
   3391   }
   3392   stream()->Add("}\n");
   3393 }
   3394 
   3395 
   3396 void DispatchTable::Dump() {
   3397   HeapStringAllocator alloc;
   3398   StringStream stream(&alloc);
   3399   DispatchTableDumper dumper(&stream);
   3400   tree()->ForEach(&dumper);
   3401   OS::PrintError("%s", *stream.ToCString());
   3402 }
   3403 
   3404 
   3405 void RegExpEngine::DotPrint(const char* label,
   3406                             RegExpNode* node,
   3407                             bool ignore_case) {
   3408   DotPrinter printer(ignore_case);
   3409   printer.PrintNode(label, node);
   3410 }
   3411 
   3412 
   3413 #endif  // DEBUG
   3414 
   3415 
   3416 // -------------------------------------------------------------------
   3417 // Tree to graph conversion
   3418 
   3419 static const int kSpaceRangeCount = 20;
   3420 static const int kSpaceRangeAsciiCount = 4;
   3421 static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
   3422     0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
   3423     0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
   3424 
   3425 static const int kWordRangeCount = 8;
   3426 static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
   3427     '_', 'a', 'z' };
   3428 
   3429 static const int kDigitRangeCount = 2;
   3430 static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
   3431 
   3432 static const int kLineTerminatorRangeCount = 6;
   3433 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
   3434     0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
   3435 
   3436 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
   3437                                RegExpNode* on_success) {
   3438   ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
   3439   elms->Add(TextElement::Atom(this));
   3440   return new TextNode(elms, on_success);
   3441 }
   3442 
   3443 
   3444 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
   3445                                RegExpNode* on_success) {
   3446   return new TextNode(elements(), on_success);
   3447 }
   3448 
   3449 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
   3450                                  const uc16* special_class,
   3451                                  int length) {
   3452   ASSERT(ranges->length() != 0);
   3453   ASSERT(length != 0);
   3454   ASSERT(special_class[0] != 0);
   3455   if (ranges->length() != (length >> 1) + 1) {
   3456     return false;
   3457   }
   3458   CharacterRange range = ranges->at(0);
   3459   if (range.from() != 0) {
   3460     return false;
   3461   }
   3462   for (int i = 0; i < length; i += 2) {
   3463     if (special_class[i] != (range.to() + 1)) {
   3464       return false;
   3465     }
   3466     range = ranges->at((i >> 1) + 1);
   3467     if (special_class[i+1] != range.from() - 1) {
   3468       return false;
   3469     }
   3470   }
   3471   if (range.to() != 0xffff) {
   3472     return false;
   3473   }
   3474   return true;
   3475 }
   3476 
   3477 
   3478 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
   3479                           const uc16* special_class,
   3480                           int length) {
   3481   if (ranges->length() * 2 != length) {
   3482     return false;
   3483   }
   3484   for (int i = 0; i < length; i += 2) {
   3485     CharacterRange range = ranges->at(i >> 1);
   3486     if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
   3487       return false;
   3488     }
   3489   }
   3490   return true;
   3491 }
   3492 
   3493 
   3494 bool RegExpCharacterClass::is_standard() {
   3495   // TODO(lrn): Remove need for this function, by not throwing away information
   3496   // along the way.
   3497   if (is_negated_) {
   3498     return false;
   3499   }
   3500   if (set_.is_standard()) {
   3501     return true;
   3502   }
   3503   if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
   3504     set_.set_standard_set_type('s');
   3505     return true;
   3506   }
   3507   if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
   3508     set_.set_standard_set_type('S');
   3509     return true;
   3510   }
   3511   if (CompareInverseRanges(set_.ranges(),
   3512                            kLineTerminatorRanges,
   3513                            kLineTerminatorRangeCount)) {
   3514     set_.set_standard_set_type('.');
   3515     return true;
   3516   }
   3517   if (CompareRanges(set_.ranges(),
   3518                     kLineTerminatorRanges,
   3519                     kLineTerminatorRangeCount)) {
   3520     set_.set_standard_set_type('n');
   3521     return true;
   3522   }
   3523   if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
   3524     set_.set_standard_set_type('w');
   3525     return true;
   3526   }
   3527   if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
   3528     set_.set_standard_set_type('W');
   3529     return true;
   3530   }
   3531   return false;
   3532 }
   3533 
   3534 
   3535 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
   3536                                          RegExpNode* on_success) {
   3537   return new TextNode(this, on_success);
   3538 }
   3539 
   3540 
   3541 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
   3542                                       RegExpNode* on_success) {
   3543   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   3544   int length = alternatives->length();
   3545   ChoiceNode* result = new ChoiceNode(length);
   3546   for (int i = 0; i < length; i++) {
   3547     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
   3548                                                                on_success));
   3549     result->AddAlternative(alternative);
   3550   }
   3551   return result;
   3552 }
   3553 
   3554 
   3555 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
   3556                                      RegExpNode* on_success) {
   3557   return ToNode(min(),
   3558                 max(),
   3559                 is_greedy(),
   3560                 body(),
   3561                 compiler,
   3562                 on_success);
   3563 }
   3564 
   3565 
   3566 RegExpNode* RegExpQuantifier::ToNode(int min,
   3567                                      int max,
   3568                                      bool is_greedy,
   3569                                      RegExpTree* body,
   3570                                      RegExpCompiler* compiler,
   3571                                      RegExpNode* on_success,
   3572                                      bool not_at_start) {
   3573   // x{f, t} becomes this:
   3574   //
   3575   //             (r++)<-.
   3576   //               |     `
   3577   //               |     (x)
   3578   //               v     ^
   3579   //      (r=0)-->(?)---/ [if r < t]
   3580   //               |
   3581   //   [if r >= f] \----> ...
   3582   //
   3583 
   3584   // 15.10.2.5 RepeatMatcher algorithm.
   3585   // The parser has already eliminated the case where max is 0.  In the case
   3586   // where max_match is zero the parser has removed the quantifier if min was
   3587   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
   3588 
   3589   // If we know that we cannot match zero length then things are a little
   3590   // simpler since we don't need to make the special zero length match check
   3591   // from step 2.1.  If the min and max are small we can unroll a little in
   3592   // this case.
   3593   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
   3594   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
   3595   if (max == 0) return on_success;  // This can happen due to recursion.
   3596   bool body_can_be_empty = (body->min_match() == 0);
   3597   int body_start_reg = RegExpCompiler::kNoRegister;
   3598   Interval capture_registers = body->CaptureRegisters();
   3599   bool needs_capture_clearing = !capture_registers.is_empty();
   3600   if (body_can_be_empty) {
   3601     body_start_reg = compiler->AllocateRegister();
   3602   } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
   3603     // Only unroll if there are no captures and the body can't be
   3604     // empty.
   3605     if (min > 0 && min <= kMaxUnrolledMinMatches) {
   3606       int new_max = (max == kInfinity) ? max : max - min;
   3607       // Recurse once to get the loop or optional matches after the fixed ones.
   3608       RegExpNode* answer = ToNode(
   3609           0, new_max, is_greedy, body, compiler, on_success, true);
   3610       // Unroll the forced matches from 0 to min.  This can cause chains of
   3611       // TextNodes (which the parser does not generate).  These should be
   3612       // combined if it turns out they hinder good code generation.
   3613       for (int i = 0; i < min; i++) {
   3614         answer = body->ToNode(compiler, answer);
   3615       }
   3616       return answer;
   3617     }
   3618     if (max <= kMaxUnrolledMaxMatches) {
   3619       ASSERT(min == 0);
   3620       // Unroll the optional matches up to max.
   3621       RegExpNode* answer = on_success;
   3622       for (int i = 0; i < max; i++) {
   3623         ChoiceNode* alternation = new ChoiceNode(2);
   3624         if (is_greedy) {
   3625           alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
   3626                                                                       answer)));
   3627           alternation->AddAlternative(GuardedAlternative(on_success));
   3628         } else {
   3629           alternation->AddAlternative(GuardedAlternative(on_success));
   3630           alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
   3631                                                                       answer)));
   3632         }
   3633         answer = alternation;
   3634         if (not_at_start) alternation->set_not_at_start();
   3635       }
   3636       return answer;
   3637     }
   3638   }
   3639   bool has_min = min > 0;
   3640   bool has_max = max < RegExpTree::kInfinity;
   3641   bool needs_counter = has_min || has_max;
   3642   int reg_ctr = needs_counter
   3643       ? compiler->AllocateRegister()
   3644       : RegExpCompiler::kNoRegister;
   3645   LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
   3646   if (not_at_start) center->set_not_at_start();
   3647   RegExpNode* loop_return = needs_counter
   3648       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
   3649       : static_cast<RegExpNode*>(center);
   3650   if (body_can_be_empty) {
   3651     // If the body can be empty we need to check if it was and then
   3652     // backtrack.
   3653     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
   3654                                               reg_ctr,
   3655                                               min,
   3656                                               loop_return);
   3657   }
   3658   RegExpNode* body_node = body->ToNode(compiler, loop_return);
   3659   if (body_can_be_empty) {
   3660     // If the body can be empty we need to store the start position
   3661     // so we can bail out if it was empty.
   3662     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
   3663   }
   3664   if (needs_capture_clearing) {
   3665     // Before entering the body of this loop we need to clear captures.
   3666     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
   3667   }
   3668   GuardedAlternative body_alt(body_node);
   3669   if (has_max) {
   3670     Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
   3671     body_alt.AddGuard(body_guard);
   3672   }
   3673   GuardedAlternative rest_alt(on_success);
   3674   if (has_min) {
   3675     Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
   3676     rest_alt.AddGuard(rest_guard);
   3677   }
   3678   if (is_greedy) {
   3679     center->AddLoopAlternative(body_alt);
   3680     center->AddContinueAlternative(rest_alt);
   3681   } else {
   3682     center->AddContinueAlternative(rest_alt);
   3683     center->AddLoopAlternative(body_alt);
   3684   }
   3685   if (needs_counter) {
   3686     return ActionNode::SetRegister(reg_ctr, 0, center);
   3687   } else {
   3688     return center;
   3689   }
   3690 }
   3691 
   3692 
   3693 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
   3694                                     RegExpNode* on_success) {
   3695   NodeInfo info;
   3696   switch (type()) {
   3697     case START_OF_LINE:
   3698       return AssertionNode::AfterNewline(on_success);
   3699     case START_OF_INPUT:
   3700       return AssertionNode::AtStart(on_success);
   3701     case BOUNDARY:
   3702       return AssertionNode::AtBoundary(on_success);
   3703     case NON_BOUNDARY:
   3704       return AssertionNode::AtNonBoundary(on_success);
   3705     case END_OF_INPUT:
   3706       return AssertionNode::AtEnd(on_success);
   3707     case END_OF_LINE: {
   3708       // Compile $ in multiline regexps as an alternation with a positive
   3709       // lookahead in one side and an end-of-input on the other side.
   3710       // We need two registers for the lookahead.
   3711       int stack_pointer_register = compiler->AllocateRegister();
   3712       int position_register = compiler->AllocateRegister();
   3713       // The ChoiceNode to distinguish between a newline and end-of-input.
   3714       ChoiceNode* result = new ChoiceNode(2);
   3715       // Create a newline atom.
   3716       ZoneList<CharacterRange>* newline_ranges =
   3717           new ZoneList<CharacterRange>(3);
   3718       CharacterRange::AddClassEscape('n', newline_ranges);
   3719       RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
   3720       TextNode* newline_matcher = new TextNode(
   3721          newline_atom,
   3722          ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
   3723                                              position_register,
   3724                                              0,  // No captures inside.
   3725                                              -1,  // Ignored if no captures.
   3726                                              on_success));
   3727       // Create an end-of-input matcher.
   3728       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
   3729           stack_pointer_register,
   3730           position_register,
   3731           newline_matcher);
   3732       // Add the two alternatives to the ChoiceNode.
   3733       GuardedAlternative eol_alternative(end_of_line);
   3734       result->AddAlternative(eol_alternative);
   3735       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
   3736       result->AddAlternative(end_alternative);
   3737       return result;
   3738     }
   3739     default:
   3740       UNREACHABLE();
   3741   }
   3742   return on_success;
   3743 }
   3744 
   3745 
   3746 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
   3747                                         RegExpNode* on_success) {
   3748   return new BackReferenceNode(RegExpCapture::StartRegister(index()),
   3749                                RegExpCapture::EndRegister(index()),
   3750                                on_success);
   3751 }
   3752 
   3753 
   3754 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
   3755                                 RegExpNode* on_success) {
   3756   return on_success;
   3757 }
   3758 
   3759 
   3760 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
   3761                                     RegExpNode* on_success) {
   3762   int stack_pointer_register = compiler->AllocateRegister();
   3763   int position_register = compiler->AllocateRegister();
   3764 
   3765   const int registers_per_capture = 2;
   3766   const int register_of_first_capture = 2;
   3767   int register_count = capture_count_ * registers_per_capture;
   3768   int register_start =
   3769     register_of_first_capture + capture_from_ * registers_per_capture;
   3770 
   3771   RegExpNode* success;
   3772   if (is_positive()) {
   3773     RegExpNode* node = ActionNode::BeginSubmatch(
   3774         stack_pointer_register,
   3775         position_register,
   3776         body()->ToNode(
   3777             compiler,
   3778             ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
   3779                                                 position_register,
   3780                                                 register_count,
   3781                                                 register_start,
   3782                                                 on_success)));
   3783     return node;
   3784   } else {
   3785     // We use a ChoiceNode for a negative lookahead because it has most of
   3786     // the characteristics we need.  It has the body of the lookahead as its
   3787     // first alternative and the expression after the lookahead of the second
   3788     // alternative.  If the first alternative succeeds then the
   3789     // NegativeSubmatchSuccess will unwind the stack including everything the
   3790     // choice node set up and backtrack.  If the first alternative fails then
   3791     // the second alternative is tried, which is exactly the desired result
   3792     // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
   3793     // ChoiceNode that knows to ignore the first exit when calculating quick
   3794     // checks.
   3795     GuardedAlternative body_alt(
   3796         body()->ToNode(
   3797             compiler,
   3798             success = new NegativeSubmatchSuccess(stack_pointer_register,
   3799                                                   position_register,
   3800                                                   register_count,
   3801                                                   register_start)));
   3802     ChoiceNode* choice_node =
   3803         new NegativeLookaheadChoiceNode(body_alt,
   3804                                         GuardedAlternative(on_success));
   3805     return ActionNode::BeginSubmatch(stack_pointer_register,
   3806                                      position_register,
   3807                                      choice_node);
   3808   }
   3809 }
   3810 
   3811 
   3812 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
   3813                                   RegExpNode* on_success) {
   3814   return ToNode(body(), index(), compiler, on_success);
   3815 }
   3816 
   3817 
   3818 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
   3819                                   int index,
   3820                                   RegExpCompiler* compiler,
   3821                                   RegExpNode* on_success) {
   3822   int start_reg = RegExpCapture::StartRegister(index);
   3823   int end_reg = RegExpCapture::EndRegister(index);
   3824   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
   3825   RegExpNode* body_node = body->ToNode(compiler, store_end);
   3826   return ActionNode::StorePosition(start_reg, true, body_node);
   3827 }
   3828 
   3829 
   3830 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
   3831                                       RegExpNode* on_success) {
   3832   ZoneList<RegExpTree*>* children = nodes();
   3833   RegExpNode* current = on_success;
   3834   for (int i = children->length() - 1; i >= 0; i--) {
   3835     current = children->at(i)->ToNode(compiler, current);
   3836   }
   3837   return current;
   3838 }
   3839 
   3840 
   3841 static void AddClass(const uc16* elmv,
   3842                      int elmc,
   3843                      ZoneList<CharacterRange>* ranges) {
   3844   for (int i = 0; i < elmc; i += 2) {
   3845     ASSERT(elmv[i] <= elmv[i + 1]);
   3846     ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
   3847   }
   3848 }
   3849 
   3850 
   3851 static void AddClassNegated(const uc16 *elmv,
   3852                             int elmc,
   3853                             ZoneList<CharacterRange>* ranges) {
   3854   ASSERT(elmv[0] != 0x0000);
   3855   ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
   3856   uc16 last = 0x0000;
   3857   for (int i = 0; i < elmc; i += 2) {
   3858     ASSERT(last <= elmv[i] - 1);
   3859     ASSERT(elmv[i] <= elmv[i + 1]);
   3860     ranges->Add(CharacterRange(last, elmv[i] - 1));
   3861     last = elmv[i + 1] + 1;
   3862   }
   3863   ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
   3864 }
   3865 
   3866 
   3867 void CharacterRange::AddClassEscape(uc16 type,
   3868                                     ZoneList<CharacterRange>* ranges) {
   3869   switch (type) {
   3870     case 's':
   3871       AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
   3872       break;
   3873     case 'S':
   3874       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
   3875       break;
   3876     case 'w':
   3877       AddClass(kWordRanges, kWordRangeCount, ranges);
   3878       break;
   3879     case 'W':
   3880       AddClassNegated(kWordRanges, kWordRangeCount, ranges);
   3881       break;
   3882     case 'd':
   3883       AddClass(kDigitRanges, kDigitRangeCount, ranges);
   3884       break;
   3885     case 'D':
   3886       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
   3887       break;
   3888     case '.':
   3889       AddClassNegated(kLineTerminatorRanges,
   3890                       kLineTerminatorRangeCount,
   3891                       ranges);
   3892       break;
   3893     // This is not a character range as defined by the spec but a
   3894     // convenient shorthand for a character class that matches any
   3895     // character.
   3896     case '*':
   3897       ranges->Add(CharacterRange::Everything());
   3898       break;
   3899     // This is the set of characters matched by the $ and ^ symbols
   3900     // in multiline mode.
   3901     case 'n':
   3902       AddClass(kLineTerminatorRanges,
   3903                kLineTerminatorRangeCount,
   3904                ranges);
   3905       break;
   3906     default:
   3907       UNREACHABLE();
   3908   }
   3909 }
   3910 
   3911 
   3912 Vector<const uc16> CharacterRange::GetWordBounds() {
   3913   return Vector<const uc16>(kWordRanges, kWordRangeCount);
   3914 }
   3915 
   3916 
   3917 class CharacterRangeSplitter {
   3918  public:
   3919   CharacterRangeSplitter(ZoneList<CharacterRange>** included,
   3920                           ZoneList<CharacterRange>** excluded)
   3921       : included_(included),
   3922         excluded_(excluded) { }
   3923   void Call(uc16 from, DispatchTable::Entry entry);
   3924 
   3925   static const int kInBase = 0;
   3926   static const int kInOverlay = 1;
   3927 
   3928  private:
   3929   ZoneList<CharacterRange>** included_;
   3930   ZoneList<CharacterRange>** excluded_;
   3931 };
   3932 
   3933 
   3934 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
   3935   if (!entry.out_set()->Get(kInBase)) return;
   3936   ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
   3937     ? included_
   3938     : excluded_;
   3939   if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
   3940   (*target)->Add(CharacterRange(entry.from(), entry.to()));
   3941 }
   3942 
   3943 
   3944 void CharacterRange::Split(ZoneList<CharacterRange>* base,
   3945                            Vector<const uc16> overlay,
   3946                            ZoneList<CharacterRange>** included,
   3947                            ZoneList<CharacterRange>** excluded) {
   3948   ASSERT_EQ(NULL, *included);
   3949   ASSERT_EQ(NULL, *excluded);
   3950   DispatchTable table;
   3951   for (int i = 0; i < base->length(); i++)
   3952     table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
   3953   for (int i = 0; i < overlay.length(); i += 2) {
   3954     table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
   3955                    CharacterRangeSplitter::kInOverlay);
   3956   }
   3957   CharacterRangeSplitter callback(included, excluded);
   3958   table.ForEach(&callback);
   3959 }
   3960 
   3961 
   3962 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
   3963                             int bottom,
   3964                             int top);
   3965 
   3966 
   3967 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
   3968                                         bool is_ascii) {
   3969   uc16 bottom = from();
   3970   uc16 top = to();
   3971   if (is_ascii) {
   3972     if (bottom > String::kMaxAsciiCharCode) return;
   3973     if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
   3974   }
   3975   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   3976   if (top == bottom) {
   3977     // If this is a singleton we just expand the one character.
   3978     int length = uncanonicalize.get(bottom, '\0', chars);
   3979     for (int i = 0; i < length; i++) {
   3980       uc32 chr = chars[i];
   3981       if (chr != bottom) {
   3982         ranges->Add(CharacterRange::Singleton(chars[i]));
   3983       }
   3984     }
   3985   } else if (bottom <= kRangeCanonicalizeMax &&
   3986              top <= kRangeCanonicalizeMax) {
   3987     // If this is a range we expand the characters block by block,
   3988     // expanding contiguous subranges (blocks) one at a time.
   3989     // The approach is as follows.  For a given start character we
   3990     // look up the block that contains it, for instance 'a' if the
   3991     // start character is 'c'.  A block is characterized by the property
   3992     // that all characters uncanonicalize in the same way as the first
   3993     // element, except that each entry in the result is incremented
   3994     // by the distance from the first element.  So a-z is a block
   3995     // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
   3996     // uncanonicalizes to ['a' + k, 'A' + k].
   3997     // Once we've found the start point we look up its uncanonicalization
   3998     // and produce a range for each element.  For instance for [c-f]
   3999     // we look up ['a', 'A'] and produce [c-f] and [C-F].  We then only
   4000     // add a range if it is not already contained in the input, so [c-f]
   4001     // will be skipped but [C-F] will be added.  If this range is not
   4002     // completely contained in a block we do this for all the blocks
   4003     // covered by the range.
   4004     unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   4005     // First, look up the block that contains the 'bottom' character.
   4006     int length = canonrange.get(bottom, '\0', range);
   4007     if (length == 0) {
   4008       range[0] = bottom;
   4009     } else {
   4010       ASSERT_EQ(1, length);
   4011     }
   4012     int pos = bottom;
   4013     // The start of the current block.  Note that except for the first
   4014     // iteration 'start' is always equal to 'pos'.
   4015     int start;
   4016     // If it is not the start point of a block the entry contains the
   4017     // offset of the character from the start point.
   4018     if ((range[0] & kStartMarker) == 0) {
   4019       start = pos - range[0];
   4020     } else {
   4021       start = pos;
   4022     }
   4023     // Then we add the ranges one at a time, incrementing the current
   4024     // position to be after the last block each time.  The position
   4025     // always points to the start of a block.
   4026     while (pos < top) {
   4027       length = canonrange.get(start, '\0', range);
   4028       if (length == 0) {
   4029         range[0] = start;
   4030       } else {
   4031         ASSERT_EQ(1, length);
   4032       }
   4033       ASSERT((range[0] & kStartMarker) != 0);
   4034       // The start point of a block contains the distance to the end
   4035       // of the range.
   4036       int block_end = start + (range[0] & kPayloadMask) - 1;
   4037       int end = (block_end > top) ? top : block_end;
   4038       length = uncanonicalize.get(start, '\0', range);
   4039       for (int i = 0; i < length; i++) {
   4040         uc32 c = range[i];
   4041         uc16 range_from = c + (pos - start);
   4042         uc16 range_to = c + (end - start);
   4043         if (!(bottom <= range_from && range_to <= top)) {
   4044           ranges->Add(CharacterRange(range_from, range_to));
   4045         }
   4046       }
   4047       start = pos = block_end + 1;
   4048     }
   4049   } else {
   4050     // Unibrow ranges don't work for high characters due to the "2^11 bug".
   4051     // Therefore we do something dumber for these ranges.
   4052     AddUncanonicals(ranges, bottom, top);
   4053   }
   4054 }
   4055 
   4056 
   4057 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
   4058   ASSERT_NOT_NULL(ranges);
   4059   int n = ranges->length();
   4060   if (n <= 1) return true;
   4061   int max = ranges->at(0).to();
   4062   for (int i = 1; i < n; i++) {
   4063     CharacterRange next_range = ranges->at(i);
   4064     if (next_range.from() <= max + 1) return false;
   4065     max = next_range.to();
   4066   }
   4067   return true;
   4068 }
   4069 
   4070 SetRelation CharacterRange::WordCharacterRelation(
   4071     ZoneList<CharacterRange>* range) {
   4072   ASSERT(IsCanonical(range));
   4073   int i = 0;  // Word character range index.
   4074   int j = 0;  // Argument range index.
   4075   ASSERT_NE(0, kWordRangeCount);
   4076   SetRelation result;
   4077   if (range->length() == 0) {
   4078     result.SetElementsInSecondSet();
   4079     return result;
   4080   }
   4081   CharacterRange argument_range = range->at(0);
   4082   CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
   4083   while (i < kWordRangeCount && j < range->length()) {
   4084     // Check the two ranges for the five cases:
   4085     // - no overlap.
   4086     // - partial overlap (there are elements in both ranges that isn't
   4087     //   in the other, and there are also elements that are in both).
   4088     // - argument range entirely inside word range.
   4089     // - word range entirely inside argument range.
   4090     // - ranges are completely equal.
   4091 
   4092     // First check for no overlap. The earlier range is not in the other set.
   4093     if (argument_range.from() > word_range.to()) {
   4094       // Ranges are disjoint. The earlier word range contains elements that
   4095       // cannot be in the argument set.
   4096       result.SetElementsInSecondSet();
   4097     } else if (word_range.from() > argument_range.to()) {
   4098       // Ranges are disjoint. The earlier argument range contains elements that
   4099       // cannot be in the word set.
   4100       result.SetElementsInFirstSet();
   4101     } else if (word_range.from() <= argument_range.from() &&
   4102                word_range.to() >= argument_range.from()) {
   4103       result.SetElementsInBothSets();
   4104       // argument range completely inside word range.
   4105       if (word_range.from() < argument_range.from() ||
   4106           word_range.to() > argument_range.from()) {
   4107         result.SetElementsInSecondSet();
   4108       }
   4109     } else if (word_range.from() >= argument_range.from() &&
   4110                word_range.to() <= argument_range.from()) {
   4111       result.SetElementsInBothSets();
   4112       result.SetElementsInFirstSet();
   4113     } else {
   4114       // There is overlap, and neither is a subrange of the other
   4115       result.SetElementsInFirstSet();
   4116       result.SetElementsInSecondSet();
   4117       result.SetElementsInBothSets();
   4118     }
   4119     if (result.NonTrivialIntersection()) {
   4120       // The result is as (im)precise as we can possibly make it.
   4121       return result;
   4122     }
   4123     // Progress the range(s) with minimal to-character.
   4124     uc16 word_to = word_range.to();
   4125     uc16 argument_to = argument_range.to();
   4126     if (argument_to <= word_to) {
   4127       j++;
   4128       if (j < range->length()) {
   4129         argument_range = range->at(j);
   4130       }
   4131     }
   4132     if (word_to <= argument_to) {
   4133       i += 2;
   4134       if (i < kWordRangeCount) {
   4135         word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
   4136       }
   4137     }
   4138   }
   4139   // Check if anything wasn't compared in the loop.
   4140   if (i < kWordRangeCount) {
   4141     // word range contains something not in argument range.
   4142     result.SetElementsInSecondSet();
   4143   } else if (j < range->length()) {
   4144     // Argument range contains something not in word range.
   4145     result.SetElementsInFirstSet();
   4146   }
   4147 
   4148   return result;
   4149 }
   4150 
   4151 
   4152 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
   4153                             int bottom,
   4154                             int top) {
   4155   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   4156   // Zones with no case mappings.  There is a DEBUG-mode loop to assert that
   4157   // this table is correct.
   4158   // 0x0600 - 0x0fff
   4159   // 0x1100 - 0x1cff
   4160   // 0x2000 - 0x20ff
   4161   // 0x2200 - 0x23ff
   4162   // 0x2500 - 0x2bff
   4163   // 0x2e00 - 0xa5ff
   4164   // 0xa800 - 0xfaff
   4165   // 0xfc00 - 0xfeff
   4166   const int boundary_count = 18;
   4167   // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
   4168   // array.  This is to split up big ranges and not because they actually denote
   4169   // a case-mapping-free-zone.
   4170   ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
   4171   const int kFirstRealCaselessZoneIndex = 2;
   4172   int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
   4173       0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
   4174       0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
   4175 
   4176   // Special ASCII rule from spec can save us some work here.
   4177   if (bottom == 0x80 && top == 0xffff) return;
   4178 
   4179   // We have optimized support for this range.
   4180   if (top <= CharacterRange::kRangeCanonicalizeMax) {
   4181     CharacterRange range(bottom, top);
   4182     range.AddCaseEquivalents(ranges, false);
   4183     return;
   4184   }
   4185 
   4186   // Split up very large ranges.  This helps remove ranges where there are no
   4187   // case mappings.
   4188   for (int i = 0; i < boundary_count; i++) {
   4189     if (bottom < boundaries[i] && top >= boundaries[i]) {
   4190       AddUncanonicals(ranges, bottom, boundaries[i] - 1);
   4191       AddUncanonicals(ranges, boundaries[i], top);
   4192       return;
   4193     }
   4194   }
   4195 
   4196   // If we are completely in a zone with no case mappings then we are done.
   4197   // We start at 2 so as not to except the ASCII range from mappings.
   4198   for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
   4199     if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
   4200 #ifdef DEBUG
   4201       for (int j = bottom; j <= top; j++) {
   4202         unsigned current_char = j;
   4203         int length = uncanonicalize.get(current_char, '\0', chars);
   4204         for (int k = 0; k < length; k++) {
   4205           ASSERT(chars[k] == current_char);
   4206         }
   4207       }
   4208 #endif
   4209       return;
   4210     }
   4211   }
   4212 
   4213   // Step through the range finding equivalent characters.
   4214   ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
   4215   for (int i = bottom; i <= top; i++) {
   4216     int length = uncanonicalize.get(i, '\0', chars);
   4217     for (int j = 0; j < length; j++) {
   4218       uc32 chr = chars[j];
   4219       if (chr != i && (chr < bottom || chr > top)) {
   4220         characters->Add(chr);
   4221       }
   4222     }
   4223   }
   4224 
   4225   // Step through the equivalent characters finding simple ranges and
   4226   // adding ranges to the character class.
   4227   if (characters->length() > 0) {
   4228     int new_from = characters->at(0);
   4229     int new_to = new_from;
   4230     for (int i = 1; i < characters->length(); i++) {
   4231       int chr = characters->at(i);
   4232       if (chr == new_to + 1) {
   4233         new_to++;
   4234       } else {
   4235         if (new_to == new_from) {
   4236           ranges->Add(CharacterRange::Singleton(new_from));
   4237         } else {
   4238           ranges->Add(CharacterRange(new_from, new_to));
   4239         }
   4240         new_from = new_to = chr;
   4241       }
   4242     }
   4243     if (new_to == new_from) {
   4244       ranges->Add(CharacterRange::Singleton(new_from));
   4245     } else {
   4246       ranges->Add(CharacterRange(new_from, new_to));
   4247     }
   4248   }
   4249 }
   4250 
   4251 
   4252 ZoneList<CharacterRange>* CharacterSet::ranges() {
   4253   if (ranges_ == NULL) {
   4254     ranges_ = new ZoneList<CharacterRange>(2);
   4255     CharacterRange::AddClassEscape(standard_set_type_, ranges_);
   4256   }
   4257   return ranges_;
   4258 }
   4259 
   4260 
   4261 // Move a number of elements in a zonelist to another position
   4262 // in the same list. Handles overlapping source and target areas.
   4263 static void MoveRanges(ZoneList<CharacterRange>* list,
   4264                        int from,
   4265                        int to,
   4266                        int count) {
   4267   // Ranges are potentially overlapping.
   4268   if (from < to) {
   4269     for (int i = count - 1; i >= 0; i--) {
   4270       list->at(to + i) = list->at(from + i);
   4271     }
   4272   } else {
   4273     for (int i = 0; i < count; i++) {
   4274       list->at(to + i) = list->at(from + i);
   4275     }
   4276   }
   4277 }
   4278 
   4279 
   4280 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
   4281                                       int count,
   4282                                       CharacterRange insert) {
   4283   // Inserts a range into list[0..count[, which must be sorted
   4284   // by from value and non-overlapping and non-adjacent, using at most
   4285   // list[0..count] for the result. Returns the number of resulting
   4286   // canonicalized ranges. Inserting a range may collapse existing ranges into
   4287   // fewer ranges, so the return value can be anything in the range 1..count+1.
   4288   uc16 from = insert.from();
   4289   uc16 to = insert.to();
   4290   int start_pos = 0;
   4291   int end_pos = count;
   4292   for (int i = count - 1; i >= 0; i--) {
   4293     CharacterRange current = list->at(i);
   4294     if (current.from() > to + 1) {
   4295       end_pos = i;
   4296     } else if (current.to() + 1 < from) {
   4297       start_pos = i + 1;
   4298       break;
   4299     }
   4300   }
   4301 
   4302   // Inserted range overlaps, or is adjacent to, ranges at positions
   4303   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
   4304   // not affected by the insertion.
   4305   // If start_pos == end_pos, the range must be inserted before start_pos.
   4306   // if start_pos < end_pos, the entire range from start_pos to end_pos
   4307   // must be merged with the insert range.
   4308 
   4309   if (start_pos == end_pos) {
   4310     // Insert between existing ranges at position start_pos.
   4311     if (start_pos < count) {
   4312       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
   4313     }
   4314     list->at(start_pos) = insert;
   4315     return count + 1;
   4316   }
   4317   if (start_pos + 1 == end_pos) {
   4318     // Replace single existing range at position start_pos.
   4319     CharacterRange to_replace = list->at(start_pos);
   4320     int new_from = Min(to_replace.from(), from);
   4321     int new_to = Max(to_replace.to(), to);
   4322     list->at(start_pos) = CharacterRange(new_from, new_to);
   4323     return count;
   4324   }
   4325   // Replace a number of existing ranges from start_pos to end_pos - 1.
   4326   // Move the remaining ranges down.
   4327 
   4328   int new_from = Min(list->at(start_pos).from(), from);
   4329   int new_to = Max(list->at(end_pos - 1).to(), to);
   4330   if (end_pos < count) {
   4331     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
   4332   }
   4333   list->at(start_pos) = CharacterRange(new_from, new_to);
   4334   return count - (end_pos - start_pos) + 1;
   4335 }
   4336 
   4337 
   4338 void CharacterSet::Canonicalize() {
   4339   // Special/default classes are always considered canonical. The result
   4340   // of calling ranges() will be sorted.
   4341   if (ranges_ == NULL) return;
   4342   CharacterRange::Canonicalize(ranges_);
   4343 }
   4344 
   4345 
   4346 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
   4347   if (character_ranges->length() <= 1) return;
   4348   // Check whether ranges are already canonical (increasing, non-overlapping,
   4349   // non-adjacent).
   4350   int n = character_ranges->length();
   4351   int max = character_ranges->at(0).to();
   4352   int i = 1;
   4353   while (i < n) {
   4354     CharacterRange current = character_ranges->at(i);
   4355     if (current.from() <= max + 1) {
   4356       break;
   4357     }
   4358     max = current.to();
   4359     i++;
   4360   }
   4361   // Canonical until the i'th range. If that's all of them, we are done.
   4362   if (i == n) return;
   4363 
   4364   // The ranges at index i and forward are not canonicalized. Make them so by
   4365   // doing the equivalent of insertion sort (inserting each into the previous
   4366   // list, in order).
   4367   // Notice that inserting a range can reduce the number of ranges in the
   4368   // result due to combining of adjacent and overlapping ranges.
   4369   int read = i;  // Range to insert.
   4370   int num_canonical = i;  // Length of canonicalized part of list.
   4371   do {
   4372     num_canonical = InsertRangeInCanonicalList(character_ranges,
   4373                                                num_canonical,
   4374                                                character_ranges->at(read));
   4375     read++;
   4376   } while (read < n);
   4377   character_ranges->Rewind(num_canonical);
   4378 
   4379   ASSERT(CharacterRange::IsCanonical(character_ranges));
   4380 }
   4381 
   4382 
   4383 // Utility function for CharacterRange::Merge. Adds a range at the end of
   4384 // a canonicalized range list, if necessary merging the range with the last
   4385 // range of the list.
   4386 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
   4387   if (set == NULL) return;
   4388   ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
   4389   int n = set->length();
   4390   if (n > 0) {
   4391     CharacterRange lastRange = set->at(n - 1);
   4392     if (lastRange.to() == range.from() - 1) {
   4393       set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
   4394       return;
   4395     }
   4396   }
   4397   set->Add(range);
   4398 }
   4399 
   4400 
   4401 static void AddRangeToSelectedSet(int selector,
   4402                                   ZoneList<CharacterRange>* first_set,
   4403                                   ZoneList<CharacterRange>* second_set,
   4404                                   ZoneList<CharacterRange>* intersection_set,
   4405                                   CharacterRange range) {
   4406   switch (selector) {
   4407     case kInsideFirst:
   4408       AddRangeToSet(first_set, range);
   4409       break;
   4410     case kInsideSecond:
   4411       AddRangeToSet(second_set, range);
   4412       break;
   4413     case kInsideBoth:
   4414       AddRangeToSet(intersection_set, range);
   4415       break;
   4416   }
   4417 }
   4418 
   4419 
   4420 
   4421 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
   4422                            ZoneList<CharacterRange>* second_set,
   4423                            ZoneList<CharacterRange>* first_set_only_out,
   4424                            ZoneList<CharacterRange>* second_set_only_out,
   4425                            ZoneList<CharacterRange>* both_sets_out) {
   4426   // Inputs are canonicalized.
   4427   ASSERT(CharacterRange::IsCanonical(first_set));
   4428   ASSERT(CharacterRange::IsCanonical(second_set));
   4429   // Outputs are empty, if applicable.
   4430   ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
   4431   ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
   4432   ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
   4433 
   4434   // Merge sets by iterating through the lists in order of lowest "from" value,
   4435   // and putting intervals into one of three sets.
   4436 
   4437   if (first_set->length() == 0) {
   4438     second_set_only_out->AddAll(*second_set);
   4439     return;
   4440   }
   4441   if (second_set->length() == 0) {
   4442     first_set_only_out->AddAll(*first_set);
   4443     return;
   4444   }
   4445   // Indices into input lists.
   4446   int i1 = 0;
   4447   int i2 = 0;
   4448   // Cache length of input lists.
   4449   int n1 = first_set->length();
   4450   int n2 = second_set->length();
   4451   // Current range. May be invalid if state is kInsideNone.
   4452   int from = 0;
   4453   int to = -1;
   4454   // Where current range comes from.
   4455   int state = kInsideNone;
   4456 
   4457   while (i1 < n1 || i2 < n2) {
   4458     CharacterRange next_range;
   4459     int range_source;
   4460     if (i2 == n2 ||
   4461         (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
   4462       // Next smallest element is in first set.
   4463       next_range = first_set->at(i1++);
   4464       range_source = kInsideFirst;
   4465     } else {
   4466       // Next smallest element is in second set.
   4467       next_range = second_set->at(i2++);
   4468       range_source = kInsideSecond;
   4469     }
   4470     if (to < next_range.from()) {
   4471       // Ranges disjoint: |current|  |next|
   4472       AddRangeToSelectedSet(state,
   4473                             first_set_only_out,
   4474                             second_set_only_out,
   4475                             both_sets_out,
   4476                             CharacterRange(from, to));
   4477       from = next_range.from();
   4478       to = next_range.to();
   4479       state = range_source;
   4480     } else {
   4481       if (from < next_range.from()) {
   4482         AddRangeToSelectedSet(state,
   4483                               first_set_only_out,
   4484                               second_set_only_out,
   4485                               both_sets_out,
   4486                               CharacterRange(from, next_range.from()-1));
   4487       }
   4488       if (to < next_range.to()) {
   4489         // Ranges overlap:  |current|
   4490         //                       |next|
   4491         AddRangeToSelectedSet(state | range_source,
   4492                               first_set_only_out,
   4493                               second_set_only_out,
   4494                               both_sets_out,
   4495                               CharacterRange(next_range.from(), to));
   4496         from = to + 1;
   4497         to = next_range.to();
   4498         state = range_source;
   4499       } else {
   4500         // Range included:    |current| , possibly ending at same character.
   4501         //                      |next|
   4502         AddRangeToSelectedSet(
   4503             state | range_source,
   4504             first_set_only_out,
   4505             second_set_only_out,
   4506             both_sets_out,
   4507             CharacterRange(next_range.from(), next_range.to()));
   4508         from = next_range.to() + 1;
   4509         // If ranges end at same character, both ranges are consumed completely.
   4510         if (next_range.to() == to) state = kInsideNone;
   4511       }
   4512     }
   4513   }
   4514   AddRangeToSelectedSet(state,
   4515                         first_set_only_out,
   4516                         second_set_only_out,
   4517                         both_sets_out,
   4518                         CharacterRange(from, to));
   4519 }
   4520 
   4521 
   4522 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
   4523                             ZoneList<CharacterRange>* negated_ranges) {
   4524   ASSERT(CharacterRange::IsCanonical(ranges));
   4525   ASSERT_EQ(0, negated_ranges->length());
   4526   int range_count = ranges->length();
   4527   uc16 from = 0;
   4528   int i = 0;
   4529   if (range_count > 0 && ranges->at(0).from() == 0) {
   4530     from = ranges->at(0).to();
   4531     i = 1;
   4532   }
   4533   while (i < range_count) {
   4534     CharacterRange range = ranges->at(i);
   4535     negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
   4536     from = range.to();
   4537     i++;
   4538   }
   4539   if (from < String::kMaxUC16CharCode) {
   4540     negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
   4541   }
   4542 }
   4543 
   4544 
   4545 
   4546 // -------------------------------------------------------------------
   4547 // Interest propagation
   4548 
   4549 
   4550 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
   4551   for (int i = 0; i < siblings_.length(); i++) {
   4552     RegExpNode* sibling = siblings_.Get(i);
   4553     if (sibling->info()->Matches(info))
   4554       return sibling;
   4555   }
   4556   return NULL;
   4557 }
   4558 
   4559 
   4560 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
   4561   ASSERT_EQ(false, *cloned);
   4562   siblings_.Ensure(this);
   4563   RegExpNode* result = TryGetSibling(info);
   4564   if (result != NULL) return result;
   4565   result = this->Clone();
   4566   NodeInfo* new_info = result->info();
   4567   new_info->ResetCompilationState();
   4568   new_info->AddFromPreceding(info);
   4569   AddSibling(result);
   4570   *cloned = true;
   4571   return result;
   4572 }
   4573 
   4574 
   4575 template <class C>
   4576 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
   4577   NodeInfo full_info(*node->info());
   4578   full_info.AddFromPreceding(info);
   4579   bool cloned = false;
   4580   return RegExpNode::EnsureSibling(node, &full_info, &cloned);
   4581 }
   4582 
   4583 
   4584 // -------------------------------------------------------------------
   4585 // Splay tree
   4586 
   4587 
   4588 OutSet* OutSet::Extend(unsigned value) {
   4589   if (Get(value))
   4590     return this;
   4591   if (successors() != NULL) {
   4592     for (int i = 0; i < successors()->length(); i++) {
   4593       OutSet* successor = successors()->at(i);
   4594       if (successor->Get(value))
   4595         return successor;
   4596     }
   4597   } else {
   4598     successors_ = new ZoneList<OutSet*>(2);
   4599   }
   4600   OutSet* result = new OutSet(first_, remaining_);
   4601   result->Set(value);
   4602   successors()->Add(result);
   4603   return result;
   4604 }
   4605 
   4606 
   4607 void OutSet::Set(unsigned value) {
   4608   if (value < kFirstLimit) {
   4609     first_ |= (1 << value);
   4610   } else {
   4611     if (remaining_ == NULL)
   4612       remaining_ = new ZoneList<unsigned>(1);
   4613     if (remaining_->is_empty() || !remaining_->Contains(value))
   4614       remaining_->Add(value);
   4615   }
   4616 }
   4617 
   4618 
   4619 bool OutSet::Get(unsigned value) {
   4620   if (value < kFirstLimit) {
   4621     return (first_ & (1 << value)) != 0;
   4622   } else if (remaining_ == NULL) {
   4623     return false;
   4624   } else {
   4625     return remaining_->Contains(value);
   4626   }
   4627 }
   4628 
   4629 
   4630 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
   4631 const DispatchTable::Entry DispatchTable::Config::kNoValue;
   4632 
   4633 
   4634 void DispatchTable::AddRange(CharacterRange full_range, int value) {
   4635   CharacterRange current = full_range;
   4636   if (tree()->is_empty()) {
   4637     // If this is the first range we just insert into the table.
   4638     ZoneSplayTree<Config>::Locator loc;
   4639     ASSERT_RESULT(tree()->Insert(current.from(), &loc));
   4640     loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
   4641     return;
   4642   }
   4643   // First see if there is a range to the left of this one that
   4644   // overlaps.
   4645   ZoneSplayTree<Config>::Locator loc;
   4646   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
   4647     Entry* entry = &loc.value();
   4648     // If we've found a range that overlaps with this one, and it
   4649     // starts strictly to the left of this one, we have to fix it
   4650     // because the following code only handles ranges that start on
   4651     // or after the start point of the range we're adding.
   4652     if (entry->from() < current.from() && entry->to() >= current.from()) {
   4653       // Snap the overlapping range in half around the start point of
   4654       // the range we're adding.
   4655       CharacterRange left(entry->from(), current.from() - 1);
   4656       CharacterRange right(current.from(), entry->to());
   4657       // The left part of the overlapping range doesn't overlap.
   4658       // Truncate the whole entry to be just the left part.
   4659       entry->set_to(left.to());
   4660       // The right part is the one that overlaps.  We add this part
   4661       // to the map and let the next step deal with merging it with
   4662       // the range we're adding.
   4663       ZoneSplayTree<Config>::Locator loc;
   4664       ASSERT_RESULT(tree()->Insert(right.from(), &loc));
   4665       loc.set_value(Entry(right.from(),
   4666                           right.to(),
   4667                           entry->out_set()));
   4668     }
   4669   }
   4670   while (current.is_valid()) {
   4671     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
   4672         (loc.value().from() <= current.to()) &&
   4673         (loc.value().to() >= current.from())) {
   4674       Entry* entry = &loc.value();
   4675       // We have overlap.  If there is space between the start point of
   4676       // the range we're adding and where the overlapping range starts
   4677       // then we have to add a range covering just that space.
   4678       if (current.from() < entry->from()) {
   4679         ZoneSplayTree<Config>::Locator ins;
   4680         ASSERT_RESULT(tree()->Insert(current.from(), &ins));
   4681         ins.set_value(Entry(current.from(),
   4682                             entry->from() - 1,
   4683                             empty()->Extend(value)));
   4684         current.set_from(entry->from());
   4685       }
   4686       ASSERT_EQ(current.from(), entry->from());
   4687       // If the overlapping range extends beyond the one we want to add
   4688       // we have to snap the right part off and add it separately.
   4689       if (entry->to() > current.to()) {
   4690         ZoneSplayTree<Config>::Locator ins;
   4691         ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
   4692         ins.set_value(Entry(current.to() + 1,
   4693                             entry->to(),
   4694                             entry->out_set()));
   4695         entry->set_to(current.to());
   4696       }
   4697       ASSERT(entry->to() <= current.to());
   4698       // The overlapping range is now completely contained by the range
   4699       // we're adding so we can just update it and move the start point
   4700       // of the range we're adding just past it.
   4701       entry->AddValue(value);
   4702       // Bail out if the last interval ended at 0xFFFF since otherwise
   4703       // adding 1 will wrap around to 0.
   4704       if (entry->to() == String::kMaxUC16CharCode)
   4705         break;
   4706       ASSERT(entry->to() + 1 > current.from());
   4707       current.set_from(entry->to() + 1);
   4708     } else {
   4709       // There is no overlap so we can just add the range
   4710       ZoneSplayTree<Config>::Locator ins;
   4711       ASSERT_RESULT(tree()->Insert(current.from(), &ins));
   4712       ins.set_value(Entry(current.from(),
   4713                           current.to(),
   4714                           empty()->Extend(value)));
   4715       break;
   4716     }
   4717   }
   4718 }
   4719 
   4720 
   4721 OutSet* DispatchTable::Get(uc16 value) {
   4722   ZoneSplayTree<Config>::Locator loc;
   4723   if (!tree()->FindGreatestLessThan(value, &loc))
   4724     return empty();
   4725   Entry* entry = &loc.value();
   4726   if (value <= entry->to())
   4727     return entry->out_set();
   4728   else
   4729     return empty();
   4730 }
   4731 
   4732 
   4733 // -------------------------------------------------------------------
   4734 // Analysis
   4735 
   4736 
   4737 void Analysis::EnsureAnalyzed(RegExpNode* that) {
   4738   StackLimitCheck check;
   4739   if (check.HasOverflowed()) {
   4740     fail("Stack overflow");
   4741     return;
   4742   }
   4743   if (that->info()->been_analyzed || that->info()->being_analyzed)
   4744     return;
   4745   that->info()->being_analyzed = true;
   4746   that->Accept(this);
   4747   that->info()->being_analyzed = false;
   4748   that->info()->been_analyzed = true;
   4749 }
   4750 
   4751 
   4752 void Analysis::VisitEnd(EndNode* that) {
   4753   // nothing to do
   4754 }
   4755 
   4756 
   4757 void TextNode::CalculateOffsets() {
   4758   int element_count = elements()->length();
   4759   // Set up the offsets of the elements relative to the start.  This is a fixed
   4760   // quantity since a TextNode can only contain fixed-width things.
   4761   int cp_offset = 0;
   4762   for (int i = 0; i < element_count; i++) {
   4763     TextElement& elm = elements()->at(i);
   4764     elm.cp_offset = cp_offset;
   4765     if (elm.type == TextElement::ATOM) {
   4766       cp_offset += elm.data.u_atom->data().length();
   4767     } else {
   4768       cp_offset++;
   4769       Vector<const uc16> quarks = elm.data.u_atom->data();
   4770     }
   4771   }
   4772 }
   4773 
   4774 
   4775 void Analysis::VisitText(TextNode* that) {
   4776   if (ignore_case_) {
   4777     that->MakeCaseIndependent(is_ascii_);
   4778   }
   4779   EnsureAnalyzed(that->on_success());
   4780   if (!has_failed()) {
   4781     that->CalculateOffsets();
   4782   }
   4783 }
   4784 
   4785 
   4786 void Analysis::VisitAction(ActionNode* that) {
   4787   RegExpNode* target = that->on_success();
   4788   EnsureAnalyzed(target);
   4789   if (!has_failed()) {
   4790     // If the next node is interested in what it follows then this node
   4791     // has to be interested too so it can pass the information on.
   4792     that->info()->AddFromFollowing(target->info());
   4793   }
   4794 }
   4795 
   4796 
   4797 void Analysis::VisitChoice(ChoiceNode* that) {
   4798   NodeInfo* info = that->info();
   4799   for (int i = 0; i < that->alternatives()->length(); i++) {
   4800     RegExpNode* node = that->alternatives()->at(i).node();
   4801     EnsureAnalyzed(node);
   4802     if (has_failed()) return;
   4803     // Anything the following nodes need to know has to be known by
   4804     // this node also, so it can pass it on.
   4805     info->AddFromFollowing(node->info());
   4806   }
   4807 }
   4808 
   4809 
   4810 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
   4811   NodeInfo* info = that->info();
   4812   for (int i = 0; i < that->alternatives()->length(); i++) {
   4813     RegExpNode* node = that->alternatives()->at(i).node();
   4814     if (node != that->loop_node()) {
   4815       EnsureAnalyzed(node);
   4816       if (has_failed()) return;
   4817       info->AddFromFollowing(node->info());
   4818     }
   4819   }
   4820   // Check the loop last since it may need the value of this node
   4821   // to get a correct result.
   4822   EnsureAnalyzed(that->loop_node());
   4823   if (!has_failed()) {
   4824     info->AddFromFollowing(that->loop_node()->info());
   4825   }
   4826 }
   4827 
   4828 
   4829 void Analysis::VisitBackReference(BackReferenceNode* that) {
   4830   EnsureAnalyzed(that->on_success());
   4831 }
   4832 
   4833 
   4834 void Analysis::VisitAssertion(AssertionNode* that) {
   4835   EnsureAnalyzed(that->on_success());
   4836   AssertionNode::AssertionNodeType type = that->type();
   4837   if (type == AssertionNode::AT_BOUNDARY ||
   4838       type == AssertionNode::AT_NON_BOUNDARY) {
   4839     // Check if the following character is known to be a word character
   4840     // or known to not be a word character.
   4841     ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
   4842 
   4843     CharacterRange::Canonicalize(following_chars);
   4844 
   4845     SetRelation word_relation =
   4846         CharacterRange::WordCharacterRelation(following_chars);
   4847     if (word_relation.Disjoint()) {
   4848       // Includes the case where following_chars is empty (e.g., end-of-input).
   4849       // Following character is definitely *not* a word character.
   4850       type = (type == AssertionNode::AT_BOUNDARY) ?
   4851                  AssertionNode::AFTER_WORD_CHARACTER :
   4852                  AssertionNode::AFTER_NONWORD_CHARACTER;
   4853       that->set_type(type);
   4854     } else if (word_relation.ContainedIn()) {
   4855       // Following character is definitely a word character.
   4856       type = (type == AssertionNode::AT_BOUNDARY) ?
   4857                  AssertionNode::AFTER_NONWORD_CHARACTER :
   4858                  AssertionNode::AFTER_WORD_CHARACTER;
   4859       that->set_type(type);
   4860     }
   4861   }
   4862 }
   4863 
   4864 
   4865 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
   4866   if (first_character_set_ == NULL) {
   4867     if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
   4868       // If we can't find an exact solution within the budget, we
   4869       // set the value to the set of every character, i.e., all characters
   4870       // are possible.
   4871       ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
   4872       all_set->Add(CharacterRange::Everything());
   4873       first_character_set_ = all_set;
   4874     }
   4875   }
   4876   return first_character_set_;
   4877 }
   4878 
   4879 
   4880 int RegExpNode::ComputeFirstCharacterSet(int budget) {
   4881   // Default behavior is to not be able to determine the first character.
   4882   return kComputeFirstCharacterSetFail;
   4883 }
   4884 
   4885 
   4886 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
   4887   budget--;
   4888   if (budget >= 0) {
   4889     // Find loop min-iteration. It's the value of the guarded choice node
   4890     // with a GEQ guard, if any.
   4891     int min_repetition = 0;
   4892 
   4893     for (int i = 0; i <= 1; i++) {
   4894       GuardedAlternative alternative = alternatives()->at(i);
   4895       ZoneList<Guard*>* guards = alternative.guards();
   4896       if (guards != NULL && guards->length() > 0) {
   4897         Guard* guard = guards->at(0);
   4898         if (guard->op() == Guard::GEQ) {
   4899           min_repetition = guard->value();
   4900           break;
   4901         }
   4902       }
   4903     }
   4904 
   4905     budget = loop_node()->ComputeFirstCharacterSet(budget);
   4906     if (budget >= 0) {
   4907       ZoneList<CharacterRange>* character_set =
   4908           loop_node()->first_character_set();
   4909       if (body_can_be_zero_length() || min_repetition == 0) {
   4910         budget = continue_node()->ComputeFirstCharacterSet(budget);
   4911         if (budget < 0) return budget;
   4912         ZoneList<CharacterRange>* body_set =
   4913             continue_node()->first_character_set();
   4914         ZoneList<CharacterRange>* union_set =
   4915           new ZoneList<CharacterRange>(Max(character_set->length(),
   4916                                            body_set->length()));
   4917         CharacterRange::Merge(character_set,
   4918                               body_set,
   4919                               union_set,
   4920                               union_set,
   4921                               union_set);
   4922         character_set = union_set;
   4923       }
   4924       set_first_character_set(character_set);
   4925     }
   4926   }
   4927   return budget;
   4928 }
   4929 
   4930 
   4931 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
   4932   budget--;
   4933   if (budget >= 0) {
   4934     GuardedAlternative successor = this->alternatives()->at(1);
   4935     RegExpNode* successor_node = successor.node();
   4936     budget = successor_node->ComputeFirstCharacterSet(budget);
   4937     if (budget >= 0) {
   4938       set_first_character_set(successor_node->first_character_set());
   4939     }
   4940   }
   4941   return budget;
   4942 }
   4943 
   4944 
   4945 // The first character set of an EndNode is unknowable. Just use the
   4946 // default implementation that fails and returns all characters as possible.
   4947 
   4948 
   4949 int AssertionNode::ComputeFirstCharacterSet(int budget) {
   4950   budget -= 1;
   4951   if (budget >= 0) {
   4952     switch (type_) {
   4953       case AT_END: {
   4954         set_first_character_set(new ZoneList<CharacterRange>(0));
   4955         break;
   4956       }
   4957       case AT_START:
   4958       case AT_BOUNDARY:
   4959       case AT_NON_BOUNDARY:
   4960       case AFTER_NEWLINE:
   4961       case AFTER_NONWORD_CHARACTER:
   4962       case AFTER_WORD_CHARACTER: {
   4963         ASSERT_NOT_NULL(on_success());
   4964         budget = on_success()->ComputeFirstCharacterSet(budget);
   4965         set_first_character_set(on_success()->first_character_set());
   4966         break;
   4967       }
   4968     }
   4969   }
   4970   return budget;
   4971 }
   4972 
   4973 
   4974 int ActionNode::ComputeFirstCharacterSet(int budget) {
   4975   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
   4976   budget--;
   4977   if (budget >= 0) {
   4978     ASSERT_NOT_NULL(on_success());
   4979     budget = on_success()->ComputeFirstCharacterSet(budget);
   4980     if (budget >= 0) {
   4981       set_first_character_set(on_success()->first_character_set());
   4982     }
   4983   }
   4984   return budget;
   4985 }
   4986 
   4987 
   4988 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
   4989   // We don't know anything about the first character of a backreference
   4990   // at this point.
   4991   return kComputeFirstCharacterSetFail;
   4992 }
   4993 
   4994 
   4995 int TextNode::ComputeFirstCharacterSet(int budget) {
   4996   budget--;
   4997   if (budget >= 0) {
   4998     ASSERT_NE(0, elements()->length());
   4999     TextElement text = elements()->at(0);
   5000     if (text.type == TextElement::ATOM) {
   5001       RegExpAtom* atom = text.data.u_atom;
   5002       ASSERT_NE(0, atom->length());
   5003       uc16 first_char = atom->data()[0];
   5004       ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
   5005       range->Add(CharacterRange(first_char, first_char));
   5006       set_first_character_set(range);
   5007     } else {
   5008       ASSERT(text.type == TextElement::CHAR_CLASS);
   5009       RegExpCharacterClass* char_class = text.data.u_char_class;
   5010       if (char_class->is_negated()) {
   5011         ZoneList<CharacterRange>* ranges = char_class->ranges();
   5012         int length = ranges->length();
   5013         int new_length = length + 1;
   5014         if (length > 0) {
   5015           if (ranges->at(0).from() == 0) new_length--;
   5016           if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
   5017             new_length--;
   5018           }
   5019         }
   5020         ZoneList<CharacterRange>* negated_ranges =
   5021             new ZoneList<CharacterRange>(new_length);
   5022         CharacterRange::Negate(ranges, negated_ranges);
   5023         set_first_character_set(negated_ranges);
   5024       } else {
   5025         set_first_character_set(char_class->ranges());
   5026       }
   5027     }
   5028   }
   5029   return budget;
   5030 }
   5031 
   5032 
   5033 
   5034 // -------------------------------------------------------------------
   5035 // Dispatch table construction
   5036 
   5037 
   5038 void DispatchTableConstructor::VisitEnd(EndNode* that) {
   5039   AddRange(CharacterRange::Everything());
   5040 }
   5041 
   5042 
   5043 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
   5044   node->set_being_calculated(true);
   5045   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
   5046   for (int i = 0; i < alternatives->length(); i++) {
   5047     set_choice_index(i);
   5048     alternatives->at(i).node()->Accept(this);
   5049   }
   5050   node->set_being_calculated(false);
   5051 }
   5052 
   5053 
   5054 class AddDispatchRange {
   5055  public:
   5056   explicit AddDispatchRange(DispatchTableConstructor* constructor)
   5057     : constructor_(constructor) { }
   5058   void Call(uc32 from, DispatchTable::Entry entry);
   5059  private:
   5060   DispatchTableConstructor* constructor_;
   5061 };
   5062 
   5063 
   5064 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
   5065   CharacterRange range(from, entry.to());
   5066   constructor_->AddRange(range);
   5067 }
   5068 
   5069 
   5070 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
   5071   if (node->being_calculated())
   5072     return;
   5073   DispatchTable* table = node->GetTable(ignore_case_);
   5074   AddDispatchRange adder(this);
   5075   table->ForEach(&adder);
   5076 }
   5077 
   5078 
   5079 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
   5080   // TODO(160): Find the node that we refer back to and propagate its start
   5081   // set back to here.  For now we just accept anything.
   5082   AddRange(CharacterRange::Everything());
   5083 }
   5084 
   5085 
   5086 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
   5087   RegExpNode* target = that->on_success();
   5088   target->Accept(this);
   5089 }
   5090 
   5091 
   5092 static int CompareRangeByFrom(const CharacterRange* a,
   5093                               const CharacterRange* b) {
   5094   return Compare<uc16>(a->from(), b->from());
   5095 }
   5096 
   5097 
   5098 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
   5099   ranges->Sort(CompareRangeByFrom);
   5100   uc16 last = 0;
   5101   for (int i = 0; i < ranges->length(); i++) {
   5102     CharacterRange range = ranges->at(i);
   5103     if (last < range.from())
   5104       AddRange(CharacterRange(last, range.from() - 1));
   5105     if (range.to() >= last) {
   5106       if (range.to() == String::kMaxUC16CharCode) {
   5107         return;
   5108       } else {
   5109         last = range.to() + 1;
   5110       }
   5111     }
   5112   }
   5113   AddRange(CharacterRange(last, String::kMaxUC16CharCode));
   5114 }
   5115 
   5116 
   5117 void DispatchTableConstructor::VisitText(TextNode* that) {
   5118   TextElement elm = that->elements()->at(0);
   5119   switch (elm.type) {
   5120     case TextElement::ATOM: {
   5121       uc16 c = elm.data.u_atom->data()[0];
   5122       AddRange(CharacterRange(c, c));
   5123       break;
   5124     }
   5125     case TextElement::CHAR_CLASS: {
   5126       RegExpCharacterClass* tree = elm.data.u_char_class;
   5127       ZoneList<CharacterRange>* ranges = tree->ranges();
   5128       if (tree->is_negated()) {
   5129         AddInverse(ranges);
   5130       } else {
   5131         for (int i = 0; i < ranges->length(); i++)
   5132           AddRange(ranges->at(i));
   5133       }
   5134       break;
   5135     }
   5136     default: {
   5137       UNIMPLEMENTED();
   5138     }
   5139   }
   5140 }
   5141 
   5142 
   5143 void DispatchTableConstructor::VisitAction(ActionNode* that) {
   5144   RegExpNode* target = that->on_success();
   5145   target->Accept(this);
   5146 }
   5147 
   5148 
   5149 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
   5150                                                       bool ignore_case,
   5151                                                       bool is_multiline,
   5152                                                       Handle<String> pattern,
   5153                                                       bool is_ascii) {
   5154   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
   5155     return IrregexpRegExpTooBig();
   5156   }
   5157   RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
   5158   // Wrap the body of the regexp in capture #0.
   5159   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
   5160                                                     0,
   5161                                                     &compiler,
   5162                                                     compiler.accept());
   5163   RegExpNode* node = captured_body;
   5164   if (!data->tree->IsAnchored()) {
   5165     // Add a .*? at the beginning, outside the body capture, unless
   5166     // this expression is anchored at the beginning.
   5167     RegExpNode* loop_node =
   5168         RegExpQuantifier::ToNode(0,
   5169                                  RegExpTree::kInfinity,
   5170                                  false,
   5171                                  new RegExpCharacterClass('*'),
   5172                                  &compiler,
   5173                                  captured_body,
   5174                                  data->contains_anchor);
   5175 
   5176     if (data->contains_anchor) {
   5177       // Unroll loop once, to take care of the case that might start
   5178       // at the start of input.
   5179       ChoiceNode* first_step_node = new ChoiceNode(2);
   5180       first_step_node->AddAlternative(GuardedAlternative(captured_body));
   5181       first_step_node->AddAlternative(GuardedAlternative(
   5182           new TextNode(new RegExpCharacterClass('*'), loop_node)));
   5183       node = first_step_node;
   5184     } else {
   5185       node = loop_node;
   5186     }
   5187   }
   5188   data->node = node;
   5189   Analysis analysis(ignore_case, is_ascii);
   5190   analysis.EnsureAnalyzed(node);
   5191   if (analysis.has_failed()) {
   5192     const char* error_message = analysis.error_message();
   5193     return CompilationResult(error_message);
   5194   }
   5195 
   5196   NodeInfo info = *node->info();
   5197 
   5198   // Create the correct assembler for the architecture.
   5199 #ifdef V8_NATIVE_REGEXP
   5200   // Native regexp implementation.
   5201 
   5202   NativeRegExpMacroAssembler::Mode mode =
   5203       is_ascii ? NativeRegExpMacroAssembler::ASCII
   5204                : NativeRegExpMacroAssembler::UC16;
   5205 
   5206 #if V8_TARGET_ARCH_IA32
   5207   RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
   5208 #elif V8_TARGET_ARCH_X64
   5209   RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
   5210 #elif V8_TARGET_ARCH_ARM
   5211   RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
   5212 #endif
   5213 
   5214 #else  // ! V8_NATIVE_REGEXP
   5215   // Interpreted regexp implementation.
   5216   EmbeddedVector<byte, 1024> codes;
   5217   RegExpMacroAssemblerIrregexp macro_assembler(codes);
   5218 #endif
   5219 
   5220   return compiler.Assemble(&macro_assembler,
   5221                            node,
   5222                            data->capture_count,
   5223                            pattern);
   5224 }
   5225 
   5226 
   5227 int OffsetsVector::static_offsets_vector_[
   5228     OffsetsVector::kStaticOffsetsVectorSize];
   5229 
   5230 }}  // namespace v8::internal
   5231