Home | History | Annotate | Download | only in regexp
      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/regexp/regexp-macro-assembler.h"
      6 
      7 #include "src/assembler.h"
      8 #include "src/isolate-inl.h"
      9 #include "src/regexp/regexp-stack.h"
     10 #include "src/simulator.h"
     11 
     12 #ifdef V8_I18N_SUPPORT
     13 #include "unicode/uchar.h"
     14 #endif  // V8_I18N_SUPPORT
     15 
     16 namespace v8 {
     17 namespace internal {
     18 
     19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
     20     : slow_safe_compiler_(false),
     21       global_mode_(NOT_GLOBAL),
     22       isolate_(isolate),
     23       zone_(zone) {}
     24 
     25 
     26 RegExpMacroAssembler::~RegExpMacroAssembler() {
     27 }
     28 
     29 
     30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
     31                                                      Address byte_offset2,
     32                                                      size_t byte_length,
     33                                                      Isolate* isolate) {
     34   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
     35       isolate->regexp_macro_assembler_canonicalize();
     36   // This function is not allowed to cause a garbage collection.
     37   // A GC might move the calling generated code and invalidate the
     38   // return address on the stack.
     39   DCHECK(byte_length % 2 == 0);
     40   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
     41   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
     42   size_t length = byte_length >> 1;
     43 
     44 #ifdef V8_I18N_SUPPORT
     45   if (isolate == nullptr) {
     46     for (size_t i = 0; i < length; i++) {
     47       uc32 c1 = substring1[i];
     48       uc32 c2 = substring2[i];
     49       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
     50         // Non-BMP characters do not have case-equivalents in the BMP.
     51         // Both have to be non-BMP for them to be able to match.
     52         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
     53         if (i + 1 < length) {
     54           uc16 c1t = substring1[i + 1];
     55           uc16 c2t = substring2[i + 1];
     56           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
     57               unibrow::Utf16::IsTrailSurrogate(c2t)) {
     58             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
     59             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
     60             i++;
     61           }
     62         }
     63       }
     64       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
     65       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
     66       if (c1 != c2) return 0;
     67     }
     68     return 1;
     69   }
     70 #endif  // V8_I18N_SUPPORT
     71   DCHECK_NOT_NULL(isolate);
     72   for (size_t i = 0; i < length; i++) {
     73     unibrow::uchar c1 = substring1[i];
     74     unibrow::uchar c2 = substring2[i];
     75     if (c1 != c2) {
     76       unibrow::uchar s1[1] = {c1};
     77       canonicalize->get(c1, '\0', s1);
     78       if (s1[0] != c2) {
     79         unibrow::uchar s2[1] = {c2};
     80         canonicalize->get(c2, '\0', s2);
     81         if (s1[0] != s2[0]) {
     82           return 0;
     83         }
     84       }
     85     }
     86   }
     87   return 1;
     88 }
     89 
     90 
     91 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
     92                                                    Label* on_failure) {
     93   Label ok;
     94   // Check that current character is not a trail surrogate.
     95   LoadCurrentCharacter(cp_offset, &ok);
     96   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
     97   // Check that previous character is not a lead surrogate.
     98   LoadCurrentCharacter(cp_offset - 1, &ok);
     99   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
    100   Bind(&ok);
    101 }
    102 
    103 void RegExpMacroAssembler::CheckPosition(int cp_offset,
    104                                          Label* on_outside_input) {
    105   LoadCurrentCharacter(cp_offset, on_outside_input, true);
    106 }
    107 
    108 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
    109                                                       Label* on_no_match) {
    110   return false;
    111 }
    112 
    113 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
    114 
    115 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
    116                                                        Zone* zone)
    117     : RegExpMacroAssembler(isolate, zone) {}
    118 
    119 
    120 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
    121 }
    122 
    123 
    124 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
    125   return FLAG_enable_unaligned_accesses && !slow_safe();
    126 }
    127 
    128 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
    129     String* subject,
    130     int start_index) {
    131   if (subject->IsConsString()) {
    132     subject = ConsString::cast(subject)->first();
    133   } else if (subject->IsSlicedString()) {
    134     start_index += SlicedString::cast(subject)->offset();
    135     subject = SlicedString::cast(subject)->parent();
    136   }
    137   DCHECK(start_index >= 0);
    138   DCHECK(start_index <= subject->length());
    139   if (subject->IsSeqOneByteString()) {
    140     return reinterpret_cast<const byte*>(
    141         SeqOneByteString::cast(subject)->GetChars() + start_index);
    142   } else if (subject->IsSeqTwoByteString()) {
    143     return reinterpret_cast<const byte*>(
    144         SeqTwoByteString::cast(subject)->GetChars() + start_index);
    145   } else if (subject->IsExternalOneByteString()) {
    146     return reinterpret_cast<const byte*>(
    147         ExternalOneByteString::cast(subject)->GetChars() + start_index);
    148   } else {
    149     return reinterpret_cast<const byte*>(
    150         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
    151   }
    152 }
    153 
    154 
    155 int NativeRegExpMacroAssembler::CheckStackGuardState(
    156     Isolate* isolate, int start_index, bool is_direct_call,
    157     Address* return_address, Code* re_code, String** subject,
    158     const byte** input_start, const byte** input_end) {
    159   DCHECK(re_code->instruction_start() <= *return_address);
    160   DCHECK(*return_address <= re_code->instruction_end());
    161   int return_value = 0;
    162   // Prepare for possible GC.
    163   HandleScope handles(isolate);
    164   Handle<Code> code_handle(re_code);
    165   Handle<String> subject_handle(*subject);
    166   bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
    167 
    168   StackLimitCheck check(isolate);
    169   if (check.JsHasOverflowed()) {
    170     isolate->StackOverflow();
    171     return_value = EXCEPTION;
    172   } else if (is_direct_call) {
    173     // If not real stack overflow the stack guard was used to interrupt
    174     // execution for another purpose.  If this is a direct call from JavaScript
    175     // retry the RegExp forcing the call through the runtime system.
    176     // Currently the direct call cannot handle a GC.
    177     return_value = RETRY;
    178   } else {
    179     Object* result = isolate->stack_guard()->HandleInterrupts();
    180     if (result->IsException(isolate)) return_value = EXCEPTION;
    181   }
    182 
    183   DisallowHeapAllocation no_gc;
    184 
    185   if (*code_handle != re_code) {  // Return address no longer valid
    186     intptr_t delta = code_handle->address() - re_code->address();
    187     // Overwrite the return address on the stack.
    188     *return_address += delta;
    189   }
    190 
    191   // If we continue, we need to update the subject string addresses.
    192   if (return_value == 0) {
    193     // String encoding might have changed.
    194     if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
    195       // If we changed between an LATIN1 and an UC16 string, the specialized
    196       // code cannot be used, and we need to restart regexp matching from
    197       // scratch (including, potentially, compiling a new version of the code).
    198       return_value = RETRY;
    199     } else {
    200       *subject = *subject_handle;
    201       intptr_t byte_length = *input_end - *input_start;
    202       *input_start = StringCharacterPosition(*subject, start_index);
    203       *input_end = *input_start + byte_length;
    204     }
    205   }
    206   return return_value;
    207 }
    208 
    209 
    210 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
    211     Handle<Code> regexp_code,
    212     Handle<String> subject,
    213     int* offsets_vector,
    214     int offsets_vector_length,
    215     int previous_index,
    216     Isolate* isolate) {
    217 
    218   DCHECK(subject->IsFlat());
    219   DCHECK(previous_index >= 0);
    220   DCHECK(previous_index <= subject->length());
    221 
    222   // No allocations before calling the regexp, but we can't use
    223   // DisallowHeapAllocation, since regexps might be preempted, and another
    224   // thread might do allocation anyway.
    225 
    226   String* subject_ptr = *subject;
    227   // Character offsets into string.
    228   int start_offset = previous_index;
    229   int char_length = subject_ptr->length() - start_offset;
    230   int slice_offset = 0;
    231 
    232   // The string has been flattened, so if it is a cons string it contains the
    233   // full string in the first part.
    234   if (StringShape(subject_ptr).IsCons()) {
    235     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
    236     subject_ptr = ConsString::cast(subject_ptr)->first();
    237   } else if (StringShape(subject_ptr).IsSliced()) {
    238     SlicedString* slice = SlicedString::cast(subject_ptr);
    239     subject_ptr = slice->parent();
    240     slice_offset = slice->offset();
    241   }
    242   // Ensure that an underlying string has the same representation.
    243   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
    244   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
    245   // String is now either Sequential or External
    246   int char_size_shift = is_one_byte ? 0 : 1;
    247 
    248   const byte* input_start =
    249       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
    250   int byte_length = char_length << char_size_shift;
    251   const byte* input_end = input_start + byte_length;
    252   Result res = Execute(*regexp_code,
    253                        *subject,
    254                        start_offset,
    255                        input_start,
    256                        input_end,
    257                        offsets_vector,
    258                        offsets_vector_length,
    259                        isolate);
    260   return res;
    261 }
    262 
    263 
    264 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
    265     Code* code,
    266     String* input,  // This needs to be the unpacked (sliced, cons) string.
    267     int start_offset,
    268     const byte* input_start,
    269     const byte* input_end,
    270     int* output,
    271     int output_size,
    272     Isolate* isolate) {
    273   // Ensure that the minimum stack has been allocated.
    274   RegExpStackScope stack_scope(isolate);
    275   Address stack_base = stack_scope.stack()->stack_base();
    276 
    277   int direct_call = 0;
    278   int result = CALL_GENERATED_REGEXP_CODE(
    279       isolate, code->entry(), input, start_offset, input_start, input_end,
    280       output, output_size, stack_base, direct_call, isolate);
    281   DCHECK(result >= RETRY);
    282 
    283   if (result == EXCEPTION && !isolate->has_pending_exception()) {
    284     // We detected a stack overflow (on the backtrack stack) in RegExp code,
    285     // but haven't created the exception yet.
    286     isolate->StackOverflow();
    287   }
    288   return static_cast<Result>(result);
    289 }
    290 
    291 
// Lookup table with one entry per character code 0..255. Entries for the
// ASCII digits '0'-'9', the letters 'A'-'Z' and 'a'-'z', and '_' are 0xff;
// every other entry, including the whole upper (Latin-1) half, is 0x00.
// Each row below holds eight consecutive character codes.
const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
    0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'

    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'

    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
    333 
    334 
    335 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
    336                                               Address* stack_base,
    337                                               Isolate* isolate) {
    338   RegExpStack* regexp_stack = isolate->regexp_stack();
    339   size_t size = regexp_stack->stack_capacity();
    340   Address old_stack_base = regexp_stack->stack_base();
    341   DCHECK(old_stack_base == *stack_base);
    342   DCHECK(stack_pointer <= old_stack_base);
    343   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
    344   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
    345   if (new_stack_base == NULL) {
    346     return NULL;
    347   }
    348   *stack_base = new_stack_base;
    349   intptr_t stack_content_size = old_stack_base - stack_pointer;
    350   return new_stack_base - stack_content_size;
    351 }
    352 
    353 #endif  // V8_INTERPRETED_REGEXP
    354 
    355 }  // namespace internal
    356 }  // namespace v8
    357