Home | History | Annotate | Download | only in regexp
      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/regexp/regexp-macro-assembler.h"
      6 
      7 #include "src/assembler.h"
      8 #include "src/isolate-inl.h"
      9 #include "src/regexp/regexp-stack.h"
     10 #include "src/simulator.h"
     11 
     12 #ifdef V8_I18N_SUPPORT
     13 #include "unicode/uchar.h"
     14 #endif  // V8_I18N_SUPPORT
     15 
     16 namespace v8 {
     17 namespace internal {
     18 
     19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
     20     : slow_safe_compiler_(false),
     21       global_mode_(NOT_GLOBAL),
     22       isolate_(isolate),
     23       zone_(zone) {}
     24 
     25 
     26 RegExpMacroAssembler::~RegExpMacroAssembler() {
     27 }
     28 
     29 
     30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
     31                                                      Address byte_offset2,
     32                                                      size_t byte_length,
     33                                                      Isolate* isolate) {
     34   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
     35       isolate->regexp_macro_assembler_canonicalize();
     36   // This function is not allowed to cause a garbage collection.
     37   // A GC might move the calling generated code and invalidate the
     38   // return address on the stack.
     39   DCHECK(byte_length % 2 == 0);
     40   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
     41   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
     42   size_t length = byte_length >> 1;
     43 
     44 #ifdef V8_I18N_SUPPORT
     45   if (isolate == nullptr) {
     46     for (size_t i = 0; i < length; i++) {
     47       uc32 c1 = substring1[i];
     48       uc32 c2 = substring2[i];
     49       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
     50         // Non-BMP characters do not have case-equivalents in the BMP.
     51         // Both have to be non-BMP for them to be able to match.
     52         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
     53         if (i + 1 < length) {
     54           uc16 c1t = substring1[i + 1];
     55           uc16 c2t = substring2[i + 1];
     56           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
     57               unibrow::Utf16::IsTrailSurrogate(c2t)) {
     58             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
     59             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
     60             i++;
     61           }
     62         }
     63       }
     64       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
     65       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
     66       if (c1 != c2) return 0;
     67     }
     68     return 1;
     69   }
     70 #endif  // V8_I18N_SUPPORT
     71   DCHECK_NOT_NULL(isolate);
     72   for (size_t i = 0; i < length; i++) {
     73     unibrow::uchar c1 = substring1[i];
     74     unibrow::uchar c2 = substring2[i];
     75     if (c1 != c2) {
     76       unibrow::uchar s1[1] = {c1};
     77       canonicalize->get(c1, '\0', s1);
     78       if (s1[0] != c2) {
     79         unibrow::uchar s2[1] = {c2};
     80         canonicalize->get(c2, '\0', s2);
     81         if (s1[0] != s2[0]) {
     82           return 0;
     83         }
     84       }
     85     }
     86   }
     87   return 1;
     88 }
     89 
     90 
     91 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
     92                                                    Label* on_failure) {
     93   Label ok;
     94   // Check that current character is not a trail surrogate.
     95   LoadCurrentCharacter(cp_offset, &ok);
     96   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
     97   // Check that previous character is not a lead surrogate.
     98   LoadCurrentCharacter(cp_offset - 1, &ok);
     99   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
    100   Bind(&ok);
    101 }
    102 
    103 void RegExpMacroAssembler::CheckPosition(int cp_offset,
    104                                          Label* on_outside_input) {
    105   LoadCurrentCharacter(cp_offset, on_outside_input, true);
    106 }
    107 
    108 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
    109                                                       Label* on_no_match) {
    110   return false;
    111 }
    112 
    113 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
    114 
    115 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
    116                                                        Zone* zone)
    117     : RegExpMacroAssembler(isolate, zone) {}
    118 
    119 
    120 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
    121 }
    122 
    123 
    124 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
    125   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
    126 }
    127 
    128 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
    129     String* subject,
    130     int start_index) {
    131   if (subject->IsConsString()) {
    132     subject = ConsString::cast(subject)->first();
    133   } else if (subject->IsSlicedString()) {
    134     start_index += SlicedString::cast(subject)->offset();
    135     subject = SlicedString::cast(subject)->parent();
    136   }
    137   if (subject->IsThinString()) {
    138     subject = ThinString::cast(subject)->actual();
    139   }
    140   DCHECK(start_index >= 0);
    141   DCHECK(start_index <= subject->length());
    142   if (subject->IsSeqOneByteString()) {
    143     return reinterpret_cast<const byte*>(
    144         SeqOneByteString::cast(subject)->GetChars() + start_index);
    145   } else if (subject->IsSeqTwoByteString()) {
    146     return reinterpret_cast<const byte*>(
    147         SeqTwoByteString::cast(subject)->GetChars() + start_index);
    148   } else if (subject->IsExternalOneByteString()) {
    149     return reinterpret_cast<const byte*>(
    150         ExternalOneByteString::cast(subject)->GetChars() + start_index);
    151   } else {
    152     DCHECK(subject->IsExternalTwoByteString());
    153     return reinterpret_cast<const byte*>(
    154         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
    155   }
    156 }
    157 
    158 
    159 int NativeRegExpMacroAssembler::CheckStackGuardState(
    160     Isolate* isolate, int start_index, bool is_direct_call,
    161     Address* return_address, Code* re_code, String** subject,
    162     const byte** input_start, const byte** input_end) {
    163   DCHECK(re_code->instruction_start() <= *return_address);
    164   DCHECK(*return_address <= re_code->instruction_end());
    165   int return_value = 0;
    166   // Prepare for possible GC.
    167   HandleScope handles(isolate);
    168   Handle<Code> code_handle(re_code);
    169   Handle<String> subject_handle(*subject);
    170   bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
    171 
    172   StackLimitCheck check(isolate);
    173   if (check.JsHasOverflowed()) {
    174     isolate->StackOverflow();
    175     return_value = EXCEPTION;
    176   } else if (is_direct_call) {
    177     // If not real stack overflow the stack guard was used to interrupt
    178     // execution for another purpose.  If this is a direct call from JavaScript
    179     // retry the RegExp forcing the call through the runtime system.
    180     // Currently the direct call cannot handle a GC.
    181     return_value = RETRY;
    182   } else {
    183     Object* result = isolate->stack_guard()->HandleInterrupts();
    184     if (result->IsException(isolate)) return_value = EXCEPTION;
    185   }
    186 
    187   DisallowHeapAllocation no_gc;
    188 
    189   if (*code_handle != re_code) {  // Return address no longer valid
    190     intptr_t delta = code_handle->address() - re_code->address();
    191     // Overwrite the return address on the stack.
    192     *return_address += delta;
    193   }
    194 
    195   // If we continue, we need to update the subject string addresses.
    196   if (return_value == 0) {
    197     // String encoding might have changed.
    198     if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
    199       // If we changed between an LATIN1 and an UC16 string, the specialized
    200       // code cannot be used, and we need to restart regexp matching from
    201       // scratch (including, potentially, compiling a new version of the code).
    202       return_value = RETRY;
    203     } else {
    204       *subject = *subject_handle;
    205       intptr_t byte_length = *input_end - *input_start;
    206       *input_start = StringCharacterPosition(*subject, start_index);
    207       *input_end = *input_start + byte_length;
    208     }
    209   }
    210   return return_value;
    211 }
    212 
    213 
    214 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
    215     Handle<Code> regexp_code,
    216     Handle<String> subject,
    217     int* offsets_vector,
    218     int offsets_vector_length,
    219     int previous_index,
    220     Isolate* isolate) {
    221 
    222   DCHECK(subject->IsFlat());
    223   DCHECK(previous_index >= 0);
    224   DCHECK(previous_index <= subject->length());
    225 
    226   // No allocations before calling the regexp, but we can't use
    227   // DisallowHeapAllocation, since regexps might be preempted, and another
    228   // thread might do allocation anyway.
    229 
    230   String* subject_ptr = *subject;
    231   // Character offsets into string.
    232   int start_offset = previous_index;
    233   int char_length = subject_ptr->length() - start_offset;
    234   int slice_offset = 0;
    235 
    236   // The string has been flattened, so if it is a cons string it contains the
    237   // full string in the first part.
    238   if (StringShape(subject_ptr).IsCons()) {
    239     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
    240     subject_ptr = ConsString::cast(subject_ptr)->first();
    241   } else if (StringShape(subject_ptr).IsSliced()) {
    242     SlicedString* slice = SlicedString::cast(subject_ptr);
    243     subject_ptr = slice->parent();
    244     slice_offset = slice->offset();
    245   }
    246   if (StringShape(subject_ptr).IsThin()) {
    247     subject_ptr = ThinString::cast(subject_ptr)->actual();
    248   }
    249   // Ensure that an underlying string has the same representation.
    250   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
    251   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
    252   // String is now either Sequential or External
    253   int char_size_shift = is_one_byte ? 0 : 1;
    254 
    255   const byte* input_start =
    256       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
    257   int byte_length = char_length << char_size_shift;
    258   const byte* input_end = input_start + byte_length;
    259   Result res = Execute(*regexp_code,
    260                        *subject,
    261                        start_offset,
    262                        input_start,
    263                        input_end,
    264                        offsets_vector,
    265                        offsets_vector_length,
    266                        isolate);
    267   return res;
    268 }
    269 
    270 
    271 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
    272     Code* code,
    273     String* input,  // This needs to be the unpacked (sliced, cons) string.
    274     int start_offset,
    275     const byte* input_start,
    276     const byte* input_end,
    277     int* output,
    278     int output_size,
    279     Isolate* isolate) {
    280   // Ensure that the minimum stack has been allocated.
    281   RegExpStackScope stack_scope(isolate);
    282   Address stack_base = stack_scope.stack()->stack_base();
    283 
    284   int direct_call = 0;
    285   int result = CALL_GENERATED_REGEXP_CODE(
    286       isolate, code->entry(), input, start_offset, input_start, input_end,
    287       output, output_size, stack_base, direct_call, isolate);
    288   DCHECK(result >= RETRY);
    289 
    290   if (result == EXCEPTION && !isolate->has_pending_exception()) {
    291     // We detected a stack overflow (on the backtrack stack) in RegExp code,
    292     // but haven't created the exception yet.
    293     isolate->StackOverflow();
    294   }
    295   return static_cast<Result>(result);
    296 }
    297 
    298 
    299 const byte NativeRegExpMacroAssembler::word_character_map[] = {
    300     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    301     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    302     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    303     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    304 
    305     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    306     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    307     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
    308     0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
    309 
    310     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
    311     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
    312     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
    313     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
    314 
    315     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
    316     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
    317     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
    318     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    319     // Latin-1 range
    320     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    321     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    322     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    323     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    324 
    325     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    326     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    327     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    328     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    329 
    330     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    331     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    332     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    333     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    334 
    335     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    336     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    337     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    338     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    339 };
    340 
    341 
    342 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
    343                                               Address* stack_base,
    344                                               Isolate* isolate) {
    345   RegExpStack* regexp_stack = isolate->regexp_stack();
    346   size_t size = regexp_stack->stack_capacity();
    347   Address old_stack_base = regexp_stack->stack_base();
    348   DCHECK(old_stack_base == *stack_base);
    349   DCHECK(stack_pointer <= old_stack_base);
    350   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
    351   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
    352   if (new_stack_base == NULL) {
    353     return NULL;
    354   }
    355   *stack_base = new_stack_base;
    356   intptr_t stack_content_size = old_stack_base - stack_pointer;
    357   return new_stack_base - stack_content_size;
    358 }
    359 
    360 #endif  // V8_INTERPRETED_REGEXP
    361 
    362 }  // namespace internal
    363 }  // namespace v8
    364