Home | History | Annotate | Download | only in regexp
      1 // Copyright 2012 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/regexp/regexp-macro-assembler.h"
      6 
      7 #include "src/assembler.h"
      8 #include "src/isolate-inl.h"
      9 #include "src/regexp/regexp-stack.h"
     10 #include "src/simulator.h"
     11 #include "src/unicode-inl.h"
     12 
     13 #ifdef V8_INTL_SUPPORT
     14 #include "unicode/uchar.h"
     15 #endif  // V8_INTL_SUPPORT
     16 
     17 namespace v8 {
     18 namespace internal {
     19 
     20 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
     21     : slow_safe_compiler_(false),
     22       global_mode_(NOT_GLOBAL),
     23       isolate_(isolate),
     24       zone_(zone) {}
     25 
     26 
     27 RegExpMacroAssembler::~RegExpMacroAssembler() {
     28 }
     29 
     30 
     31 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
     32                                                      Address byte_offset2,
     33                                                      size_t byte_length,
     34                                                      Isolate* isolate) {
     35   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
     36       isolate->regexp_macro_assembler_canonicalize();
     37   // This function is not allowed to cause a garbage collection.
     38   // A GC might move the calling generated code and invalidate the
     39   // return address on the stack.
     40   DCHECK_EQ(0, byte_length % 2);
     41   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
     42   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
     43   size_t length = byte_length >> 1;
     44 
     45 #ifdef V8_INTL_SUPPORT
     46   if (isolate == nullptr) {
     47     for (size_t i = 0; i < length; i++) {
     48       uc32 c1 = substring1[i];
     49       uc32 c2 = substring2[i];
     50       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
     51         // Non-BMP characters do not have case-equivalents in the BMP.
     52         // Both have to be non-BMP for them to be able to match.
     53         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
     54         if (i + 1 < length) {
     55           uc16 c1t = substring1[i + 1];
     56           uc16 c2t = substring2[i + 1];
     57           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
     58               unibrow::Utf16::IsTrailSurrogate(c2t)) {
     59             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
     60             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
     61             i++;
     62           }
     63         }
     64       }
     65       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
     66       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
     67       if (c1 != c2) return 0;
     68     }
     69     return 1;
     70   }
     71 #endif  // V8_INTL_SUPPORT
     72   DCHECK_NOT_NULL(isolate);
     73   for (size_t i = 0; i < length; i++) {
     74     unibrow::uchar c1 = substring1[i];
     75     unibrow::uchar c2 = substring2[i];
     76     if (c1 != c2) {
     77       unibrow::uchar s1[1] = {c1};
     78       canonicalize->get(c1, '\0', s1);
     79       if (s1[0] != c2) {
     80         unibrow::uchar s2[1] = {c2};
     81         canonicalize->get(c2, '\0', s2);
     82         if (s1[0] != s2[0]) {
     83           return 0;
     84         }
     85       }
     86     }
     87   }
     88   return 1;
     89 }
     90 
     91 
     92 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
     93                                                    Label* on_failure) {
     94   Label ok;
     95   // Check that current character is not a trail surrogate.
     96   LoadCurrentCharacter(cp_offset, &ok);
     97   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
     98   // Check that previous character is not a lead surrogate.
     99   LoadCurrentCharacter(cp_offset - 1, &ok);
    100   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
    101   Bind(&ok);
    102 }
    103 
    104 void RegExpMacroAssembler::CheckPosition(int cp_offset,
    105                                          Label* on_outside_input) {
    106   LoadCurrentCharacter(cp_offset, on_outside_input, true);
    107 }
    108 
    109 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
    110                                                       Label* on_no_match) {
    111   return false;
    112 }
    113 
    114 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
    115 
    116 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
    117                                                        Zone* zone)
    118     : RegExpMacroAssembler(isolate, zone) {}
    119 
    120 
    121 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
    122 }
    123 
    124 
    125 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
    126   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
    127 }
    128 
    129 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
    130     String* subject,
    131     int start_index) {
    132   if (subject->IsConsString()) {
    133     subject = ConsString::cast(subject)->first();
    134   } else if (subject->IsSlicedString()) {
    135     start_index += SlicedString::cast(subject)->offset();
    136     subject = SlicedString::cast(subject)->parent();
    137   }
    138   if (subject->IsThinString()) {
    139     subject = ThinString::cast(subject)->actual();
    140   }
    141   DCHECK_LE(0, start_index);
    142   DCHECK_LE(start_index, subject->length());
    143   if (subject->IsSeqOneByteString()) {
    144     return reinterpret_cast<const byte*>(
    145         SeqOneByteString::cast(subject)->GetChars() + start_index);
    146   } else if (subject->IsSeqTwoByteString()) {
    147     return reinterpret_cast<const byte*>(
    148         SeqTwoByteString::cast(subject)->GetChars() + start_index);
    149   } else if (subject->IsExternalOneByteString()) {
    150     return reinterpret_cast<const byte*>(
    151         ExternalOneByteString::cast(subject)->GetChars() + start_index);
    152   } else {
    153     DCHECK(subject->IsExternalTwoByteString());
    154     return reinterpret_cast<const byte*>(
    155         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
    156   }
    157 }
    158 
    159 
    160 int NativeRegExpMacroAssembler::CheckStackGuardState(
    161     Isolate* isolate, int start_index, bool is_direct_call,
    162     Address* return_address, Code* re_code, String** subject,
    163     const byte** input_start, const byte** input_end) {
    164   DCHECK(re_code->raw_instruction_start() <= *return_address);
    165   DCHECK(*return_address <= re_code->raw_instruction_end());
    166   int return_value = 0;
    167   // Prepare for possible GC.
    168   HandleScope handles(isolate);
    169   Handle<Code> code_handle(re_code, isolate);
    170   Handle<String> subject_handle(*subject, isolate);
    171   bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
    172 
    173   StackLimitCheck check(isolate);
    174   bool js_has_overflowed = check.JsHasOverflowed();
    175 
    176   if (is_direct_call) {
    177     // Direct calls from JavaScript can be interrupted in two ways:
    178     // 1. A real stack overflow, in which case we let the caller throw the
    179     //    exception.
    180     // 2. The stack guard was used to interrupt execution for another purpose,
    181     //    forcing the call through the runtime system.
    182     return_value = js_has_overflowed ? EXCEPTION : RETRY;
    183   } else if (js_has_overflowed) {
    184     isolate->StackOverflow();
    185     return_value = EXCEPTION;
    186   } else {
    187     Object* result = isolate->stack_guard()->HandleInterrupts();
    188     if (result->IsException(isolate)) return_value = EXCEPTION;
    189   }
    190 
    191   DisallowHeapAllocation no_gc;
    192 
    193   if (*code_handle != re_code) {  // Return address no longer valid
    194     intptr_t delta = code_handle->address() - re_code->address();
    195     // Overwrite the return address on the stack.
    196     *return_address += delta;
    197   }
    198 
    199   // If we continue, we need to update the subject string addresses.
    200   if (return_value == 0) {
    201     // String encoding might have changed.
    202     if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
    203       // If we changed between an LATIN1 and an UC16 string, the specialized
    204       // code cannot be used, and we need to restart regexp matching from
    205       // scratch (including, potentially, compiling a new version of the code).
    206       return_value = RETRY;
    207     } else {
    208       *subject = *subject_handle;
    209       intptr_t byte_length = *input_end - *input_start;
    210       *input_start = StringCharacterPosition(*subject, start_index);
    211       *input_end = *input_start + byte_length;
    212     }
    213   }
    214   return return_value;
    215 }
    216 
    217 
    218 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
    219     Handle<Code> regexp_code,
    220     Handle<String> subject,
    221     int* offsets_vector,
    222     int offsets_vector_length,
    223     int previous_index,
    224     Isolate* isolate) {
    225 
    226   DCHECK(subject->IsFlat());
    227   DCHECK_LE(0, previous_index);
    228   DCHECK_LE(previous_index, subject->length());
    229 
    230   // No allocations before calling the regexp, but we can't use
    231   // DisallowHeapAllocation, since regexps might be preempted, and another
    232   // thread might do allocation anyway.
    233 
    234   String* subject_ptr = *subject;
    235   // Character offsets into string.
    236   int start_offset = previous_index;
    237   int char_length = subject_ptr->length() - start_offset;
    238   int slice_offset = 0;
    239 
    240   // The string has been flattened, so if it is a cons string it contains the
    241   // full string in the first part.
    242   if (StringShape(subject_ptr).IsCons()) {
    243     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
    244     subject_ptr = ConsString::cast(subject_ptr)->first();
    245   } else if (StringShape(subject_ptr).IsSliced()) {
    246     SlicedString* slice = SlicedString::cast(subject_ptr);
    247     subject_ptr = slice->parent();
    248     slice_offset = slice->offset();
    249   }
    250   if (StringShape(subject_ptr).IsThin()) {
    251     subject_ptr = ThinString::cast(subject_ptr)->actual();
    252   }
    253   // Ensure that an underlying string has the same representation.
    254   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
    255   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
    256   // String is now either Sequential or External
    257   int char_size_shift = is_one_byte ? 0 : 1;
    258 
    259   const byte* input_start =
    260       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
    261   int byte_length = char_length << char_size_shift;
    262   const byte* input_end = input_start + byte_length;
    263   Result res = Execute(*regexp_code,
    264                        *subject,
    265                        start_offset,
    266                        input_start,
    267                        input_end,
    268                        offsets_vector,
    269                        offsets_vector_length,
    270                        isolate);
    271   return res;
    272 }
    273 
    274 
    275 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
    276     Code* code,
    277     String* input,  // This needs to be the unpacked (sliced, cons) string.
    278     int start_offset,
    279     const byte* input_start,
    280     const byte* input_end,
    281     int* output,
    282     int output_size,
    283     Isolate* isolate) {
    284   // Ensure that the minimum stack has been allocated.
    285   RegExpStackScope stack_scope(isolate);
    286   Address stack_base = stack_scope.stack()->stack_base();
    287 
    288   int direct_call = 0;
    289 
    290   using RegexpMatcherSig = int(
    291       String * input, int start_offset,  // NOLINT(readability/casting)
    292       const byte* input_start, const byte* input_end, int* output,
    293       int output_size, Address stack_base, int direct_call, Isolate* isolate);
    294 
    295   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
    296   int result = fn.Call(input, start_offset, input_start, input_end, output,
    297                        output_size, stack_base, direct_call, isolate);
    298   DCHECK(result >= RETRY);
    299 
    300   if (result == EXCEPTION && !isolate->has_pending_exception()) {
    301     // We detected a stack overflow (on the backtrack stack) in RegExp code,
    302     // but haven't created the exception yet.
    303     isolate->StackOverflow();
    304   }
    305   return static_cast<Result>(result);
    306 }
    307 
    308 // clang-format off
    309 const byte NativeRegExpMacroAssembler::word_character_map[] = {
    310     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    311     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    312     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    313     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    314 
    315     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    316     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    317     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
    318     0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
    319 
    320     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
    321     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
    322     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
    323     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'
    324 
    325     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
    326     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
    327     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
    328     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    329     // Latin-1 range
    330     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    331     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    332     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    333     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    334 
    335     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    336     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    337     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    338     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    339 
    340     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    341     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    342     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    343     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    344 
    345     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    346     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    347     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    348     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    349 };
    350 // clang-format on
    351 
    352 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
    353                                               Address* stack_base,
    354                                               Isolate* isolate) {
    355   RegExpStack* regexp_stack = isolate->regexp_stack();
    356   size_t size = regexp_stack->stack_capacity();
    357   Address old_stack_base = regexp_stack->stack_base();
    358   DCHECK(old_stack_base == *stack_base);
    359   DCHECK(stack_pointer <= old_stack_base);
    360   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
    361   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
    362   if (new_stack_base == kNullAddress) {
    363     return kNullAddress;
    364   }
    365   *stack_base = new_stack_base;
    366   intptr_t stack_content_size = old_stack_base - stack_pointer;
    367   return new_stack_base - stack_content_size;
    368 }
    369 
    370 #endif  // V8_INTERPRETED_REGEXP
    371 
    372 }  // namespace internal
    373 }  // namespace v8
    374