1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/regexp/regexp-macro-assembler.h" 6 7 #include "src/assembler.h" 8 #include "src/isolate-inl.h" 9 #include "src/regexp/regexp-stack.h" 10 #include "src/simulator.h" 11 #include "src/unicode-inl.h" 12 13 #ifdef V8_INTL_SUPPORT 14 #include "unicode/uchar.h" 15 #endif // V8_INTL_SUPPORT 16 17 namespace v8 { 18 namespace internal { 19 20 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) 21 : slow_safe_compiler_(false), 22 global_mode_(NOT_GLOBAL), 23 isolate_(isolate), 24 zone_(zone) {} 25 26 27 RegExpMacroAssembler::~RegExpMacroAssembler() { 28 } 29 30 31 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, 32 Address byte_offset2, 33 size_t byte_length, 34 Isolate* isolate) { 35 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 36 isolate->regexp_macro_assembler_canonicalize(); 37 // This function is not allowed to cause a garbage collection. 38 // A GC might move the calling generated code and invalidate the 39 // return address on the stack. 40 DCHECK_EQ(0, byte_length % 2); 41 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1); 42 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2); 43 size_t length = byte_length >> 1; 44 45 #ifdef V8_INTL_SUPPORT 46 if (isolate == nullptr) { 47 for (size_t i = 0; i < length; i++) { 48 uc32 c1 = substring1[i]; 49 uc32 c2 = substring2[i]; 50 if (unibrow::Utf16::IsLeadSurrogate(c1)) { 51 // Non-BMP characters do not have case-equivalents in the BMP. 52 // Both have to be non-BMP for them to be able to match. 53 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0; 54 if (i + 1 < length) { 55 uc16 c1t = substring1[i + 1]; 56 uc16 c2t = substring2[i + 1]; 57 if (unibrow::Utf16::IsTrailSurrogate(c1t) && 58 unibrow::Utf16::IsTrailSurrogate(c2t)) { 59 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t); 60 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t); 61 i++; 62 } 63 } 64 } 65 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT); 66 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT); 67 if (c1 != c2) return 0; 68 } 69 return 1; 70 } 71 #endif // V8_INTL_SUPPORT 72 DCHECK_NOT_NULL(isolate); 73 for (size_t i = 0; i < length; i++) { 74 unibrow::uchar c1 = substring1[i]; 75 unibrow::uchar c2 = substring2[i]; 76 if (c1 != c2) { 77 unibrow::uchar s1[1] = {c1}; 78 canonicalize->get(c1, '\0', s1); 79 if (s1[0] != c2) { 80 unibrow::uchar s2[1] = {c2}; 81 canonicalize->get(c2, '\0', s2); 82 if (s1[0] != s2[0]) { 83 return 0; 84 } 85 } 86 } 87 } 88 return 1; 89 } 90 91 92 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, 93 Label* on_failure) { 94 Label ok; 95 // Check that current character is not a trail surrogate. 96 LoadCurrentCharacter(cp_offset, &ok); 97 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok); 98 // Check that previous character is not a lead surrogate. 99 LoadCurrentCharacter(cp_offset - 1, &ok); 100 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure); 101 Bind(&ok); 102 } 103 104 void RegExpMacroAssembler::CheckPosition(int cp_offset, 105 Label* on_outside_input) { 106 LoadCurrentCharacter(cp_offset, on_outside_input, true); 107 } 108 109 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type, 110 Label* on_no_match) { 111 return false; 112 } 113 114 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM. 115 116 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, 117 Zone* zone) 118 : RegExpMacroAssembler(isolate, zone) {} 119 120 121 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() { 122 } 123 124 125 bool NativeRegExpMacroAssembler::CanReadUnaligned() { 126 return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); 127 } 128 129 const byte* NativeRegExpMacroAssembler::StringCharacterPosition( 130 String* subject, 131 int start_index) { 132 if (subject->IsConsString()) { 133 subject = ConsString::cast(subject)->first(); 134 } else if (subject->IsSlicedString()) { 135 start_index += SlicedString::cast(subject)->offset(); 136 subject = SlicedString::cast(subject)->parent(); 137 } 138 if (subject->IsThinString()) { 139 subject = ThinString::cast(subject)->actual(); 140 } 141 DCHECK_LE(0, start_index); 142 DCHECK_LE(start_index, subject->length()); 143 if (subject->IsSeqOneByteString()) { 144 return reinterpret_cast<const byte*>( 145 SeqOneByteString::cast(subject)->GetChars() + start_index); 146 } else if (subject->IsSeqTwoByteString()) { 147 return reinterpret_cast<const byte*>( 148 SeqTwoByteString::cast(subject)->GetChars() + start_index); 149 } else if (subject->IsExternalOneByteString()) { 150 return reinterpret_cast<const byte*>( 151 ExternalOneByteString::cast(subject)->GetChars() + start_index); 152 } else { 153 DCHECK(subject->IsExternalTwoByteString()); 154 return reinterpret_cast<const byte*>( 155 ExternalTwoByteString::cast(subject)->GetChars() + start_index); 156 } 157 } 158 159 160 int NativeRegExpMacroAssembler::CheckStackGuardState( 161 Isolate* isolate, int start_index, bool is_direct_call, 162 Address* return_address, Code* re_code, String** subject, 163 const byte** input_start, const byte** input_end) { 164 DCHECK(re_code->raw_instruction_start() <= *return_address); 165 DCHECK(*return_address <= re_code->raw_instruction_end()); 166 int return_value = 0; 167 // Prepare for possible GC. 168 HandleScope handles(isolate); 169 Handle<Code> code_handle(re_code, isolate); 170 Handle<String> subject_handle(*subject, isolate); 171 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath(); 172 173 StackLimitCheck check(isolate); 174 bool js_has_overflowed = check.JsHasOverflowed(); 175 176 if (is_direct_call) { 177 // Direct calls from JavaScript can be interrupted in two ways: 178 // 1. A real stack overflow, in which case we let the caller throw the 179 // exception. 180 // 2. The stack guard was used to interrupt execution for another purpose, 181 // forcing the call through the runtime system. 182 return_value = js_has_overflowed ? EXCEPTION : RETRY; 183 } else if (js_has_overflowed) { 184 isolate->StackOverflow(); 185 return_value = EXCEPTION; 186 } else { 187 Object* result = isolate->stack_guard()->HandleInterrupts(); 188 if (result->IsException(isolate)) return_value = EXCEPTION; 189 } 190 191 DisallowHeapAllocation no_gc; 192 193 if (*code_handle != re_code) { // Return address no longer valid 194 intptr_t delta = code_handle->address() - re_code->address(); 195 // Overwrite the return address on the stack. 196 *return_address += delta; 197 } 198 199 // If we continue, we need to update the subject string addresses. 200 if (return_value == 0) { 201 // String encoding might have changed. 202 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) { 203 // If we changed between an LATIN1 and an UC16 string, the specialized 204 // code cannot be used, and we need to restart regexp matching from 205 // scratch (including, potentially, compiling a new version of the code). 206 return_value = RETRY; 207 } else { 208 *subject = *subject_handle; 209 intptr_t byte_length = *input_end - *input_start; 210 *input_start = StringCharacterPosition(*subject, start_index); 211 *input_end = *input_start + byte_length; 212 } 213 } 214 return return_value; 215 } 216 217 218 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match( 219 Handle<Code> regexp_code, 220 Handle<String> subject, 221 int* offsets_vector, 222 int offsets_vector_length, 223 int previous_index, 224 Isolate* isolate) { 225 226 DCHECK(subject->IsFlat()); 227 DCHECK_LE(0, previous_index); 228 DCHECK_LE(previous_index, subject->length()); 229 230 // No allocations before calling the regexp, but we can't use 231 // DisallowHeapAllocation, since regexps might be preempted, and another 232 // thread might do allocation anyway. 233 234 String* subject_ptr = *subject; 235 // Character offsets into string. 236 int start_offset = previous_index; 237 int char_length = subject_ptr->length() - start_offset; 238 int slice_offset = 0; 239 240 // The string has been flattened, so if it is a cons string it contains the 241 // full string in the first part. 242 if (StringShape(subject_ptr).IsCons()) { 243 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length()); 244 subject_ptr = ConsString::cast(subject_ptr)->first(); 245 } else if (StringShape(subject_ptr).IsSliced()) { 246 SlicedString* slice = SlicedString::cast(subject_ptr); 247 subject_ptr = slice->parent(); 248 slice_offset = slice->offset(); 249 } 250 if (StringShape(subject_ptr).IsThin()) { 251 subject_ptr = ThinString::cast(subject_ptr)->actual(); 252 } 253 // Ensure that an underlying string has the same representation. 254 bool is_one_byte = subject_ptr->IsOneByteRepresentation(); 255 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString()); 256 // String is now either Sequential or External 257 int char_size_shift = is_one_byte ? 0 : 1; 258 259 const byte* input_start = 260 StringCharacterPosition(subject_ptr, start_offset + slice_offset); 261 int byte_length = char_length << char_size_shift; 262 const byte* input_end = input_start + byte_length; 263 Result res = Execute(*regexp_code, 264 *subject, 265 start_offset, 266 input_start, 267 input_end, 268 offsets_vector, 269 offsets_vector_length, 270 isolate); 271 return res; 272 } 273 274 275 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute( 276 Code* code, 277 String* input, // This needs to be the unpacked (sliced, cons) string. 278 int start_offset, 279 const byte* input_start, 280 const byte* input_end, 281 int* output, 282 int output_size, 283 Isolate* isolate) { 284 // Ensure that the minimum stack has been allocated. 285 RegExpStackScope stack_scope(isolate); 286 Address stack_base = stack_scope.stack()->stack_base(); 287 288 int direct_call = 0; 289 290 using RegexpMatcherSig = int( 291 String * input, int start_offset, // NOLINT(readability/casting) 292 const byte* input_start, const byte* input_end, int* output, 293 int output_size, Address stack_base, int direct_call, Isolate* isolate); 294 295 auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code); 296 int result = fn.Call(input, start_offset, input_start, input_end, output, 297 output_size, stack_base, direct_call, isolate); 298 DCHECK(result >= RETRY); 299 300 if (result == EXCEPTION && !isolate->has_pending_exception()) { 301 // We detected a stack overflow (on the backtrack stack) in RegExp code, 302 // but haven't created the exception yet. 303 isolate->StackOverflow(); 304 } 305 return static_cast<Result>(result); 306 } 307 308 // clang-format off 309 const byte NativeRegExpMacroAssembler::word_character_map[] = { 310 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 311 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 312 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 313 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 314 315 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 316 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 317 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7' 318 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9' 319 320 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G' 321 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O' 322 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W' 323 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_' 324 325 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g' 326 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o' 327 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w' 328 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' 329 // Latin-1 range 330 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 331 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 332 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 333 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 334 335 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 336 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 337 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 338 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 339 340 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 341 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 342 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 343 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 344 345 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 346 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 347 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 348 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 349 }; 350 // clang-format on 351 352 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer, 353 Address* stack_base, 354 Isolate* isolate) { 355 RegExpStack* regexp_stack = isolate->regexp_stack(); 356 size_t size = regexp_stack->stack_capacity(); 357 Address old_stack_base = regexp_stack->stack_base(); 358 DCHECK(old_stack_base == *stack_base); 359 DCHECK(stack_pointer <= old_stack_base); 360 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size); 361 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2); 362 if (new_stack_base == kNullAddress) { 363 return kNullAddress; 364 } 365 *stack_base = new_stack_base; 366 intptr_t stack_content_size = old_stack_base - stack_pointer; 367 return new_stack_base - stack_content_size; 368 } 369 370 #endif // V8_INTERPRETED_REGEXP 371 372 } // namespace internal 373 } // namespace v8 374