1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // A simple interpreter for the Irregexp byte code. 6 7 8 #include "src/v8.h" 9 10 #include "src/ast.h" 11 #include "src/bytecodes-irregexp.h" 12 #include "src/interpreter-irregexp.h" 13 #include "src/jsregexp.h" 14 #include "src/regexp-macro-assembler.h" 15 #include "src/unicode.h" 16 #include "src/utils.h" 17 18 namespace v8 { 19 namespace internal { 20 21 22 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize; 23 24 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, 25 int from, 26 int current, 27 int len, 28 Vector<const uc16> subject) { 29 for (int i = 0; i < len; i++) { 30 unibrow::uchar old_char = subject[from++]; 31 unibrow::uchar new_char = subject[current++]; 32 if (old_char == new_char) continue; 33 unibrow::uchar old_string[1] = { old_char }; 34 unibrow::uchar new_string[1] = { new_char }; 35 interp_canonicalize->get(old_char, '\0', old_string); 36 interp_canonicalize->get(new_char, '\0', new_string); 37 if (old_string[0] != new_string[0]) { 38 return false; 39 } 40 } 41 return true; 42 } 43 44 45 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, 46 int from, 47 int current, 48 int len, 49 Vector<const uint8_t> subject) { 50 for (int i = 0; i < len; i++) { 51 unsigned int old_char = subject[from++]; 52 unsigned int new_char = subject[current++]; 53 if (old_char == new_char) continue; 54 // Convert both characters to lower case. 55 old_char |= 0x20; 56 new_char |= 0x20; 57 if (old_char != new_char) return false; 58 // Not letters in the ASCII range and Latin-1 range. 59 if (!(old_char - 'a' <= 'z' - 'a') && 60 !(old_char - 224 <= 254 - 224 && old_char != 247)) { 61 return false; 62 } 63 } 64 return true; 65 } 66 67 68 #ifdef DEBUG 69 static void TraceInterpreter(const byte* code_base, 70 const byte* pc, 71 int stack_depth, 72 int current_position, 73 uint32_t current_char, 74 int bytecode_length, 75 const char* bytecode_name) { 76 if (FLAG_trace_regexp_bytecodes) { 77 bool printable = (current_char < 127 && current_char >= 32); 78 const char* format = 79 printable ? 80 "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" : 81 "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s"; 82 PrintF(format, 83 pc - code_base, 84 stack_depth, 85 current_position, 86 current_char, 87 printable ? current_char : '.', 88 bytecode_name); 89 for (int i = 0; i < bytecode_length; i++) { 90 printf(", %02x", pc[i]); 91 } 92 printf(" "); 93 for (int i = 1; i < bytecode_length; i++) { 94 unsigned char b = pc[i]; 95 if (b < 127 && b >= 32) { 96 printf("%c", b); 97 } else { 98 printf("."); 99 } 100 } 101 printf("\n"); 102 } 103 } 104 105 106 #define BYTECODE(name) \ 107 case BC_##name: \ 108 TraceInterpreter(code_base, \ 109 pc, \ 110 static_cast<int>(backtrack_sp - backtrack_stack_base), \ 111 current, \ 112 current_char, \ 113 BC_##name##_LENGTH, \ 114 #name); 115 #else 116 #define BYTECODE(name) \ 117 case BC_##name: 118 #endif 119 120 121 static int32_t Load32Aligned(const byte* pc) { 122 DCHECK((reinterpret_cast<intptr_t>(pc) & 3) == 0); 123 return *reinterpret_cast<const int32_t *>(pc); 124 } 125 126 127 static int32_t Load16Aligned(const byte* pc) { 128 DCHECK((reinterpret_cast<intptr_t>(pc) & 1) == 0); 129 return *reinterpret_cast<const uint16_t *>(pc); 130 } 131 132 133 // A simple abstraction over the backtracking stack used by the interpreter. 134 // This backtracking stack does not grow automatically, but it ensures that the 135 // the memory held by the stack is released or remembered in a cache if the 136 // matching terminates. 137 class BacktrackStack { 138 public: 139 BacktrackStack() { data_ = NewArray<int>(kBacktrackStackSize); } 140 141 ~BacktrackStack() { 142 DeleteArray(data_); 143 } 144 145 int* data() const { return data_; } 146 147 int max_size() const { return kBacktrackStackSize; } 148 149 private: 150 static const int kBacktrackStackSize = 10000; 151 152 int* data_; 153 154 DISALLOW_COPY_AND_ASSIGN(BacktrackStack); 155 }; 156 157 158 template <typename Char> 159 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate, 160 const byte* code_base, 161 Vector<const Char> subject, 162 int* registers, 163 int current, 164 uint32_t current_char) { 165 const byte* pc = code_base; 166 // BacktrackStack ensures that the memory allocated for the backtracking stack 167 // is returned to the system or cached if there is no stack being cached at 168 // the moment. 169 BacktrackStack backtrack_stack; 170 int* backtrack_stack_base = backtrack_stack.data(); 171 int* backtrack_sp = backtrack_stack_base; 172 int backtrack_stack_space = backtrack_stack.max_size(); 173 #ifdef DEBUG 174 if (FLAG_trace_regexp_bytecodes) { 175 PrintF("\n\nStart bytecode interpreter\n\n"); 176 } 177 #endif 178 while (true) { 179 int32_t insn = Load32Aligned(pc); 180 switch (insn & BYTECODE_MASK) { 181 BYTECODE(BREAK) 182 UNREACHABLE(); 183 return RegExpImpl::RE_FAILURE; 184 BYTECODE(PUSH_CP) 185 if (--backtrack_stack_space < 0) { 186 return RegExpImpl::RE_EXCEPTION; 187 } 188 *backtrack_sp++ = current; 189 pc += BC_PUSH_CP_LENGTH; 190 break; 191 BYTECODE(PUSH_BT) 192 if (--backtrack_stack_space < 0) { 193 return RegExpImpl::RE_EXCEPTION; 194 } 195 *backtrack_sp++ = Load32Aligned(pc + 4); 196 pc += BC_PUSH_BT_LENGTH; 197 break; 198 BYTECODE(PUSH_REGISTER) 199 if (--backtrack_stack_space < 0) { 200 return RegExpImpl::RE_EXCEPTION; 201 } 202 *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT]; 203 pc += BC_PUSH_REGISTER_LENGTH; 204 break; 205 BYTECODE(SET_REGISTER) 206 registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4); 207 pc += BC_SET_REGISTER_LENGTH; 208 break; 209 BYTECODE(ADVANCE_REGISTER) 210 registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4); 211 pc += BC_ADVANCE_REGISTER_LENGTH; 212 break; 213 BYTECODE(SET_REGISTER_TO_CP) 214 registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4); 215 pc += BC_SET_REGISTER_TO_CP_LENGTH; 216 break; 217 BYTECODE(SET_CP_TO_REGISTER) 218 current = registers[insn >> BYTECODE_SHIFT]; 219 pc += BC_SET_CP_TO_REGISTER_LENGTH; 220 break; 221 BYTECODE(SET_REGISTER_TO_SP) 222 registers[insn >> BYTECODE_SHIFT] = 223 static_cast<int>(backtrack_sp - backtrack_stack_base); 224 pc += BC_SET_REGISTER_TO_SP_LENGTH; 225 break; 226 BYTECODE(SET_SP_TO_REGISTER) 227 backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT]; 228 backtrack_stack_space = backtrack_stack.max_size() - 229 static_cast<int>(backtrack_sp - backtrack_stack_base); 230 pc += BC_SET_SP_TO_REGISTER_LENGTH; 231 break; 232 BYTECODE(POP_CP) 233 backtrack_stack_space++; 234 --backtrack_sp; 235 current = *backtrack_sp; 236 pc += BC_POP_CP_LENGTH; 237 break; 238 BYTECODE(POP_BT) 239 backtrack_stack_space++; 240 --backtrack_sp; 241 pc = code_base + *backtrack_sp; 242 break; 243 BYTECODE(POP_REGISTER) 244 backtrack_stack_space++; 245 --backtrack_sp; 246 registers[insn >> BYTECODE_SHIFT] = *backtrack_sp; 247 pc += BC_POP_REGISTER_LENGTH; 248 break; 249 BYTECODE(FAIL) 250 return RegExpImpl::RE_FAILURE; 251 BYTECODE(SUCCEED) 252 return RegExpImpl::RE_SUCCESS; 253 BYTECODE(ADVANCE_CP) 254 current += insn >> BYTECODE_SHIFT; 255 pc += BC_ADVANCE_CP_LENGTH; 256 break; 257 BYTECODE(GOTO) 258 pc = code_base + Load32Aligned(pc + 4); 259 break; 260 BYTECODE(ADVANCE_CP_AND_GOTO) 261 current += insn >> BYTECODE_SHIFT; 262 pc = code_base + Load32Aligned(pc + 4); 263 break; 264 BYTECODE(CHECK_GREEDY) 265 if (current == backtrack_sp[-1]) { 266 backtrack_sp--; 267 backtrack_stack_space++; 268 pc = code_base + Load32Aligned(pc + 4); 269 } else { 270 pc += BC_CHECK_GREEDY_LENGTH; 271 } 272 break; 273 BYTECODE(LOAD_CURRENT_CHAR) { 274 int pos = current + (insn >> BYTECODE_SHIFT); 275 if (pos >= subject.length()) { 276 pc = code_base + Load32Aligned(pc + 4); 277 } else { 278 current_char = subject[pos]; 279 pc += BC_LOAD_CURRENT_CHAR_LENGTH; 280 } 281 break; 282 } 283 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { 284 int pos = current + (insn >> BYTECODE_SHIFT); 285 current_char = subject[pos]; 286 pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH; 287 break; 288 } 289 BYTECODE(LOAD_2_CURRENT_CHARS) { 290 int pos = current + (insn >> BYTECODE_SHIFT); 291 if (pos + 2 > subject.length()) { 292 pc = code_base + Load32Aligned(pc + 4); 293 } else { 294 Char next = subject[pos + 1]; 295 current_char = 296 (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); 297 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH; 298 } 299 break; 300 } 301 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { 302 int pos = current + (insn >> BYTECODE_SHIFT); 303 Char next = subject[pos + 1]; 304 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); 305 pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH; 306 break; 307 } 308 BYTECODE(LOAD_4_CURRENT_CHARS) { 309 DCHECK(sizeof(Char) == 1); 310 int pos = current + (insn >> BYTECODE_SHIFT); 311 if (pos + 4 > subject.length()) { 312 pc = code_base + Load32Aligned(pc + 4); 313 } else { 314 Char next1 = subject[pos + 1]; 315 Char next2 = subject[pos + 2]; 316 Char next3 = subject[pos + 3]; 317 current_char = (subject[pos] | 318 (next1 << 8) | 319 (next2 << 16) | 320 (next3 << 24)); 321 pc += BC_LOAD_4_CURRENT_CHARS_LENGTH; 322 } 323 break; 324 } 325 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) { 326 DCHECK(sizeof(Char) == 1); 327 int pos = current + (insn >> BYTECODE_SHIFT); 328 Char next1 = subject[pos + 1]; 329 Char next2 = subject[pos + 2]; 330 Char next3 = subject[pos + 3]; 331 current_char = (subject[pos] | 332 (next1 << 8) | 333 (next2 << 16) | 334 (next3 << 24)); 335 pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH; 336 break; 337 } 338 BYTECODE(CHECK_4_CHARS) { 339 uint32_t c = Load32Aligned(pc + 4); 340 if (c == current_char) { 341 pc = code_base + Load32Aligned(pc + 8); 342 } else { 343 pc += BC_CHECK_4_CHARS_LENGTH; 344 } 345 break; 346 } 347 BYTECODE(CHECK_CHAR) { 348 uint32_t c = (insn >> BYTECODE_SHIFT); 349 if (c == current_char) { 350 pc = code_base + Load32Aligned(pc + 4); 351 } else { 352 pc += BC_CHECK_CHAR_LENGTH; 353 } 354 break; 355 } 356 BYTECODE(CHECK_NOT_4_CHARS) { 357 uint32_t c = Load32Aligned(pc + 4); 358 if (c != current_char) { 359 pc = code_base + Load32Aligned(pc + 8); 360 } else { 361 pc += BC_CHECK_NOT_4_CHARS_LENGTH; 362 } 363 break; 364 } 365 BYTECODE(CHECK_NOT_CHAR) { 366 uint32_t c = (insn >> BYTECODE_SHIFT); 367 if (c != current_char) { 368 pc = code_base + Load32Aligned(pc + 4); 369 } else { 370 pc += BC_CHECK_NOT_CHAR_LENGTH; 371 } 372 break; 373 } 374 BYTECODE(AND_CHECK_4_CHARS) { 375 uint32_t c = Load32Aligned(pc + 4); 376 if (c == (current_char & Load32Aligned(pc + 8))) { 377 pc = code_base + Load32Aligned(pc + 12); 378 } else { 379 pc += BC_AND_CHECK_4_CHARS_LENGTH; 380 } 381 break; 382 } 383 BYTECODE(AND_CHECK_CHAR) { 384 uint32_t c = (insn >> BYTECODE_SHIFT); 385 if (c == (current_char & Load32Aligned(pc + 4))) { 386 pc = code_base + Load32Aligned(pc + 8); 387 } else { 388 pc += BC_AND_CHECK_CHAR_LENGTH; 389 } 390 break; 391 } 392 BYTECODE(AND_CHECK_NOT_4_CHARS) { 393 uint32_t c = Load32Aligned(pc + 4); 394 if (c != (current_char & Load32Aligned(pc + 8))) { 395 pc = code_base + Load32Aligned(pc + 12); 396 } else { 397 pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH; 398 } 399 break; 400 } 401 BYTECODE(AND_CHECK_NOT_CHAR) { 402 uint32_t c = (insn >> BYTECODE_SHIFT); 403 if (c != (current_char & Load32Aligned(pc + 4))) { 404 pc = code_base + Load32Aligned(pc + 8); 405 } else { 406 pc += BC_AND_CHECK_NOT_CHAR_LENGTH; 407 } 408 break; 409 } 410 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { 411 uint32_t c = (insn >> BYTECODE_SHIFT); 412 uint32_t minus = Load16Aligned(pc + 4); 413 uint32_t mask = Load16Aligned(pc + 6); 414 if (c != ((current_char - minus) & mask)) { 415 pc = code_base + Load32Aligned(pc + 8); 416 } else { 417 pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH; 418 } 419 break; 420 } 421 BYTECODE(CHECK_CHAR_IN_RANGE) { 422 uint32_t from = Load16Aligned(pc + 4); 423 uint32_t to = Load16Aligned(pc + 6); 424 if (from <= current_char && current_char <= to) { 425 pc = code_base + Load32Aligned(pc + 8); 426 } else { 427 pc += BC_CHECK_CHAR_IN_RANGE_LENGTH; 428 } 429 break; 430 } 431 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { 432 uint32_t from = Load16Aligned(pc + 4); 433 uint32_t to = Load16Aligned(pc + 6); 434 if (from > current_char || current_char > to) { 435 pc = code_base + Load32Aligned(pc + 8); 436 } else { 437 pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH; 438 } 439 break; 440 } 441 BYTECODE(CHECK_BIT_IN_TABLE) { 442 int mask = RegExpMacroAssembler::kTableMask; 443 byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)]; 444 int bit = (current_char & (kBitsPerByte - 1)); 445 if ((b & (1 << bit)) != 0) { 446 pc = code_base + Load32Aligned(pc + 4); 447 } else { 448 pc += BC_CHECK_BIT_IN_TABLE_LENGTH; 449 } 450 break; 451 } 452 BYTECODE(CHECK_LT) { 453 uint32_t limit = (insn >> BYTECODE_SHIFT); 454 if (current_char < limit) { 455 pc = code_base + Load32Aligned(pc + 4); 456 } else { 457 pc += BC_CHECK_LT_LENGTH; 458 } 459 break; 460 } 461 BYTECODE(CHECK_GT) { 462 uint32_t limit = (insn >> BYTECODE_SHIFT); 463 if (current_char > limit) { 464 pc = code_base + Load32Aligned(pc + 4); 465 } else { 466 pc += BC_CHECK_GT_LENGTH; 467 } 468 break; 469 } 470 BYTECODE(CHECK_REGISTER_LT) 471 if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) { 472 pc = code_base + Load32Aligned(pc + 8); 473 } else { 474 pc += BC_CHECK_REGISTER_LT_LENGTH; 475 } 476 break; 477 BYTECODE(CHECK_REGISTER_GE) 478 if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) { 479 pc = code_base + Load32Aligned(pc + 8); 480 } else { 481 pc += BC_CHECK_REGISTER_GE_LENGTH; 482 } 483 break; 484 BYTECODE(CHECK_REGISTER_EQ_POS) 485 if (registers[insn >> BYTECODE_SHIFT] == current) { 486 pc = code_base + Load32Aligned(pc + 4); 487 } else { 488 pc += BC_CHECK_REGISTER_EQ_POS_LENGTH; 489 } 490 break; 491 BYTECODE(CHECK_NOT_REGS_EQUAL) 492 if (registers[insn >> BYTECODE_SHIFT] == 493 registers[Load32Aligned(pc + 4)]) { 494 pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH; 495 } else { 496 pc = code_base + Load32Aligned(pc + 8); 497 } 498 break; 499 BYTECODE(CHECK_NOT_BACK_REF) { 500 int from = registers[insn >> BYTECODE_SHIFT]; 501 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; 502 if (from < 0 || len <= 0) { 503 pc += BC_CHECK_NOT_BACK_REF_LENGTH; 504 break; 505 } 506 if (current + len > subject.length()) { 507 pc = code_base + Load32Aligned(pc + 4); 508 break; 509 } else { 510 int i; 511 for (i = 0; i < len; i++) { 512 if (subject[from + i] != subject[current + i]) { 513 pc = code_base + Load32Aligned(pc + 4); 514 break; 515 } 516 } 517 if (i < len) break; 518 current += len; 519 } 520 pc += BC_CHECK_NOT_BACK_REF_LENGTH; 521 break; 522 } 523 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { 524 int from = registers[insn >> BYTECODE_SHIFT]; 525 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; 526 if (from < 0 || len <= 0) { 527 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; 528 break; 529 } 530 if (current + len > subject.length()) { 531 pc = code_base + Load32Aligned(pc + 4); 532 break; 533 } else { 534 if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(), 535 from, current, len, subject)) { 536 current += len; 537 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; 538 } else { 539 pc = code_base + Load32Aligned(pc + 4); 540 } 541 } 542 break; 543 } 544 BYTECODE(CHECK_AT_START) 545 if (current == 0) { 546 pc = code_base + Load32Aligned(pc + 4); 547 } else { 548 pc += BC_CHECK_AT_START_LENGTH; 549 } 550 break; 551 BYTECODE(CHECK_NOT_AT_START) 552 if (current == 0) { 553 pc += BC_CHECK_NOT_AT_START_LENGTH; 554 } else { 555 pc = code_base + Load32Aligned(pc + 4); 556 } 557 break; 558 BYTECODE(SET_CURRENT_POSITION_FROM_END) { 559 int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT; 560 if (subject.length() - current > by) { 561 current = subject.length() - by; 562 current_char = subject[current - 1]; 563 } 564 pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH; 565 break; 566 } 567 default: 568 UNREACHABLE(); 569 break; 570 } 571 } 572 } 573 574 575 RegExpImpl::IrregexpResult IrregexpInterpreter::Match( 576 Isolate* isolate, 577 Handle<ByteArray> code_array, 578 Handle<String> subject, 579 int* registers, 580 int start_position) { 581 DCHECK(subject->IsFlat()); 582 583 DisallowHeapAllocation no_gc; 584 const byte* code_base = code_array->GetDataStartAddress(); 585 uc16 previous_char = '\n'; 586 String::FlatContent subject_content = subject->GetFlatContent(); 587 if (subject_content.IsOneByte()) { 588 Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector(); 589 if (start_position != 0) previous_char = subject_vector[start_position - 1]; 590 return RawMatch(isolate, 591 code_base, 592 subject_vector, 593 registers, 594 start_position, 595 previous_char); 596 } else { 597 DCHECK(subject_content.IsTwoByte()); 598 Vector<const uc16> subject_vector = subject_content.ToUC16Vector(); 599 if (start_position != 0) previous_char = subject_vector[start_position - 1]; 600 return RawMatch(isolate, 601 code_base, 602 subject_vector, 603 registers, 604 start_position, 605 previous_char); 606 } 607 } 608 609 } } // namespace v8::internal 610