1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // A simple interpreter for the Irregexp byte code. 6 7 8 #include "src/v8.h" 9 #include "src/unicode.h" 10 #include "src/utils.h" 11 #include "src/ast.h" 12 #include "src/bytecodes-irregexp.h" 13 #include "src/interpreter-irregexp.h" 14 #include "src/jsregexp.h" 15 #include "src/regexp-macro-assembler.h" 16 17 namespace v8 { 18 namespace internal { 19 20 21 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize; 22 23 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, 24 int from, 25 int current, 26 int len, 27 Vector<const uc16> subject) { 28 for (int i = 0; i < len; i++) { 29 unibrow::uchar old_char = subject[from++]; 30 unibrow::uchar new_char = subject[current++]; 31 if (old_char == new_char) continue; 32 unibrow::uchar old_string[1] = { old_char }; 33 unibrow::uchar new_string[1] = { new_char }; 34 interp_canonicalize->get(old_char, '\0', old_string); 35 interp_canonicalize->get(new_char, '\0', new_string); 36 if (old_string[0] != new_string[0]) { 37 return false; 38 } 39 } 40 return true; 41 } 42 43 44 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, 45 int from, 46 int current, 47 int len, 48 Vector<const uint8_t> subject) { 49 for (int i = 0; i < len; i++) { 50 unsigned int old_char = subject[from++]; 51 unsigned int new_char = subject[current++]; 52 if (old_char == new_char) continue; 53 // Convert both characters to lower case. 54 old_char |= 0x20; 55 new_char |= 0x20; 56 if (old_char != new_char) return false; 57 // Not letters in the ASCII range and Latin-1 range. 58 if (!(old_char - 'a' <= 'z' - 'a') && 59 !(old_char - 224 <= 254 - 224 && old_char != 247)) { 60 return false; 61 } 62 } 63 return true; 64 } 65 66 67 #ifdef DEBUG 68 static void TraceInterpreter(const byte* code_base, 69 const byte* pc, 70 int stack_depth, 71 int current_position, 72 uint32_t current_char, 73 int bytecode_length, 74 const char* bytecode_name) { 75 if (FLAG_trace_regexp_bytecodes) { 76 bool printable = (current_char < 127 && current_char >= 32); 77 const char* format = 78 printable ? 79 "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" : 80 "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s"; 81 PrintF(format, 82 pc - code_base, 83 stack_depth, 84 current_position, 85 current_char, 86 printable ? current_char : '.', 87 bytecode_name); 88 for (int i = 0; i < bytecode_length; i++) { 89 printf(", %02x", pc[i]); 90 } 91 printf(" "); 92 for (int i = 1; i < bytecode_length; i++) { 93 unsigned char b = pc[i]; 94 if (b < 127 && b >= 32) { 95 printf("%c", b); 96 } else { 97 printf("."); 98 } 99 } 100 printf("\n"); 101 } 102 } 103 104 105 #define BYTECODE(name) \ 106 case BC_##name: \ 107 TraceInterpreter(code_base, \ 108 pc, \ 109 static_cast<int>(backtrack_sp - backtrack_stack_base), \ 110 current, \ 111 current_char, \ 112 BC_##name##_LENGTH, \ 113 #name); 114 #else 115 #define BYTECODE(name) \ 116 case BC_##name: 117 #endif 118 119 120 static int32_t Load32Aligned(const byte* pc) { 121 ASSERT((reinterpret_cast<intptr_t>(pc) & 3) == 0); 122 return *reinterpret_cast<const int32_t *>(pc); 123 } 124 125 126 static int32_t Load16Aligned(const byte* pc) { 127 ASSERT((reinterpret_cast<intptr_t>(pc) & 1) == 0); 128 return *reinterpret_cast<const uint16_t *>(pc); 129 } 130 131 132 // A simple abstraction over the backtracking stack used by the interpreter. 133 // This backtracking stack does not grow automatically, but it ensures that the 134 // the memory held by the stack is released or remembered in a cache if the 135 // matching terminates. 136 class BacktrackStack { 137 public: 138 explicit BacktrackStack() { 139 data_ = NewArray<int>(kBacktrackStackSize); 140 } 141 142 ~BacktrackStack() { 143 DeleteArray(data_); 144 } 145 146 int* data() const { return data_; } 147 148 int max_size() const { return kBacktrackStackSize; } 149 150 private: 151 static const int kBacktrackStackSize = 10000; 152 153 int* data_; 154 155 DISALLOW_COPY_AND_ASSIGN(BacktrackStack); 156 }; 157 158 159 template <typename Char> 160 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate, 161 const byte* code_base, 162 Vector<const Char> subject, 163 int* registers, 164 int current, 165 uint32_t current_char) { 166 const byte* pc = code_base; 167 // BacktrackStack ensures that the memory allocated for the backtracking stack 168 // is returned to the system or cached if there is no stack being cached at 169 // the moment. 170 BacktrackStack backtrack_stack; 171 int* backtrack_stack_base = backtrack_stack.data(); 172 int* backtrack_sp = backtrack_stack_base; 173 int backtrack_stack_space = backtrack_stack.max_size(); 174 #ifdef DEBUG 175 if (FLAG_trace_regexp_bytecodes) { 176 PrintF("\n\nStart bytecode interpreter\n\n"); 177 } 178 #endif 179 while (true) { 180 int32_t insn = Load32Aligned(pc); 181 switch (insn & BYTECODE_MASK) { 182 BYTECODE(BREAK) 183 UNREACHABLE(); 184 return RegExpImpl::RE_FAILURE; 185 BYTECODE(PUSH_CP) 186 if (--backtrack_stack_space < 0) { 187 return RegExpImpl::RE_EXCEPTION; 188 } 189 *backtrack_sp++ = current; 190 pc += BC_PUSH_CP_LENGTH; 191 break; 192 BYTECODE(PUSH_BT) 193 if (--backtrack_stack_space < 0) { 194 return RegExpImpl::RE_EXCEPTION; 195 } 196 *backtrack_sp++ = Load32Aligned(pc + 4); 197 pc += BC_PUSH_BT_LENGTH; 198 break; 199 BYTECODE(PUSH_REGISTER) 200 if (--backtrack_stack_space < 0) { 201 return RegExpImpl::RE_EXCEPTION; 202 } 203 *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT]; 204 pc += BC_PUSH_REGISTER_LENGTH; 205 break; 206 BYTECODE(SET_REGISTER) 207 registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4); 208 pc += BC_SET_REGISTER_LENGTH; 209 break; 210 BYTECODE(ADVANCE_REGISTER) 211 registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4); 212 pc += BC_ADVANCE_REGISTER_LENGTH; 213 break; 214 BYTECODE(SET_REGISTER_TO_CP) 215 registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4); 216 pc += BC_SET_REGISTER_TO_CP_LENGTH; 217 break; 218 BYTECODE(SET_CP_TO_REGISTER) 219 current = registers[insn >> BYTECODE_SHIFT]; 220 pc += BC_SET_CP_TO_REGISTER_LENGTH; 221 break; 222 BYTECODE(SET_REGISTER_TO_SP) 223 registers[insn >> BYTECODE_SHIFT] = 224 static_cast<int>(backtrack_sp - backtrack_stack_base); 225 pc += BC_SET_REGISTER_TO_SP_LENGTH; 226 break; 227 BYTECODE(SET_SP_TO_REGISTER) 228 backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT]; 229 backtrack_stack_space = backtrack_stack.max_size() - 230 static_cast<int>(backtrack_sp - backtrack_stack_base); 231 pc += BC_SET_SP_TO_REGISTER_LENGTH; 232 break; 233 BYTECODE(POP_CP) 234 backtrack_stack_space++; 235 --backtrack_sp; 236 current = *backtrack_sp; 237 pc += BC_POP_CP_LENGTH; 238 break; 239 BYTECODE(POP_BT) 240 backtrack_stack_space++; 241 --backtrack_sp; 242 pc = code_base + *backtrack_sp; 243 break; 244 BYTECODE(POP_REGISTER) 245 backtrack_stack_space++; 246 --backtrack_sp; 247 registers[insn >> BYTECODE_SHIFT] = *backtrack_sp; 248 pc += BC_POP_REGISTER_LENGTH; 249 break; 250 BYTECODE(FAIL) 251 return RegExpImpl::RE_FAILURE; 252 BYTECODE(SUCCEED) 253 return RegExpImpl::RE_SUCCESS; 254 BYTECODE(ADVANCE_CP) 255 current += insn >> BYTECODE_SHIFT; 256 pc += BC_ADVANCE_CP_LENGTH; 257 break; 258 BYTECODE(GOTO) 259 pc = code_base + Load32Aligned(pc + 4); 260 break; 261 BYTECODE(ADVANCE_CP_AND_GOTO) 262 current += insn >> BYTECODE_SHIFT; 263 pc = code_base + Load32Aligned(pc + 4); 264 break; 265 BYTECODE(CHECK_GREEDY) 266 if (current == backtrack_sp[-1]) { 267 backtrack_sp--; 268 backtrack_stack_space++; 269 pc = code_base + Load32Aligned(pc + 4); 270 } else { 271 pc += BC_CHECK_GREEDY_LENGTH; 272 } 273 break; 274 BYTECODE(LOAD_CURRENT_CHAR) { 275 int pos = current + (insn >> BYTECODE_SHIFT); 276 if (pos >= subject.length()) { 277 pc = code_base + Load32Aligned(pc + 4); 278 } else { 279 current_char = subject[pos]; 280 pc += BC_LOAD_CURRENT_CHAR_LENGTH; 281 } 282 break; 283 } 284 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { 285 int pos = current + (insn >> BYTECODE_SHIFT); 286 current_char = subject[pos]; 287 pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH; 288 break; 289 } 290 BYTECODE(LOAD_2_CURRENT_CHARS) { 291 int pos = current + (insn >> BYTECODE_SHIFT); 292 if (pos + 2 > subject.length()) { 293 pc = code_base + Load32Aligned(pc + 4); 294 } else { 295 Char next = subject[pos + 1]; 296 current_char = 297 (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); 298 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH; 299 } 300 break; 301 } 302 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { 303 int pos = current + (insn >> BYTECODE_SHIFT); 304 Char next = subject[pos + 1]; 305 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); 306 pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH; 307 break; 308 } 309 BYTECODE(LOAD_4_CURRENT_CHARS) { 310 ASSERT(sizeof(Char) == 1); 311 int pos = current + (insn >> BYTECODE_SHIFT); 312 if (pos + 4 > subject.length()) { 313 pc = code_base + Load32Aligned(pc + 4); 314 } else { 315 Char next1 = subject[pos + 1]; 316 Char next2 = subject[pos + 2]; 317 Char next3 = subject[pos + 3]; 318 current_char = (subject[pos] | 319 (next1 << 8) | 320 (next2 << 16) | 321 (next3 << 24)); 322 pc += BC_LOAD_4_CURRENT_CHARS_LENGTH; 323 } 324 break; 325 } 326 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) { 327 ASSERT(sizeof(Char) == 1); 328 int pos = current + (insn >> BYTECODE_SHIFT); 329 Char next1 = subject[pos + 1]; 330 Char next2 = subject[pos + 2]; 331 Char next3 = subject[pos + 3]; 332 current_char = (subject[pos] | 333 (next1 << 8) | 334 (next2 << 16) | 335 (next3 << 24)); 336 pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH; 337 break; 338 } 339 BYTECODE(CHECK_4_CHARS) { 340 uint32_t c = Load32Aligned(pc + 4); 341 if (c == current_char) { 342 pc = code_base + Load32Aligned(pc + 8); 343 } else { 344 pc += BC_CHECK_4_CHARS_LENGTH; 345 } 346 break; 347 } 348 BYTECODE(CHECK_CHAR) { 349 uint32_t c = (insn >> BYTECODE_SHIFT); 350 if (c == current_char) { 351 pc = code_base + Load32Aligned(pc + 4); 352 } else { 353 pc += BC_CHECK_CHAR_LENGTH; 354 } 355 break; 356 } 357 BYTECODE(CHECK_NOT_4_CHARS) { 358 uint32_t c = Load32Aligned(pc + 4); 359 if (c != current_char) { 360 pc = code_base + Load32Aligned(pc + 8); 361 } else { 362 pc += BC_CHECK_NOT_4_CHARS_LENGTH; 363 } 364 break; 365 } 366 BYTECODE(CHECK_NOT_CHAR) { 367 uint32_t c = (insn >> BYTECODE_SHIFT); 368 if (c != current_char) { 369 pc = code_base + Load32Aligned(pc + 4); 370 } else { 371 pc += BC_CHECK_NOT_CHAR_LENGTH; 372 } 373 break; 374 } 375 BYTECODE(AND_CHECK_4_CHARS) { 376 uint32_t c = Load32Aligned(pc + 4); 377 if (c == (current_char & Load32Aligned(pc + 8))) { 378 pc = code_base + Load32Aligned(pc + 12); 379 } else { 380 pc += BC_AND_CHECK_4_CHARS_LENGTH; 381 } 382 break; 383 } 384 BYTECODE(AND_CHECK_CHAR) { 385 uint32_t c = (insn >> BYTECODE_SHIFT); 386 if (c == (current_char & Load32Aligned(pc + 4))) { 387 pc = code_base + Load32Aligned(pc + 8); 388 } else { 389 pc += BC_AND_CHECK_CHAR_LENGTH; 390 } 391 break; 392 } 393 BYTECODE(AND_CHECK_NOT_4_CHARS) { 394 uint32_t c = Load32Aligned(pc + 4); 395 if (c != (current_char & Load32Aligned(pc + 8))) { 396 pc = code_base + Load32Aligned(pc + 12); 397 } else { 398 pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH; 399 } 400 break; 401 } 402 BYTECODE(AND_CHECK_NOT_CHAR) { 403 uint32_t c = (insn >> BYTECODE_SHIFT); 404 if (c != (current_char & Load32Aligned(pc + 4))) { 405 pc = code_base + Load32Aligned(pc + 8); 406 } else { 407 pc += BC_AND_CHECK_NOT_CHAR_LENGTH; 408 } 409 break; 410 } 411 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { 412 uint32_t c = (insn >> BYTECODE_SHIFT); 413 uint32_t minus = Load16Aligned(pc + 4); 414 uint32_t mask = Load16Aligned(pc + 6); 415 if (c != ((current_char - minus) & mask)) { 416 pc = code_base + Load32Aligned(pc + 8); 417 } else { 418 pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH; 419 } 420 break; 421 } 422 BYTECODE(CHECK_CHAR_IN_RANGE) { 423 uint32_t from = Load16Aligned(pc + 4); 424 uint32_t to = Load16Aligned(pc + 6); 425 if (from <= current_char && current_char <= to) { 426 pc = code_base + Load32Aligned(pc + 8); 427 } else { 428 pc += BC_CHECK_CHAR_IN_RANGE_LENGTH; 429 } 430 break; 431 } 432 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { 433 uint32_t from = Load16Aligned(pc + 4); 434 uint32_t to = Load16Aligned(pc + 6); 435 if (from > current_char || current_char > to) { 436 pc = code_base + Load32Aligned(pc + 8); 437 } else { 438 pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH; 439 } 440 break; 441 } 442 BYTECODE(CHECK_BIT_IN_TABLE) { 443 int mask = RegExpMacroAssembler::kTableMask; 444 byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)]; 445 int bit = (current_char & (kBitsPerByte - 1)); 446 if ((b & (1 << bit)) != 0) { 447 pc = code_base + Load32Aligned(pc + 4); 448 } else { 449 pc += BC_CHECK_BIT_IN_TABLE_LENGTH; 450 } 451 break; 452 } 453 BYTECODE(CHECK_LT) { 454 uint32_t limit = (insn >> BYTECODE_SHIFT); 455 if (current_char < limit) { 456 pc = code_base + Load32Aligned(pc + 4); 457 } else { 458 pc += BC_CHECK_LT_LENGTH; 459 } 460 break; 461 } 462 BYTECODE(CHECK_GT) { 463 uint32_t limit = (insn >> BYTECODE_SHIFT); 464 if (current_char > limit) { 465 pc = code_base + Load32Aligned(pc + 4); 466 } else { 467 pc += BC_CHECK_GT_LENGTH; 468 } 469 break; 470 } 471 BYTECODE(CHECK_REGISTER_LT) 472 if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) { 473 pc = code_base + Load32Aligned(pc + 8); 474 } else { 475 pc += BC_CHECK_REGISTER_LT_LENGTH; 476 } 477 break; 478 BYTECODE(CHECK_REGISTER_GE) 479 if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) { 480 pc = code_base + Load32Aligned(pc + 8); 481 } else { 482 pc += BC_CHECK_REGISTER_GE_LENGTH; 483 } 484 break; 485 BYTECODE(CHECK_REGISTER_EQ_POS) 486 if (registers[insn >> BYTECODE_SHIFT] == current) { 487 pc = code_base + Load32Aligned(pc + 4); 488 } else { 489 pc += BC_CHECK_REGISTER_EQ_POS_LENGTH; 490 } 491 break; 492 BYTECODE(CHECK_NOT_REGS_EQUAL) 493 if (registers[insn >> BYTECODE_SHIFT] == 494 registers[Load32Aligned(pc + 4)]) { 495 pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH; 496 } else { 497 pc = code_base + Load32Aligned(pc + 8); 498 } 499 break; 500 BYTECODE(CHECK_NOT_BACK_REF) { 501 int from = registers[insn >> BYTECODE_SHIFT]; 502 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; 503 if (from < 0 || len <= 0) { 504 pc += BC_CHECK_NOT_BACK_REF_LENGTH; 505 break; 506 } 507 if (current + len > subject.length()) { 508 pc = code_base + Load32Aligned(pc + 4); 509 break; 510 } else { 511 int i; 512 for (i = 0; i < len; i++) { 513 if (subject[from + i] != subject[current + i]) { 514 pc = code_base + Load32Aligned(pc + 4); 515 break; 516 } 517 } 518 if (i < len) break; 519 current += len; 520 } 521 pc += BC_CHECK_NOT_BACK_REF_LENGTH; 522 break; 523 } 524 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { 525 int from = registers[insn >> BYTECODE_SHIFT]; 526 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; 527 if (from < 0 || len <= 0) { 528 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; 529 break; 530 } 531 if (current + len > subject.length()) { 532 pc = code_base + Load32Aligned(pc + 4); 533 break; 534 } else { 535 if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(), 536 from, current, len, subject)) { 537 current += len; 538 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; 539 } else { 540 pc = code_base + Load32Aligned(pc + 4); 541 } 542 } 543 break; 544 } 545 BYTECODE(CHECK_AT_START) 546 if (current == 0) { 547 pc = code_base + Load32Aligned(pc + 4); 548 } else { 549 pc += BC_CHECK_AT_START_LENGTH; 550 } 551 break; 552 BYTECODE(CHECK_NOT_AT_START) 553 if (current == 0) { 554 pc += BC_CHECK_NOT_AT_START_LENGTH; 555 } else { 556 pc = code_base + Load32Aligned(pc + 4); 557 } 558 break; 559 BYTECODE(SET_CURRENT_POSITION_FROM_END) { 560 int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT; 561 if (subject.length() - current > by) { 562 current = subject.length() - by; 563 current_char = subject[current - 1]; 564 } 565 pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH; 566 break; 567 } 568 default: 569 UNREACHABLE(); 570 break; 571 } 572 } 573 } 574 575 576 RegExpImpl::IrregexpResult IrregexpInterpreter::Match( 577 Isolate* isolate, 578 Handle<ByteArray> code_array, 579 Handle<String> subject, 580 int* registers, 581 int start_position) { 582 ASSERT(subject->IsFlat()); 583 584 DisallowHeapAllocation no_gc; 585 const byte* code_base = code_array->GetDataStartAddress(); 586 uc16 previous_char = '\n'; 587 String::FlatContent subject_content = subject->GetFlatContent(); 588 if (subject_content.IsAscii()) { 589 Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector(); 590 if (start_position != 0) previous_char = subject_vector[start_position - 1]; 591 return RawMatch(isolate, 592 code_base, 593 subject_vector, 594 registers, 595 start_position, 596 previous_char); 597 } else { 598 ASSERT(subject_content.IsTwoByte()); 599 Vector<const uc16> subject_vector = subject_content.ToUC16Vector(); 600 if (start_position != 0) previous_char = subject_vector[start_position - 1]; 601 return RawMatch(isolate, 602 code_base, 603 subject_vector, 604 registers, 605 start_position, 606 previous_char); 607 } 608 } 609 610 } } // namespace v8::internal 611