1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #ifndef V8_STRING_SEARCH_H_ 29 #define V8_STRING_SEARCH_H_ 30 31 namespace v8 { 32 namespace internal { 33 34 35 //--------------------------------------------------------------------- 36 // String Search object. 37 //--------------------------------------------------------------------- 38 39 // Class holding constants and methods that apply to all string search variants, 40 // independently of subject and pattern char size. 41 class StringSearchBase { 42 protected: 43 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 44 // limit, we can fix the size of tables. For a needle longer than this limit, 45 // search will not be optimal, since we only build tables for a suffix 46 // of the string, but it is a safe approximation. 47 static const int kBMMaxShift = Isolate::kBMMaxShift; 48 49 // Reduce alphabet to this size. 50 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 51 // proportional to the input alphabet. We reduce the alphabet size by 52 // equating input characters modulo a smaller alphabet size. This gives 53 // a potentially less efficient searching, but is a safe approximation. 54 // For needles using only characters in the same Unicode 256-code point page, 55 // there is no search speed degradation. 56 static const int kAsciiAlphabetSize = 256; 57 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 58 59 // Bad-char shift table stored in the state. It's length is the alphabet size. 60 // For patterns below this length, the skip length of Boyer-Moore is too short 61 // to compensate for the algorithmic overhead compared to simple brute force. 62 static const int kBMMinPatternLength = 7; 63 64 static inline bool IsOneByteString(Vector<const uint8_t> string) { 65 return true; 66 } 67 68 static inline bool IsOneByteString(Vector<const uc16> string) { 69 return String::IsOneByte(string.start(), string.length()); 70 } 71 72 friend class Isolate; 73 }; 74 75 76 template <typename PatternChar, typename SubjectChar> 77 class StringSearch : private StringSearchBase { 78 public: 79 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) 80 : isolate_(isolate), 81 pattern_(pattern), 82 start_(Max(0, pattern.length() - kBMMaxShift)) { 83 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 84 if (!IsOneByteString(pattern_)) { 85 strategy_ = &FailSearch; 86 return; 87 } 88 } 89 int pattern_length = pattern_.length(); 90 if (pattern_length < kBMMinPatternLength) { 91 if (pattern_length == 1) { 92 strategy_ = &SingleCharSearch; 93 return; 94 } 95 strategy_ = &LinearSearch; 96 return; 97 } 98 strategy_ = &InitialSearch; 99 } 100 101 int Search(Vector<const SubjectChar> subject, int index) { 102 return strategy_(this, subject, index); 103 } 104 105 static inline int AlphabetSize() { 106 if (sizeof(PatternChar) == 1) { 107 // ASCII needle. 108 return kAsciiAlphabetSize; 109 } else { 110 ASSERT(sizeof(PatternChar) == 2); 111 // UC16 needle. 112 return kUC16AlphabetSize; 113 } 114 } 115 116 private: 117 typedef int (*SearchFunction)( // NOLINT - it's not a cast! 118 StringSearch<PatternChar, SubjectChar>*, 119 Vector<const SubjectChar>, 120 int); 121 122 static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 123 Vector<const SubjectChar>, 124 int) { 125 return -1; 126 } 127 128 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 129 Vector<const SubjectChar> subject, 130 int start_index); 131 132 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 133 Vector<const SubjectChar> subject, 134 int start_index); 135 136 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 137 Vector<const SubjectChar> subject, 138 int start_index); 139 140 static int BoyerMooreHorspoolSearch( 141 StringSearch<PatternChar, SubjectChar>* search, 142 Vector<const SubjectChar> subject, 143 int start_index); 144 145 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 146 Vector<const SubjectChar> subject, 147 int start_index); 148 149 void PopulateBoyerMooreHorspoolTable(); 150 151 void PopulateBoyerMooreTable(); 152 153 static inline bool exceedsOneByte(uint8_t c) { 154 return false; 155 } 156 157 static inline bool exceedsOneByte(uint16_t c) { 158 return c > String::kMaxOneByteCharCodeU; 159 } 160 161 static inline int CharOccurrence(int* bad_char_occurrence, 162 SubjectChar char_code) { 163 if (sizeof(SubjectChar) == 1) { 164 return bad_char_occurrence[static_cast<int>(char_code)]; 165 } 166 if (sizeof(PatternChar) == 1) { 167 if (exceedsOneByte(char_code)) { 168 return -1; 169 } 170 return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 171 } 172 // Both pattern and subject are UC16. Reduce character to equivalence class. 173 int equiv_class = char_code % kUC16AlphabetSize; 174 return bad_char_occurrence[equiv_class]; 175 } 176 177 // The following tables are shared by all searches. 178 // TODO(lrn): Introduce a way for a pattern to keep its tables 179 // between searches (e.g., for an Atom RegExp). 180 181 // Store for the BoyerMoore(Horspool) bad char shift table. 182 // Return a table covering the last kBMMaxShift+1 positions of 183 // pattern. 184 int* bad_char_table() { 185 return isolate_->bad_char_shift_table(); 186 } 187 188 // Store for the BoyerMoore good suffix shift table. 189 int* good_suffix_shift_table() { 190 // Return biased pointer that maps the range [start_..pattern_.length() 191 // to the kGoodSuffixShiftTable array. 192 return isolate_->good_suffix_shift_table() - start_; 193 } 194 195 // Table used temporarily while building the BoyerMoore good suffix 196 // shift table. 197 int* suffix_table() { 198 // Return biased pointer that maps the range [start_..pattern_.length() 199 // to the kSuffixTable array. 200 return isolate_->suffix_table() - start_; 201 } 202 203 Isolate* isolate_; 204 // The pattern to search for. 205 Vector<const PatternChar> pattern_; 206 // Pointer to implementation of the search. 207 SearchFunction strategy_; 208 // Cache value of Max(0, pattern_length() - kBMMaxShift) 209 int start_; 210 }; 211 212 213 //--------------------------------------------------------------------- 214 // Single Character Pattern Search Strategy 215 //--------------------------------------------------------------------- 216 217 template <typename PatternChar, typename SubjectChar> 218 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 219 StringSearch<PatternChar, SubjectChar>* search, 220 Vector<const SubjectChar> subject, 221 int index) { 222 ASSERT_EQ(1, search->pattern_.length()); 223 PatternChar pattern_first_char = search->pattern_[0]; 224 int i = index; 225 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 226 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 227 memchr(subject.start() + i, 228 pattern_first_char, 229 subject.length() - i)); 230 if (pos == NULL) return -1; 231 return static_cast<int>(pos - subject.start()); 232 } else { 233 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 234 if (exceedsOneByte(pattern_first_char)) { 235 return -1; 236 } 237 } 238 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 239 int n = subject.length(); 240 while (i < n) { 241 if (subject[i++] == search_char) return i - 1; 242 } 243 return -1; 244 } 245 } 246 247 //--------------------------------------------------------------------- 248 // Linear Search Strategy 249 //--------------------------------------------------------------------- 250 251 252 template <typename PatternChar, typename SubjectChar> 253 inline bool CharCompare(const PatternChar* pattern, 254 const SubjectChar* subject, 255 int length) { 256 ASSERT(length > 0); 257 int pos = 0; 258 do { 259 if (pattern[pos] != subject[pos]) { 260 return false; 261 } 262 pos++; 263 } while (pos < length); 264 return true; 265 } 266 267 268 // Simple linear search for short patterns. Never bails out. 269 template <typename PatternChar, typename SubjectChar> 270 int StringSearch<PatternChar, SubjectChar>::LinearSearch( 271 StringSearch<PatternChar, SubjectChar>* search, 272 Vector<const SubjectChar> subject, 273 int index) { 274 Vector<const PatternChar> pattern = search->pattern_; 275 ASSERT(pattern.length() > 1); 276 int pattern_length = pattern.length(); 277 PatternChar pattern_first_char = pattern[0]; 278 int i = index; 279 int n = subject.length() - pattern_length; 280 while (i <= n) { 281 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 282 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 283 memchr(subject.start() + i, 284 pattern_first_char, 285 n - i + 1)); 286 if (pos == NULL) return -1; 287 i = static_cast<int>(pos - subject.start()) + 1; 288 } else { 289 if (subject[i++] != pattern_first_char) continue; 290 } 291 // Loop extracted to separate function to allow using return to do 292 // a deeper break. 293 if (CharCompare(pattern.start() + 1, 294 subject.start() + i, 295 pattern_length - 1)) { 296 return i - 1; 297 } 298 } 299 return -1; 300 } 301 302 //--------------------------------------------------------------------- 303 // Boyer-Moore string search 304 //--------------------------------------------------------------------- 305 306 template <typename PatternChar, typename SubjectChar> 307 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 308 StringSearch<PatternChar, SubjectChar>* search, 309 Vector<const SubjectChar> subject, 310 int start_index) { 311 Vector<const PatternChar> pattern = search->pattern_; 312 int subject_length = subject.length(); 313 int pattern_length = pattern.length(); 314 // Only preprocess at most kBMMaxShift last characters of pattern. 315 int start = search->start_; 316 317 int* bad_char_occurence = search->bad_char_table(); 318 int* good_suffix_shift = search->good_suffix_shift_table(); 319 320 PatternChar last_char = pattern[pattern_length - 1]; 321 int index = start_index; 322 // Continue search from i. 323 while (index <= subject_length - pattern_length) { 324 int j = pattern_length - 1; 325 int c; 326 while (last_char != (c = subject[index + j])) { 327 int shift = 328 j - CharOccurrence(bad_char_occurence, c); 329 index += shift; 330 if (index > subject_length - pattern_length) { 331 return -1; 332 } 333 } 334 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 335 if (j < 0) { 336 return index; 337 } else if (j < start) { 338 // we have matched more than our tables allow us to be smart about. 339 // Fall back on BMH shift. 340 index += pattern_length - 1 341 - CharOccurrence(bad_char_occurence, 342 static_cast<SubjectChar>(last_char)); 343 } else { 344 int gs_shift = good_suffix_shift[j + 1]; 345 int bc_occ = 346 CharOccurrence(bad_char_occurence, c); 347 int shift = j - bc_occ; 348 if (gs_shift > shift) { 349 shift = gs_shift; 350 } 351 index += shift; 352 } 353 } 354 355 return -1; 356 } 357 358 359 template <typename PatternChar, typename SubjectChar> 360 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 361 int pattern_length = pattern_.length(); 362 const PatternChar* pattern = pattern_.start(); 363 // Only look at the last kBMMaxShift characters of pattern (from start_ 364 // to pattern_length). 365 int start = start_; 366 int length = pattern_length - start; 367 368 // Biased tables so that we can use pattern indices as table indices, 369 // even if we only cover the part of the pattern from offset start. 370 int* shift_table = good_suffix_shift_table(); 371 int* suffix_table = this->suffix_table(); 372 373 // Initialize table. 374 for (int i = start; i < pattern_length; i++) { 375 shift_table[i] = length; 376 } 377 shift_table[pattern_length] = 1; 378 suffix_table[pattern_length] = pattern_length + 1; 379 380 if (pattern_length <= start) { 381 return; 382 } 383 384 // Find suffixes. 385 PatternChar last_char = pattern[pattern_length - 1]; 386 int suffix = pattern_length + 1; 387 { 388 int i = pattern_length; 389 while (i > start) { 390 PatternChar c = pattern[i - 1]; 391 while (suffix <= pattern_length && c != pattern[suffix - 1]) { 392 if (shift_table[suffix] == length) { 393 shift_table[suffix] = suffix - i; 394 } 395 suffix = suffix_table[suffix]; 396 } 397 suffix_table[--i] = --suffix; 398 if (suffix == pattern_length) { 399 // No suffix to extend, so we check against last_char only. 400 while ((i > start) && (pattern[i - 1] != last_char)) { 401 if (shift_table[pattern_length] == length) { 402 shift_table[pattern_length] = pattern_length - i; 403 } 404 suffix_table[--i] = pattern_length; 405 } 406 if (i > start) { 407 suffix_table[--i] = --suffix; 408 } 409 } 410 } 411 } 412 // Build shift table using suffixes. 413 if (suffix < pattern_length) { 414 for (int i = start; i <= pattern_length; i++) { 415 if (shift_table[i] == length) { 416 shift_table[i] = suffix - start; 417 } 418 if (i == suffix) { 419 suffix = suffix_table[suffix]; 420 } 421 } 422 } 423 } 424 425 //--------------------------------------------------------------------- 426 // Boyer-Moore-Horspool string search. 427 //--------------------------------------------------------------------- 428 429 template <typename PatternChar, typename SubjectChar> 430 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 431 StringSearch<PatternChar, SubjectChar>* search, 432 Vector<const SubjectChar> subject, 433 int start_index) { 434 Vector<const PatternChar> pattern = search->pattern_; 435 int subject_length = subject.length(); 436 int pattern_length = pattern.length(); 437 int* char_occurrences = search->bad_char_table(); 438 int badness = -pattern_length; 439 440 // How bad we are doing without a good-suffix table. 441 PatternChar last_char = pattern[pattern_length - 1]; 442 int last_char_shift = pattern_length - 1 - 443 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 444 // Perform search 445 int index = start_index; // No matches found prior to this index. 446 while (index <= subject_length - pattern_length) { 447 int j = pattern_length - 1; 448 int subject_char; 449 while (last_char != (subject_char = subject[index + j])) { 450 int bc_occ = CharOccurrence(char_occurrences, subject_char); 451 int shift = j - bc_occ; 452 index += shift; 453 badness += 1 - shift; // at most zero, so badness cannot increase. 454 if (index > subject_length - pattern_length) { 455 return -1; 456 } 457 } 458 j--; 459 while (j >= 0 && pattern[j] == (subject[index + j])) j--; 460 if (j < 0) { 461 return index; 462 } else { 463 index += last_char_shift; 464 // Badness increases by the number of characters we have 465 // checked, and decreases by the number of characters we 466 // can skip by shifting. It's a measure of how we are doing 467 // compared to reading each character exactly once. 468 badness += (pattern_length - j) - last_char_shift; 469 if (badness > 0) { 470 search->PopulateBoyerMooreTable(); 471 search->strategy_ = &BoyerMooreSearch; 472 return BoyerMooreSearch(search, subject, index); 473 } 474 } 475 } 476 return -1; 477 } 478 479 480 template <typename PatternChar, typename SubjectChar> 481 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 482 int pattern_length = pattern_.length(); 483 484 int* bad_char_occurrence = bad_char_table(); 485 486 // Only preprocess at most kBMMaxShift last characters of pattern. 487 int start = start_; 488 // Run forwards to populate bad_char_table, so that *last* instance 489 // of character equivalence class is the one registered. 490 // Notice: Doesn't include the last character. 491 int table_size = AlphabetSize(); 492 if (start == 0) { // All patterns less than kBMMaxShift in length. 493 memset(bad_char_occurrence, 494 -1, 495 table_size * sizeof(*bad_char_occurrence)); 496 } else { 497 for (int i = 0; i < table_size; i++) { 498 bad_char_occurrence[i] = start - 1; 499 } 500 } 501 for (int i = start; i < pattern_length - 1; i++) { 502 PatternChar c = pattern_[i]; 503 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 504 bad_char_occurrence[bucket] = i; 505 } 506 } 507 508 //--------------------------------------------------------------------- 509 // Linear string search with bailout to BMH. 510 //--------------------------------------------------------------------- 511 512 // Simple linear search for short patterns, which bails out if the string 513 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 514 template <typename PatternChar, typename SubjectChar> 515 int StringSearch<PatternChar, SubjectChar>::InitialSearch( 516 StringSearch<PatternChar, SubjectChar>* search, 517 Vector<const SubjectChar> subject, 518 int index) { 519 Vector<const PatternChar> pattern = search->pattern_; 520 int pattern_length = pattern.length(); 521 // Badness is a count of how much work we have done. When we have 522 // done enough work we decide it's probably worth switching to a better 523 // algorithm. 524 int badness = -10 - (pattern_length << 2); 525 526 // We know our pattern is at least 2 characters, we cache the first so 527 // the common case of the first character not matching is faster. 528 PatternChar pattern_first_char = pattern[0]; 529 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 530 badness++; 531 if (badness <= 0) { 532 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 533 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 534 memchr(subject.start() + i, 535 pattern_first_char, 536 n - i + 1)); 537 if (pos == NULL) { 538 return -1; 539 } 540 i = static_cast<int>(pos - subject.start()); 541 } else { 542 if (subject[i] != pattern_first_char) continue; 543 } 544 int j = 1; 545 do { 546 if (pattern[j] != subject[i + j]) { 547 break; 548 } 549 j++; 550 } while (j < pattern_length); 551 if (j == pattern_length) { 552 return i; 553 } 554 badness += j; 555 } else { 556 search->PopulateBoyerMooreHorspoolTable(); 557 search->strategy_ = &BoyerMooreHorspoolSearch; 558 return BoyerMooreHorspoolSearch(search, subject, i); 559 } 560 } 561 return -1; 562 } 563 564 565 // Perform a a single stand-alone search. 566 // If searching multiple times for the same pattern, a search 567 // object should be constructed once and the Search function then called 568 // for each search. 569 template <typename SubjectChar, typename PatternChar> 570 int SearchString(Isolate* isolate, 571 Vector<const SubjectChar> subject, 572 Vector<const PatternChar> pattern, 573 int start_index) { 574 StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 575 return search.Search(subject, start_index); 576 } 577 578 }} // namespace v8::internal 579 580 #endif // V8_STRING_SEARCH_H_ 581