1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_STRING_SEARCH_H_ 6 #define V8_STRING_SEARCH_H_ 7 8 #include "src/isolate.h" 9 #include "src/vector.h" 10 11 namespace v8 { 12 namespace internal { 13 14 15 //--------------------------------------------------------------------- 16 // String Search object. 17 //--------------------------------------------------------------------- 18 19 // Class holding constants and methods that apply to all string search variants, 20 // independently of subject and pattern char size. 21 class StringSearchBase { 22 protected: 23 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 24 // limit, we can fix the size of tables. For a needle longer than this limit, 25 // search will not be optimal, since we only build tables for a suffix 26 // of the string, but it is a safe approximation. 27 static const int kBMMaxShift = Isolate::kBMMaxShift; 28 29 // Reduce alphabet to this size. 30 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 31 // proportional to the input alphabet. We reduce the alphabet size by 32 // equating input characters modulo a smaller alphabet size. This gives 33 // a potentially less efficient searching, but is a safe approximation. 34 // For needles using only characters in the same Unicode 256-code point page, 35 // there is no search speed degradation. 36 static const int kLatin1AlphabetSize = 256; 37 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 38 39 // Bad-char shift table stored in the state. It's length is the alphabet size. 40 // For patterns below this length, the skip length of Boyer-Moore is too short 41 // to compensate for the algorithmic overhead compared to simple brute force. 42 static const int kBMMinPatternLength = 7; 43 44 static inline bool IsOneByteString(Vector<const uint8_t> string) { 45 return true; 46 } 47 48 static inline bool IsOneByteString(Vector<const uc16> string) { 49 return String::IsOneByte(string.start(), string.length()); 50 } 51 52 friend class Isolate; 53 }; 54 55 56 template <typename PatternChar, typename SubjectChar> 57 class StringSearch : private StringSearchBase { 58 public: 59 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) 60 : isolate_(isolate), 61 pattern_(pattern), 62 start_(Max(0, pattern.length() - kBMMaxShift)) { 63 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 64 if (!IsOneByteString(pattern_)) { 65 strategy_ = &FailSearch; 66 return; 67 } 68 } 69 int pattern_length = pattern_.length(); 70 if (pattern_length < kBMMinPatternLength) { 71 if (pattern_length == 1) { 72 strategy_ = &SingleCharSearch; 73 return; 74 } 75 strategy_ = &LinearSearch; 76 return; 77 } 78 strategy_ = &InitialSearch; 79 } 80 81 int Search(Vector<const SubjectChar> subject, int index) { 82 return strategy_(this, subject, index); 83 } 84 85 static inline int AlphabetSize() { 86 if (sizeof(PatternChar) == 1) { 87 // Latin1 needle. 88 return kLatin1AlphabetSize; 89 } else { 90 DCHECK(sizeof(PatternChar) == 2); 91 // UC16 needle. 92 return kUC16AlphabetSize; 93 } 94 } 95 96 private: 97 typedef int (*SearchFunction)( // NOLINT - it's not a cast! 98 StringSearch<PatternChar, SubjectChar>*, 99 Vector<const SubjectChar>, 100 int); 101 102 static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 103 Vector<const SubjectChar>, 104 int) { 105 return -1; 106 } 107 108 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 109 Vector<const SubjectChar> subject, 110 int start_index); 111 112 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 113 Vector<const SubjectChar> subject, 114 int start_index); 115 116 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 117 Vector<const SubjectChar> subject, 118 int start_index); 119 120 static int BoyerMooreHorspoolSearch( 121 StringSearch<PatternChar, SubjectChar>* search, 122 Vector<const SubjectChar> subject, 123 int start_index); 124 125 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 126 Vector<const SubjectChar> subject, 127 int start_index); 128 129 void PopulateBoyerMooreHorspoolTable(); 130 131 void PopulateBoyerMooreTable(); 132 133 static inline bool exceedsOneByte(uint8_t c) { 134 return false; 135 } 136 137 static inline bool exceedsOneByte(uint16_t c) { 138 return c > String::kMaxOneByteCharCodeU; 139 } 140 141 static inline int CharOccurrence(int* bad_char_occurrence, 142 SubjectChar char_code) { 143 if (sizeof(SubjectChar) == 1) { 144 return bad_char_occurrence[static_cast<int>(char_code)]; 145 } 146 if (sizeof(PatternChar) == 1) { 147 if (exceedsOneByte(char_code)) { 148 return -1; 149 } 150 return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 151 } 152 // Both pattern and subject are UC16. Reduce character to equivalence class. 153 int equiv_class = char_code % kUC16AlphabetSize; 154 return bad_char_occurrence[equiv_class]; 155 } 156 157 // The following tables are shared by all searches. 158 // TODO(lrn): Introduce a way for a pattern to keep its tables 159 // between searches (e.g., for an Atom RegExp). 160 161 // Store for the BoyerMoore(Horspool) bad char shift table. 162 // Return a table covering the last kBMMaxShift+1 positions of 163 // pattern. 164 int* bad_char_table() { 165 return isolate_->bad_char_shift_table(); 166 } 167 168 // Store for the BoyerMoore good suffix shift table. 169 int* good_suffix_shift_table() { 170 // Return biased pointer that maps the range [start_..pattern_.length() 171 // to the kGoodSuffixShiftTable array. 172 return isolate_->good_suffix_shift_table() - start_; 173 } 174 175 // Table used temporarily while building the BoyerMoore good suffix 176 // shift table. 177 int* suffix_table() { 178 // Return biased pointer that maps the range [start_..pattern_.length() 179 // to the kSuffixTable array. 180 return isolate_->suffix_table() - start_; 181 } 182 183 Isolate* isolate_; 184 // The pattern to search for. 185 Vector<const PatternChar> pattern_; 186 // Pointer to implementation of the search. 187 SearchFunction strategy_; 188 // Cache value of Max(0, pattern_length() - kBMMaxShift) 189 int start_; 190 }; 191 192 193 template <typename T, typename U> 194 inline T AlignDown(T value, U alignment) { 195 return reinterpret_cast<T>( 196 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); 197 } 198 199 200 inline uint8_t GetHighestValueByte(uc16 character) { 201 return Max(static_cast<uint8_t>(character & 0xFF), 202 static_cast<uint8_t>(character >> 8)); 203 } 204 205 206 inline uint8_t GetHighestValueByte(uint8_t character) { return character; } 207 208 209 template <typename PatternChar, typename SubjectChar> 210 inline int FindFirstCharacter(Vector<const PatternChar> pattern, 211 Vector<const SubjectChar> subject, int index) { 212 const PatternChar pattern_first_char = pattern[0]; 213 const int max_n = (subject.length() - pattern.length() + 1); 214 215 const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 216 const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 217 int pos = index; 218 do { 219 DCHECK_GE(max_n - pos, 0); 220 const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( 221 memchr(subject.start() + pos, search_byte, 222 (max_n - pos) * sizeof(SubjectChar))); 223 if (char_pos == NULL) return -1; 224 char_pos = AlignDown(char_pos, sizeof(SubjectChar)); 225 pos = static_cast<int>(char_pos - subject.start()); 226 if (subject[pos] == search_char) return pos; 227 } while (++pos < max_n); 228 229 return -1; 230 } 231 232 233 //--------------------------------------------------------------------- 234 // Single Character Pattern Search Strategy 235 //--------------------------------------------------------------------- 236 237 template <typename PatternChar, typename SubjectChar> 238 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 239 StringSearch<PatternChar, SubjectChar>* search, 240 Vector<const SubjectChar> subject, 241 int index) { 242 DCHECK_EQ(1, search->pattern_.length()); 243 PatternChar pattern_first_char = search->pattern_[0]; 244 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 245 if (exceedsOneByte(pattern_first_char)) { 246 return -1; 247 } 248 } 249 return FindFirstCharacter(search->pattern_, subject, index); 250 } 251 252 //--------------------------------------------------------------------- 253 // Linear Search Strategy 254 //--------------------------------------------------------------------- 255 256 257 template <typename PatternChar, typename SubjectChar> 258 inline bool CharCompare(const PatternChar* pattern, 259 const SubjectChar* subject, 260 int length) { 261 DCHECK(length > 0); 262 int pos = 0; 263 do { 264 if (pattern[pos] != subject[pos]) { 265 return false; 266 } 267 pos++; 268 } while (pos < length); 269 return true; 270 } 271 272 273 // Simple linear search for short patterns. Never bails out. 274 template <typename PatternChar, typename SubjectChar> 275 int StringSearch<PatternChar, SubjectChar>::LinearSearch( 276 StringSearch<PatternChar, SubjectChar>* search, 277 Vector<const SubjectChar> subject, 278 int index) { 279 Vector<const PatternChar> pattern = search->pattern_; 280 DCHECK(pattern.length() > 1); 281 int pattern_length = pattern.length(); 282 int i = index; 283 int n = subject.length() - pattern_length; 284 while (i <= n) { 285 i = FindFirstCharacter(pattern, subject, i); 286 if (i == -1) return -1; 287 DCHECK_LE(i, n); 288 i++; 289 // Loop extracted to separate function to allow using return to do 290 // a deeper break. 291 if (CharCompare(pattern.start() + 1, 292 subject.start() + i, 293 pattern_length - 1)) { 294 return i - 1; 295 } 296 } 297 return -1; 298 } 299 300 //--------------------------------------------------------------------- 301 // Boyer-Moore string search 302 //--------------------------------------------------------------------- 303 304 template <typename PatternChar, typename SubjectChar> 305 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 306 StringSearch<PatternChar, SubjectChar>* search, 307 Vector<const SubjectChar> subject, 308 int start_index) { 309 Vector<const PatternChar> pattern = search->pattern_; 310 int subject_length = subject.length(); 311 int pattern_length = pattern.length(); 312 // Only preprocess at most kBMMaxShift last characters of pattern. 313 int start = search->start_; 314 315 int* bad_char_occurence = search->bad_char_table(); 316 int* good_suffix_shift = search->good_suffix_shift_table(); 317 318 PatternChar last_char = pattern[pattern_length - 1]; 319 int index = start_index; 320 // Continue search from i. 321 while (index <= subject_length - pattern_length) { 322 int j = pattern_length - 1; 323 int c; 324 while (last_char != (c = subject[index + j])) { 325 int shift = 326 j - CharOccurrence(bad_char_occurence, c); 327 index += shift; 328 if (index > subject_length - pattern_length) { 329 return -1; 330 } 331 } 332 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 333 if (j < 0) { 334 return index; 335 } else if (j < start) { 336 // we have matched more than our tables allow us to be smart about. 337 // Fall back on BMH shift. 338 index += pattern_length - 1 339 - CharOccurrence(bad_char_occurence, 340 static_cast<SubjectChar>(last_char)); 341 } else { 342 int gs_shift = good_suffix_shift[j + 1]; 343 int bc_occ = 344 CharOccurrence(bad_char_occurence, c); 345 int shift = j - bc_occ; 346 if (gs_shift > shift) { 347 shift = gs_shift; 348 } 349 index += shift; 350 } 351 } 352 353 return -1; 354 } 355 356 357 template <typename PatternChar, typename SubjectChar> 358 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 359 int pattern_length = pattern_.length(); 360 const PatternChar* pattern = pattern_.start(); 361 // Only look at the last kBMMaxShift characters of pattern (from start_ 362 // to pattern_length). 363 int start = start_; 364 int length = pattern_length - start; 365 366 // Biased tables so that we can use pattern indices as table indices, 367 // even if we only cover the part of the pattern from offset start. 368 int* shift_table = good_suffix_shift_table(); 369 int* suffix_table = this->suffix_table(); 370 371 // Initialize table. 372 for (int i = start; i < pattern_length; i++) { 373 shift_table[i] = length; 374 } 375 shift_table[pattern_length] = 1; 376 suffix_table[pattern_length] = pattern_length + 1; 377 378 if (pattern_length <= start) { 379 return; 380 } 381 382 // Find suffixes. 383 PatternChar last_char = pattern[pattern_length - 1]; 384 int suffix = pattern_length + 1; 385 { 386 int i = pattern_length; 387 while (i > start) { 388 PatternChar c = pattern[i - 1]; 389 while (suffix <= pattern_length && c != pattern[suffix - 1]) { 390 if (shift_table[suffix] == length) { 391 shift_table[suffix] = suffix - i; 392 } 393 suffix = suffix_table[suffix]; 394 } 395 suffix_table[--i] = --suffix; 396 if (suffix == pattern_length) { 397 // No suffix to extend, so we check against last_char only. 398 while ((i > start) && (pattern[i - 1] != last_char)) { 399 if (shift_table[pattern_length] == length) { 400 shift_table[pattern_length] = pattern_length - i; 401 } 402 suffix_table[--i] = pattern_length; 403 } 404 if (i > start) { 405 suffix_table[--i] = --suffix; 406 } 407 } 408 } 409 } 410 // Build shift table using suffixes. 411 if (suffix < pattern_length) { 412 for (int i = start; i <= pattern_length; i++) { 413 if (shift_table[i] == length) { 414 shift_table[i] = suffix - start; 415 } 416 if (i == suffix) { 417 suffix = suffix_table[suffix]; 418 } 419 } 420 } 421 } 422 423 //--------------------------------------------------------------------- 424 // Boyer-Moore-Horspool string search. 425 //--------------------------------------------------------------------- 426 427 template <typename PatternChar, typename SubjectChar> 428 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 429 StringSearch<PatternChar, SubjectChar>* search, 430 Vector<const SubjectChar> subject, 431 int start_index) { 432 Vector<const PatternChar> pattern = search->pattern_; 433 int subject_length = subject.length(); 434 int pattern_length = pattern.length(); 435 int* char_occurrences = search->bad_char_table(); 436 int badness = -pattern_length; 437 438 // How bad we are doing without a good-suffix table. 439 PatternChar last_char = pattern[pattern_length - 1]; 440 int last_char_shift = pattern_length - 1 - 441 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 442 // Perform search 443 int index = start_index; // No matches found prior to this index. 444 while (index <= subject_length - pattern_length) { 445 int j = pattern_length - 1; 446 int subject_char; 447 while (last_char != (subject_char = subject[index + j])) { 448 int bc_occ = CharOccurrence(char_occurrences, subject_char); 449 int shift = j - bc_occ; 450 index += shift; 451 badness += 1 - shift; // at most zero, so badness cannot increase. 452 if (index > subject_length - pattern_length) { 453 return -1; 454 } 455 } 456 j--; 457 while (j >= 0 && pattern[j] == (subject[index + j])) j--; 458 if (j < 0) { 459 return index; 460 } else { 461 index += last_char_shift; 462 // Badness increases by the number of characters we have 463 // checked, and decreases by the number of characters we 464 // can skip by shifting. It's a measure of how we are doing 465 // compared to reading each character exactly once. 466 badness += (pattern_length - j) - last_char_shift; 467 if (badness > 0) { 468 search->PopulateBoyerMooreTable(); 469 search->strategy_ = &BoyerMooreSearch; 470 return BoyerMooreSearch(search, subject, index); 471 } 472 } 473 } 474 return -1; 475 } 476 477 478 template <typename PatternChar, typename SubjectChar> 479 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 480 int pattern_length = pattern_.length(); 481 482 int* bad_char_occurrence = bad_char_table(); 483 484 // Only preprocess at most kBMMaxShift last characters of pattern. 485 int start = start_; 486 // Run forwards to populate bad_char_table, so that *last* instance 487 // of character equivalence class is the one registered. 488 // Notice: Doesn't include the last character. 489 int table_size = AlphabetSize(); 490 if (start == 0) { // All patterns less than kBMMaxShift in length. 491 memset(bad_char_occurrence, 492 -1, 493 table_size * sizeof(*bad_char_occurrence)); 494 } else { 495 for (int i = 0; i < table_size; i++) { 496 bad_char_occurrence[i] = start - 1; 497 } 498 } 499 for (int i = start; i < pattern_length - 1; i++) { 500 PatternChar c = pattern_[i]; 501 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 502 bad_char_occurrence[bucket] = i; 503 } 504 } 505 506 //--------------------------------------------------------------------- 507 // Linear string search with bailout to BMH. 508 //--------------------------------------------------------------------- 509 510 // Simple linear search for short patterns, which bails out if the string 511 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 512 template <typename PatternChar, typename SubjectChar> 513 int StringSearch<PatternChar, SubjectChar>::InitialSearch( 514 StringSearch<PatternChar, SubjectChar>* search, 515 Vector<const SubjectChar> subject, 516 int index) { 517 Vector<const PatternChar> pattern = search->pattern_; 518 int pattern_length = pattern.length(); 519 // Badness is a count of how much work we have done. When we have 520 // done enough work we decide it's probably worth switching to a better 521 // algorithm. 522 int badness = -10 - (pattern_length << 2); 523 524 // We know our pattern is at least 2 characters, we cache the first so 525 // the common case of the first character not matching is faster. 526 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 527 badness++; 528 if (badness <= 0) { 529 i = FindFirstCharacter(pattern, subject, i); 530 if (i == -1) return -1; 531 DCHECK_LE(i, n); 532 int j = 1; 533 do { 534 if (pattern[j] != subject[i + j]) { 535 break; 536 } 537 j++; 538 } while (j < pattern_length); 539 if (j == pattern_length) { 540 return i; 541 } 542 badness += j; 543 } else { 544 search->PopulateBoyerMooreHorspoolTable(); 545 search->strategy_ = &BoyerMooreHorspoolSearch; 546 return BoyerMooreHorspoolSearch(search, subject, i); 547 } 548 } 549 return -1; 550 } 551 552 553 // Perform a a single stand-alone search. 554 // If searching multiple times for the same pattern, a search 555 // object should be constructed once and the Search function then called 556 // for each search. 557 template <typename SubjectChar, typename PatternChar> 558 int SearchString(Isolate* isolate, 559 Vector<const SubjectChar> subject, 560 Vector<const PatternChar> pattern, 561 int start_index) { 562 StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 563 return search.Search(subject, start_index); 564 } 565 566 } // namespace internal 567 } // namespace v8 568 569 #endif // V8_STRING_SEARCH_H_ 570