1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_STRING_SEARCH_H_ 6 #define V8_STRING_SEARCH_H_ 7 8 namespace v8 { 9 namespace internal { 10 11 12 //--------------------------------------------------------------------- 13 // String Search object. 14 //--------------------------------------------------------------------- 15 16 // Class holding constants and methods that apply to all string search variants, 17 // independently of subject and pattern char size. 18 class StringSearchBase { 19 protected: 20 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 21 // limit, we can fix the size of tables. For a needle longer than this limit, 22 // search will not be optimal, since we only build tables for a suffix 23 // of the string, but it is a safe approximation. 24 static const int kBMMaxShift = Isolate::kBMMaxShift; 25 26 // Reduce alphabet to this size. 27 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 28 // proportional to the input alphabet. We reduce the alphabet size by 29 // equating input characters modulo a smaller alphabet size. This gives 30 // a potentially less efficient searching, but is a safe approximation. 31 // For needles using only characters in the same Unicode 256-code point page, 32 // there is no search speed degradation. 33 static const int kLatin1AlphabetSize = 256; 34 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 35 36 // Bad-char shift table stored in the state. It's length is the alphabet size. 37 // For patterns below this length, the skip length of Boyer-Moore is too short 38 // to compensate for the algorithmic overhead compared to simple brute force. 39 static const int kBMMinPatternLength = 7; 40 41 static inline bool IsOneByteString(Vector<const uint8_t> string) { 42 return true; 43 } 44 45 static inline bool IsOneByteString(Vector<const uc16> string) { 46 return String::IsOneByte(string.start(), string.length()); 47 } 48 49 friend class Isolate; 50 }; 51 52 53 template <typename PatternChar, typename SubjectChar> 54 class StringSearch : private StringSearchBase { 55 public: 56 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) 57 : isolate_(isolate), 58 pattern_(pattern), 59 start_(Max(0, pattern.length() - kBMMaxShift)) { 60 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 61 if (!IsOneByteString(pattern_)) { 62 strategy_ = &FailSearch; 63 return; 64 } 65 } 66 int pattern_length = pattern_.length(); 67 if (pattern_length < kBMMinPatternLength) { 68 if (pattern_length == 1) { 69 strategy_ = &SingleCharSearch; 70 return; 71 } 72 strategy_ = &LinearSearch; 73 return; 74 } 75 strategy_ = &InitialSearch; 76 } 77 78 int Search(Vector<const SubjectChar> subject, int index) { 79 return strategy_(this, subject, index); 80 } 81 82 static inline int AlphabetSize() { 83 if (sizeof(PatternChar) == 1) { 84 // Latin1 needle. 85 return kLatin1AlphabetSize; 86 } else { 87 DCHECK(sizeof(PatternChar) == 2); 88 // UC16 needle. 89 return kUC16AlphabetSize; 90 } 91 } 92 93 private: 94 typedef int (*SearchFunction)( // NOLINT - it's not a cast! 95 StringSearch<PatternChar, SubjectChar>*, 96 Vector<const SubjectChar>, 97 int); 98 99 static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 100 Vector<const SubjectChar>, 101 int) { 102 return -1; 103 } 104 105 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 106 Vector<const SubjectChar> subject, 107 int start_index); 108 109 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 110 Vector<const SubjectChar> subject, 111 int start_index); 112 113 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 114 Vector<const SubjectChar> subject, 115 int start_index); 116 117 static int BoyerMooreHorspoolSearch( 118 StringSearch<PatternChar, SubjectChar>* search, 119 Vector<const SubjectChar> subject, 120 int start_index); 121 122 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 123 Vector<const SubjectChar> subject, 124 int start_index); 125 126 void PopulateBoyerMooreHorspoolTable(); 127 128 void PopulateBoyerMooreTable(); 129 130 static inline bool exceedsOneByte(uint8_t c) { 131 return false; 132 } 133 134 static inline bool exceedsOneByte(uint16_t c) { 135 return c > String::kMaxOneByteCharCodeU; 136 } 137 138 static inline int CharOccurrence(int* bad_char_occurrence, 139 SubjectChar char_code) { 140 if (sizeof(SubjectChar) == 1) { 141 return bad_char_occurrence[static_cast<int>(char_code)]; 142 } 143 if (sizeof(PatternChar) == 1) { 144 if (exceedsOneByte(char_code)) { 145 return -1; 146 } 147 return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 148 } 149 // Both pattern and subject are UC16. Reduce character to equivalence class. 150 int equiv_class = char_code % kUC16AlphabetSize; 151 return bad_char_occurrence[equiv_class]; 152 } 153 154 // The following tables are shared by all searches. 155 // TODO(lrn): Introduce a way for a pattern to keep its tables 156 // between searches (e.g., for an Atom RegExp). 157 158 // Store for the BoyerMoore(Horspool) bad char shift table. 159 // Return a table covering the last kBMMaxShift+1 positions of 160 // pattern. 161 int* bad_char_table() { 162 return isolate_->bad_char_shift_table(); 163 } 164 165 // Store for the BoyerMoore good suffix shift table. 166 int* good_suffix_shift_table() { 167 // Return biased pointer that maps the range [start_..pattern_.length() 168 // to the kGoodSuffixShiftTable array. 169 return isolate_->good_suffix_shift_table() - start_; 170 } 171 172 // Table used temporarily while building the BoyerMoore good suffix 173 // shift table. 174 int* suffix_table() { 175 // Return biased pointer that maps the range [start_..pattern_.length() 176 // to the kSuffixTable array. 177 return isolate_->suffix_table() - start_; 178 } 179 180 Isolate* isolate_; 181 // The pattern to search for. 182 Vector<const PatternChar> pattern_; 183 // Pointer to implementation of the search. 184 SearchFunction strategy_; 185 // Cache value of Max(0, pattern_length() - kBMMaxShift) 186 int start_; 187 }; 188 189 190 //--------------------------------------------------------------------- 191 // Single Character Pattern Search Strategy 192 //--------------------------------------------------------------------- 193 194 template <typename PatternChar, typename SubjectChar> 195 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 196 StringSearch<PatternChar, SubjectChar>* search, 197 Vector<const SubjectChar> subject, 198 int index) { 199 DCHECK_EQ(1, search->pattern_.length()); 200 PatternChar pattern_first_char = search->pattern_[0]; 201 int i = index; 202 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 203 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 204 memchr(subject.start() + i, 205 pattern_first_char, 206 subject.length() - i)); 207 if (pos == NULL) return -1; 208 return static_cast<int>(pos - subject.start()); 209 } else { 210 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 211 if (exceedsOneByte(pattern_first_char)) { 212 return -1; 213 } 214 } 215 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 216 int n = subject.length(); 217 while (i < n) { 218 if (subject[i++] == search_char) return i - 1; 219 } 220 return -1; 221 } 222 } 223 224 //--------------------------------------------------------------------- 225 // Linear Search Strategy 226 //--------------------------------------------------------------------- 227 228 229 template <typename PatternChar, typename SubjectChar> 230 inline bool CharCompare(const PatternChar* pattern, 231 const SubjectChar* subject, 232 int length) { 233 DCHECK(length > 0); 234 int pos = 0; 235 do { 236 if (pattern[pos] != subject[pos]) { 237 return false; 238 } 239 pos++; 240 } while (pos < length); 241 return true; 242 } 243 244 245 // Simple linear search for short patterns. Never bails out. 246 template <typename PatternChar, typename SubjectChar> 247 int StringSearch<PatternChar, SubjectChar>::LinearSearch( 248 StringSearch<PatternChar, SubjectChar>* search, 249 Vector<const SubjectChar> subject, 250 int index) { 251 Vector<const PatternChar> pattern = search->pattern_; 252 DCHECK(pattern.length() > 1); 253 int pattern_length = pattern.length(); 254 PatternChar pattern_first_char = pattern[0]; 255 int i = index; 256 int n = subject.length() - pattern_length; 257 while (i <= n) { 258 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 259 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 260 memchr(subject.start() + i, 261 pattern_first_char, 262 n - i + 1)); 263 if (pos == NULL) return -1; 264 i = static_cast<int>(pos - subject.start()) + 1; 265 } else { 266 if (subject[i++] != pattern_first_char) continue; 267 } 268 // Loop extracted to separate function to allow using return to do 269 // a deeper break. 270 if (CharCompare(pattern.start() + 1, 271 subject.start() + i, 272 pattern_length - 1)) { 273 return i - 1; 274 } 275 } 276 return -1; 277 } 278 279 //--------------------------------------------------------------------- 280 // Boyer-Moore string search 281 //--------------------------------------------------------------------- 282 283 template <typename PatternChar, typename SubjectChar> 284 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 285 StringSearch<PatternChar, SubjectChar>* search, 286 Vector<const SubjectChar> subject, 287 int start_index) { 288 Vector<const PatternChar> pattern = search->pattern_; 289 int subject_length = subject.length(); 290 int pattern_length = pattern.length(); 291 // Only preprocess at most kBMMaxShift last characters of pattern. 292 int start = search->start_; 293 294 int* bad_char_occurence = search->bad_char_table(); 295 int* good_suffix_shift = search->good_suffix_shift_table(); 296 297 PatternChar last_char = pattern[pattern_length - 1]; 298 int index = start_index; 299 // Continue search from i. 300 while (index <= subject_length - pattern_length) { 301 int j = pattern_length - 1; 302 int c; 303 while (last_char != (c = subject[index + j])) { 304 int shift = 305 j - CharOccurrence(bad_char_occurence, c); 306 index += shift; 307 if (index > subject_length - pattern_length) { 308 return -1; 309 } 310 } 311 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 312 if (j < 0) { 313 return index; 314 } else if (j < start) { 315 // we have matched more than our tables allow us to be smart about. 316 // Fall back on BMH shift. 317 index += pattern_length - 1 318 - CharOccurrence(bad_char_occurence, 319 static_cast<SubjectChar>(last_char)); 320 } else { 321 int gs_shift = good_suffix_shift[j + 1]; 322 int bc_occ = 323 CharOccurrence(bad_char_occurence, c); 324 int shift = j - bc_occ; 325 if (gs_shift > shift) { 326 shift = gs_shift; 327 } 328 index += shift; 329 } 330 } 331 332 return -1; 333 } 334 335 336 template <typename PatternChar, typename SubjectChar> 337 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 338 int pattern_length = pattern_.length(); 339 const PatternChar* pattern = pattern_.start(); 340 // Only look at the last kBMMaxShift characters of pattern (from start_ 341 // to pattern_length). 342 int start = start_; 343 int length = pattern_length - start; 344 345 // Biased tables so that we can use pattern indices as table indices, 346 // even if we only cover the part of the pattern from offset start. 347 int* shift_table = good_suffix_shift_table(); 348 int* suffix_table = this->suffix_table(); 349 350 // Initialize table. 351 for (int i = start; i < pattern_length; i++) { 352 shift_table[i] = length; 353 } 354 shift_table[pattern_length] = 1; 355 suffix_table[pattern_length] = pattern_length + 1; 356 357 if (pattern_length <= start) { 358 return; 359 } 360 361 // Find suffixes. 362 PatternChar last_char = pattern[pattern_length - 1]; 363 int suffix = pattern_length + 1; 364 { 365 int i = pattern_length; 366 while (i > start) { 367 PatternChar c = pattern[i - 1]; 368 while (suffix <= pattern_length && c != pattern[suffix - 1]) { 369 if (shift_table[suffix] == length) { 370 shift_table[suffix] = suffix - i; 371 } 372 suffix = suffix_table[suffix]; 373 } 374 suffix_table[--i] = --suffix; 375 if (suffix == pattern_length) { 376 // No suffix to extend, so we check against last_char only. 377 while ((i > start) && (pattern[i - 1] != last_char)) { 378 if (shift_table[pattern_length] == length) { 379 shift_table[pattern_length] = pattern_length - i; 380 } 381 suffix_table[--i] = pattern_length; 382 } 383 if (i > start) { 384 suffix_table[--i] = --suffix; 385 } 386 } 387 } 388 } 389 // Build shift table using suffixes. 390 if (suffix < pattern_length) { 391 for (int i = start; i <= pattern_length; i++) { 392 if (shift_table[i] == length) { 393 shift_table[i] = suffix - start; 394 } 395 if (i == suffix) { 396 suffix = suffix_table[suffix]; 397 } 398 } 399 } 400 } 401 402 //--------------------------------------------------------------------- 403 // Boyer-Moore-Horspool string search. 404 //--------------------------------------------------------------------- 405 406 template <typename PatternChar, typename SubjectChar> 407 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 408 StringSearch<PatternChar, SubjectChar>* search, 409 Vector<const SubjectChar> subject, 410 int start_index) { 411 Vector<const PatternChar> pattern = search->pattern_; 412 int subject_length = subject.length(); 413 int pattern_length = pattern.length(); 414 int* char_occurrences = search->bad_char_table(); 415 int badness = -pattern_length; 416 417 // How bad we are doing without a good-suffix table. 418 PatternChar last_char = pattern[pattern_length - 1]; 419 int last_char_shift = pattern_length - 1 - 420 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 421 // Perform search 422 int index = start_index; // No matches found prior to this index. 423 while (index <= subject_length - pattern_length) { 424 int j = pattern_length - 1; 425 int subject_char; 426 while (last_char != (subject_char = subject[index + j])) { 427 int bc_occ = CharOccurrence(char_occurrences, subject_char); 428 int shift = j - bc_occ; 429 index += shift; 430 badness += 1 - shift; // at most zero, so badness cannot increase. 431 if (index > subject_length - pattern_length) { 432 return -1; 433 } 434 } 435 j--; 436 while (j >= 0 && pattern[j] == (subject[index + j])) j--; 437 if (j < 0) { 438 return index; 439 } else { 440 index += last_char_shift; 441 // Badness increases by the number of characters we have 442 // checked, and decreases by the number of characters we 443 // can skip by shifting. It's a measure of how we are doing 444 // compared to reading each character exactly once. 445 badness += (pattern_length - j) - last_char_shift; 446 if (badness > 0) { 447 search->PopulateBoyerMooreTable(); 448 search->strategy_ = &BoyerMooreSearch; 449 return BoyerMooreSearch(search, subject, index); 450 } 451 } 452 } 453 return -1; 454 } 455 456 457 template <typename PatternChar, typename SubjectChar> 458 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 459 int pattern_length = pattern_.length(); 460 461 int* bad_char_occurrence = bad_char_table(); 462 463 // Only preprocess at most kBMMaxShift last characters of pattern. 464 int start = start_; 465 // Run forwards to populate bad_char_table, so that *last* instance 466 // of character equivalence class is the one registered. 467 // Notice: Doesn't include the last character. 468 int table_size = AlphabetSize(); 469 if (start == 0) { // All patterns less than kBMMaxShift in length. 470 memset(bad_char_occurrence, 471 -1, 472 table_size * sizeof(*bad_char_occurrence)); 473 } else { 474 for (int i = 0; i < table_size; i++) { 475 bad_char_occurrence[i] = start - 1; 476 } 477 } 478 for (int i = start; i < pattern_length - 1; i++) { 479 PatternChar c = pattern_[i]; 480 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 481 bad_char_occurrence[bucket] = i; 482 } 483 } 484 485 //--------------------------------------------------------------------- 486 // Linear string search with bailout to BMH. 487 //--------------------------------------------------------------------- 488 489 // Simple linear search for short patterns, which bails out if the string 490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 491 template <typename PatternChar, typename SubjectChar> 492 int StringSearch<PatternChar, SubjectChar>::InitialSearch( 493 StringSearch<PatternChar, SubjectChar>* search, 494 Vector<const SubjectChar> subject, 495 int index) { 496 Vector<const PatternChar> pattern = search->pattern_; 497 int pattern_length = pattern.length(); 498 // Badness is a count of how much work we have done. When we have 499 // done enough work we decide it's probably worth switching to a better 500 // algorithm. 501 int badness = -10 - (pattern_length << 2); 502 503 // We know our pattern is at least 2 characters, we cache the first so 504 // the common case of the first character not matching is faster. 505 PatternChar pattern_first_char = pattern[0]; 506 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 507 badness++; 508 if (badness <= 0) { 509 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 510 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 511 memchr(subject.start() + i, 512 pattern_first_char, 513 n - i + 1)); 514 if (pos == NULL) { 515 return -1; 516 } 517 i = static_cast<int>(pos - subject.start()); 518 } else { 519 if (subject[i] != pattern_first_char) continue; 520 } 521 int j = 1; 522 do { 523 if (pattern[j] != subject[i + j]) { 524 break; 525 } 526 j++; 527 } while (j < pattern_length); 528 if (j == pattern_length) { 529 return i; 530 } 531 badness += j; 532 } else { 533 search->PopulateBoyerMooreHorspoolTable(); 534 search->strategy_ = &BoyerMooreHorspoolSearch; 535 return BoyerMooreHorspoolSearch(search, subject, i); 536 } 537 } 538 return -1; 539 } 540 541 542 // Perform a a single stand-alone search. 543 // If searching multiple times for the same pattern, a search 544 // object should be constructed once and the Search function then called 545 // for each search. 546 template <typename SubjectChar, typename PatternChar> 547 int SearchString(Isolate* isolate, 548 Vector<const SubjectChar> subject, 549 Vector<const PatternChar> pattern, 550 int start_index) { 551 StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 552 return search.Search(subject, start_index); 553 } 554 555 }} // namespace v8::internal 556 557 #endif // V8_STRING_SEARCH_H_ 558