1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Redistribution and use in source and binary forms, with or without 3 // modification, are permitted provided that the following conditions are 4 // met: 5 // 6 // * Redistributions of source code must retain the above copyright 7 // notice, this list of conditions and the following disclaimer. 8 // * Redistributions in binary form must reproduce the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer in the documentation and/or other materials provided 11 // with the distribution. 12 // * Neither the name of Google Inc. nor the names of its 13 // contributors may be used to endorse or promote products derived 14 // from this software without specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28 #ifndef V8_STRING_SEARCH_H_ 29 #define V8_STRING_SEARCH_H_ 30 31 namespace v8 { 32 namespace internal { 33 34 35 //--------------------------------------------------------------------- 36 // String Search object. 37 //--------------------------------------------------------------------- 38 39 // Class holding constants and methods that apply to all string search variants, 40 // independently of subject and pattern char size. 41 class StringSearchBase { 42 protected: 43 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 44 // limit, we can fix the size of tables. For a needle longer than this limit, 45 // search will not be optimal, since we only build tables for a suffix 46 // of the string, but it is a safe approximation. 47 static const int kBMMaxShift = Isolate::kBMMaxShift; 48 49 // Reduce alphabet to this size. 50 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 51 // proportional to the input alphabet. We reduce the alphabet size by 52 // equating input characters modulo a smaller alphabet size. This gives 53 // a potentially less efficient searching, but is a safe approximation. 54 // For needles using only characters in the same Unicode 256-code point page, 55 // there is no search speed degradation. 56 static const int kAsciiAlphabetSize = 128; 57 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 58 59 // Bad-char shift table stored in the state. It's length is the alphabet size. 60 // For patterns below this length, the skip length of Boyer-Moore is too short 61 // to compensate for the algorithmic overhead compared to simple brute force. 62 static const int kBMMinPatternLength = 7; 63 64 static inline bool IsAsciiString(Vector<const char>) { 65 return true; 66 } 67 68 static inline bool IsAsciiString(Vector<const uc16> string) { 69 return String::IsAscii(string.start(), string.length()); 70 } 71 72 friend class Isolate; 73 }; 74 75 76 template <typename PatternChar, typename SubjectChar> 77 class StringSearch : private StringSearchBase { 78 public: 79 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) 80 : isolate_(isolate), 81 pattern_(pattern), 82 start_(Max(0, pattern.length() - kBMMaxShift)) { 83 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 84 if (!IsAsciiString(pattern_)) { 85 strategy_ = &FailSearch; 86 return; 87 } 88 } 89 int pattern_length = pattern_.length(); 90 if (pattern_length < kBMMinPatternLength) { 91 if (pattern_length == 1) { 92 strategy_ = &SingleCharSearch; 93 return; 94 } 95 strategy_ = &LinearSearch; 96 return; 97 } 98 strategy_ = &InitialSearch; 99 } 100 101 int Search(Vector<const SubjectChar> subject, int index) { 102 return strategy_(this, subject, index); 103 } 104 105 static inline int AlphabetSize() { 106 if (sizeof(PatternChar) == 1) { 107 // ASCII needle. 108 return kAsciiAlphabetSize; 109 } else { 110 ASSERT(sizeof(PatternChar) == 2); 111 // UC16 needle. 112 return kUC16AlphabetSize; 113 } 114 } 115 116 private: 117 typedef int (*SearchFunction)( // NOLINT - it's not a cast! 118 StringSearch<PatternChar, SubjectChar>*, 119 Vector<const SubjectChar>, 120 int); 121 122 static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 123 Vector<const SubjectChar>, 124 int) { 125 return -1; 126 } 127 128 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 129 Vector<const SubjectChar> subject, 130 int start_index); 131 132 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 133 Vector<const SubjectChar> subject, 134 int start_index); 135 136 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 137 Vector<const SubjectChar> subject, 138 int start_index); 139 140 static int BoyerMooreHorspoolSearch( 141 StringSearch<PatternChar, SubjectChar>* search, 142 Vector<const SubjectChar> subject, 143 int start_index); 144 145 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 146 Vector<const SubjectChar> subject, 147 int start_index); 148 149 void PopulateBoyerMooreHorspoolTable(); 150 151 void PopulateBoyerMooreTable(); 152 153 static inline int CharOccurrence(int* bad_char_occurrence, 154 SubjectChar char_code) { 155 if (sizeof(SubjectChar) == 1) { 156 return bad_char_occurrence[static_cast<int>(char_code)]; 157 } 158 if (sizeof(PatternChar) == 1) { 159 if (static_cast<unsigned int>(char_code) > String::kMaxAsciiCharCodeU) { 160 return -1; 161 } 162 return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 163 } 164 // Both pattern and subject are UC16. Reduce character to equivalence class. 165 int equiv_class = char_code % kUC16AlphabetSize; 166 return bad_char_occurrence[equiv_class]; 167 } 168 169 // The following tables are shared by all searches. 170 // TODO(lrn): Introduce a way for a pattern to keep its tables 171 // between searches (e.g., for an Atom RegExp). 172 173 // Store for the BoyerMoore(Horspool) bad char shift table. 174 // Return a table covering the last kBMMaxShift+1 positions of 175 // pattern. 176 int* bad_char_table() { 177 return isolate_->bad_char_shift_table(); 178 } 179 180 // Store for the BoyerMoore good suffix shift table. 181 int* good_suffix_shift_table() { 182 // Return biased pointer that maps the range [start_..pattern_.length() 183 // to the kGoodSuffixShiftTable array. 184 return isolate_->good_suffix_shift_table() - start_; 185 } 186 187 // Table used temporarily while building the BoyerMoore good suffix 188 // shift table. 189 int* suffix_table() { 190 // Return biased pointer that maps the range [start_..pattern_.length() 191 // to the kSuffixTable array. 192 return isolate_->suffix_table() - start_; 193 } 194 195 Isolate* isolate_; 196 // The pattern to search for. 197 Vector<const PatternChar> pattern_; 198 // Pointer to implementation of the search. 199 SearchFunction strategy_; 200 // Cache value of Max(0, pattern_length() - kBMMaxShift) 201 int start_; 202 }; 203 204 205 //--------------------------------------------------------------------- 206 // Single Character Pattern Search Strategy 207 //--------------------------------------------------------------------- 208 209 template <typename PatternChar, typename SubjectChar> 210 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 211 StringSearch<PatternChar, SubjectChar>* search, 212 Vector<const SubjectChar> subject, 213 int index) { 214 ASSERT_EQ(1, search->pattern_.length()); 215 PatternChar pattern_first_char = search->pattern_[0]; 216 int i = index; 217 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 218 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 219 memchr(subject.start() + i, 220 pattern_first_char, 221 subject.length() - i)); 222 if (pos == NULL) return -1; 223 return static_cast<int>(pos - subject.start()); 224 } else { 225 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 226 if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) { 227 return -1; 228 } 229 } 230 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 231 int n = subject.length(); 232 while (i < n) { 233 if (subject[i++] == search_char) return i - 1; 234 } 235 return -1; 236 } 237 } 238 239 //--------------------------------------------------------------------- 240 // Linear Search Strategy 241 //--------------------------------------------------------------------- 242 243 244 template <typename PatternChar, typename SubjectChar> 245 inline bool CharCompare(const PatternChar* pattern, 246 const SubjectChar* subject, 247 int length) { 248 ASSERT(length > 0); 249 int pos = 0; 250 do { 251 if (pattern[pos] != subject[pos]) { 252 return false; 253 } 254 pos++; 255 } while (pos < length); 256 return true; 257 } 258 259 260 // Simple linear search for short patterns. Never bails out. 261 template <typename PatternChar, typename SubjectChar> 262 int StringSearch<PatternChar, SubjectChar>::LinearSearch( 263 StringSearch<PatternChar, SubjectChar>* search, 264 Vector<const SubjectChar> subject, 265 int index) { 266 Vector<const PatternChar> pattern = search->pattern_; 267 ASSERT(pattern.length() > 1); 268 int pattern_length = pattern.length(); 269 PatternChar pattern_first_char = pattern[0]; 270 int i = index; 271 int n = subject.length() - pattern_length; 272 while (i <= n) { 273 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 274 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 275 memchr(subject.start() + i, 276 pattern_first_char, 277 n - i + 1)); 278 if (pos == NULL) return -1; 279 i = static_cast<int>(pos - subject.start()) + 1; 280 } else { 281 if (subject[i++] != pattern_first_char) continue; 282 } 283 // Loop extracted to separate function to allow using return to do 284 // a deeper break. 285 if (CharCompare(pattern.start() + 1, 286 subject.start() + i, 287 pattern_length - 1)) { 288 return i - 1; 289 } 290 } 291 return -1; 292 } 293 294 //--------------------------------------------------------------------- 295 // Boyer-Moore string search 296 //--------------------------------------------------------------------- 297 298 template <typename PatternChar, typename SubjectChar> 299 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 300 StringSearch<PatternChar, SubjectChar>* search, 301 Vector<const SubjectChar> subject, 302 int start_index) { 303 Vector<const PatternChar> pattern = search->pattern_; 304 int subject_length = subject.length(); 305 int pattern_length = pattern.length(); 306 // Only preprocess at most kBMMaxShift last characters of pattern. 307 int start = search->start_; 308 309 int* bad_char_occurence = search->bad_char_table(); 310 int* good_suffix_shift = search->good_suffix_shift_table(); 311 312 PatternChar last_char = pattern[pattern_length - 1]; 313 int index = start_index; 314 // Continue search from i. 315 while (index <= subject_length - pattern_length) { 316 int j = pattern_length - 1; 317 int c; 318 while (last_char != (c = subject[index + j])) { 319 int shift = 320 j - CharOccurrence(bad_char_occurence, c); 321 index += shift; 322 if (index > subject_length - pattern_length) { 323 return -1; 324 } 325 } 326 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 327 if (j < 0) { 328 return index; 329 } else if (j < start) { 330 // we have matched more than our tables allow us to be smart about. 331 // Fall back on BMH shift. 332 index += pattern_length - 1 333 - CharOccurrence(bad_char_occurence, 334 static_cast<SubjectChar>(last_char)); 335 } else { 336 int gs_shift = good_suffix_shift[j + 1]; 337 int bc_occ = 338 CharOccurrence(bad_char_occurence, c); 339 int shift = j - bc_occ; 340 if (gs_shift > shift) { 341 shift = gs_shift; 342 } 343 index += shift; 344 } 345 } 346 347 return -1; 348 } 349 350 351 template <typename PatternChar, typename SubjectChar> 352 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 353 int pattern_length = pattern_.length(); 354 const PatternChar* pattern = pattern_.start(); 355 // Only look at the last kBMMaxShift characters of pattern (from start_ 356 // to pattern_length). 357 int start = start_; 358 int length = pattern_length - start; 359 360 // Biased tables so that we can use pattern indices as table indices, 361 // even if we only cover the part of the pattern from offset start. 362 int* shift_table = good_suffix_shift_table(); 363 int* suffix_table = this->suffix_table(); 364 365 // Initialize table. 366 for (int i = start; i < pattern_length; i++) { 367 shift_table[i] = length; 368 } 369 shift_table[pattern_length] = 1; 370 suffix_table[pattern_length] = pattern_length + 1; 371 372 if (pattern_length <= start) { 373 return; 374 } 375 376 // Find suffixes. 377 PatternChar last_char = pattern[pattern_length - 1]; 378 int suffix = pattern_length + 1; 379 { 380 int i = pattern_length; 381 while (i > start) { 382 PatternChar c = pattern[i - 1]; 383 while (suffix <= pattern_length && c != pattern[suffix - 1]) { 384 if (shift_table[suffix] == length) { 385 shift_table[suffix] = suffix - i; 386 } 387 suffix = suffix_table[suffix]; 388 } 389 suffix_table[--i] = --suffix; 390 if (suffix == pattern_length) { 391 // No suffix to extend, so we check against last_char only. 392 while ((i > start) && (pattern[i - 1] != last_char)) { 393 if (shift_table[pattern_length] == length) { 394 shift_table[pattern_length] = pattern_length - i; 395 } 396 suffix_table[--i] = pattern_length; 397 } 398 if (i > start) { 399 suffix_table[--i] = --suffix; 400 } 401 } 402 } 403 } 404 // Build shift table using suffixes. 405 if (suffix < pattern_length) { 406 for (int i = start; i <= pattern_length; i++) { 407 if (shift_table[i] == length) { 408 shift_table[i] = suffix - start; 409 } 410 if (i == suffix) { 411 suffix = suffix_table[suffix]; 412 } 413 } 414 } 415 } 416 417 //--------------------------------------------------------------------- 418 // Boyer-Moore-Horspool string search. 419 //--------------------------------------------------------------------- 420 421 template <typename PatternChar, typename SubjectChar> 422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 423 StringSearch<PatternChar, SubjectChar>* search, 424 Vector<const SubjectChar> subject, 425 int start_index) { 426 Vector<const PatternChar> pattern = search->pattern_; 427 int subject_length = subject.length(); 428 int pattern_length = pattern.length(); 429 int* char_occurrences = search->bad_char_table(); 430 int badness = -pattern_length; 431 432 // How bad we are doing without a good-suffix table. 433 PatternChar last_char = pattern[pattern_length - 1]; 434 int last_char_shift = pattern_length - 1 - 435 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 436 // Perform search 437 int index = start_index; // No matches found prior to this index. 438 while (index <= subject_length - pattern_length) { 439 int j = pattern_length - 1; 440 int subject_char; 441 while (last_char != (subject_char = subject[index + j])) { 442 int bc_occ = CharOccurrence(char_occurrences, subject_char); 443 int shift = j - bc_occ; 444 index += shift; 445 badness += 1 - shift; // at most zero, so badness cannot increase. 446 if (index > subject_length - pattern_length) { 447 return -1; 448 } 449 } 450 j--; 451 while (j >= 0 && pattern[j] == (subject[index + j])) j--; 452 if (j < 0) { 453 return index; 454 } else { 455 index += last_char_shift; 456 // Badness increases by the number of characters we have 457 // checked, and decreases by the number of characters we 458 // can skip by shifting. It's a measure of how we are doing 459 // compared to reading each character exactly once. 460 badness += (pattern_length - j) - last_char_shift; 461 if (badness > 0) { 462 search->PopulateBoyerMooreTable(); 463 search->strategy_ = &BoyerMooreSearch; 464 return BoyerMooreSearch(search, subject, index); 465 } 466 } 467 } 468 return -1; 469 } 470 471 472 template <typename PatternChar, typename SubjectChar> 473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 474 int pattern_length = pattern_.length(); 475 476 int* bad_char_occurrence = bad_char_table(); 477 478 // Only preprocess at most kBMMaxShift last characters of pattern. 479 int start = start_; 480 // Run forwards to populate bad_char_table, so that *last* instance 481 // of character equivalence class is the one registered. 482 // Notice: Doesn't include the last character. 483 int table_size = AlphabetSize(); 484 if (start == 0) { // All patterns less than kBMMaxShift in length. 485 memset(bad_char_occurrence, 486 -1, 487 table_size * sizeof(*bad_char_occurrence)); 488 } else { 489 for (int i = 0; i < table_size; i++) { 490 bad_char_occurrence[i] = start - 1; 491 } 492 } 493 for (int i = start; i < pattern_length - 1; i++) { 494 PatternChar c = pattern_[i]; 495 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 496 bad_char_occurrence[bucket] = i; 497 } 498 } 499 500 //--------------------------------------------------------------------- 501 // Linear string search with bailout to BMH. 502 //--------------------------------------------------------------------- 503 504 // Simple linear search for short patterns, which bails out if the string 505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 506 template <typename PatternChar, typename SubjectChar> 507 int StringSearch<PatternChar, SubjectChar>::InitialSearch( 508 StringSearch<PatternChar, SubjectChar>* search, 509 Vector<const SubjectChar> subject, 510 int index) { 511 Vector<const PatternChar> pattern = search->pattern_; 512 int pattern_length = pattern.length(); 513 // Badness is a count of how much work we have done. When we have 514 // done enough work we decide it's probably worth switching to a better 515 // algorithm. 516 int badness = -10 - (pattern_length << 2); 517 518 // We know our pattern is at least 2 characters, we cache the first so 519 // the common case of the first character not matching is faster. 520 PatternChar pattern_first_char = pattern[0]; 521 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 522 badness++; 523 if (badness <= 0) { 524 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 525 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 526 memchr(subject.start() + i, 527 pattern_first_char, 528 n - i + 1)); 529 if (pos == NULL) { 530 return -1; 531 } 532 i = static_cast<int>(pos - subject.start()); 533 } else { 534 if (subject[i] != pattern_first_char) continue; 535 } 536 int j = 1; 537 do { 538 if (pattern[j] != subject[i + j]) { 539 break; 540 } 541 j++; 542 } while (j < pattern_length); 543 if (j == pattern_length) { 544 return i; 545 } 546 badness += j; 547 } else { 548 search->PopulateBoyerMooreHorspoolTable(); 549 search->strategy_ = &BoyerMooreHorspoolSearch; 550 return BoyerMooreHorspoolSearch(search, subject, i); 551 } 552 } 553 return -1; 554 } 555 556 557 // Perform a a single stand-alone search. 558 // If searching multiple times for the same pattern, a search 559 // object should be constructed once and the Search function then called 560 // for each search. 561 template <typename SubjectChar, typename PatternChar> 562 int SearchString(Isolate* isolate, 563 Vector<const SubjectChar> subject, 564 Vector<const PatternChar> pattern, 565 int start_index) { 566 StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 567 return search.Search(subject, start_index); 568 } 569 570 }} // namespace v8::internal 571 572 #endif // V8_STRING_SEARCH_H_ 573