Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_STRING_SEARCH_H_
      6 #define V8_STRING_SEARCH_H_
      7 
      8 namespace v8 {
      9 namespace internal {
     10 
     11 
     12 //---------------------------------------------------------------------
     13 // String Search object.
     14 //---------------------------------------------------------------------
     15 
     16 // Class holding constants and methods that apply to all string search variants,
     17 // independently of subject and pattern char size.
     18 class StringSearchBase {
     19  protected:
     20   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
     21   // limit, we can fix the size of tables. For a needle longer than this limit,
     22   // search will not be optimal, since we only build tables for a suffix
     23   // of the string, but it is a safe approximation.
     24   static const int kBMMaxShift = Isolate::kBMMaxShift;
     25 
     26   // Reduce alphabet to this size.
     27   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
     28   // proportional to the input alphabet. We reduce the alphabet size by
     29   // equating input characters modulo a smaller alphabet size. This gives
     30   // a potentially less efficient searching, but is a safe approximation.
     31   // For needles using only characters in the same Unicode 256-code point page,
     32   // there is no search speed degradation.
     33   static const int kAsciiAlphabetSize = 256;
     34   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
     35 
     36   // Bad-char shift table stored in the state. It's length is the alphabet size.
     37   // For patterns below this length, the skip length of Boyer-Moore is too short
     38   // to compensate for the algorithmic overhead compared to simple brute force.
     39   static const int kBMMinPatternLength = 7;
     40 
     41   static inline bool IsOneByteString(Vector<const uint8_t> string) {
     42     return true;
     43   }
     44 
     45   static inline bool IsOneByteString(Vector<const uc16> string) {
     46     return String::IsOneByte(string.start(), string.length());
     47   }
     48 
     49   friend class Isolate;
     50 };
     51 
     52 
     53 template <typename PatternChar, typename SubjectChar>
     54 class StringSearch : private StringSearchBase {
     55  public:
     56   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
     57       : isolate_(isolate),
     58         pattern_(pattern),
     59         start_(Max(0, pattern.length() - kBMMaxShift)) {
     60     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
     61       if (!IsOneByteString(pattern_)) {
     62         strategy_ = &FailSearch;
     63         return;
     64       }
     65     }
     66     int pattern_length = pattern_.length();
     67     if (pattern_length < kBMMinPatternLength) {
     68       if (pattern_length == 1) {
     69         strategy_ = &SingleCharSearch;
     70         return;
     71       }
     72       strategy_ = &LinearSearch;
     73       return;
     74     }
     75     strategy_ = &InitialSearch;
     76   }
     77 
     78   int Search(Vector<const SubjectChar> subject, int index) {
     79     return strategy_(this, subject, index);
     80   }
     81 
     82   static inline int AlphabetSize() {
     83     if (sizeof(PatternChar) == 1) {
     84       // ASCII needle.
     85       return kAsciiAlphabetSize;
     86     } else {
     87       ASSERT(sizeof(PatternChar) == 2);
     88       // UC16 needle.
     89       return kUC16AlphabetSize;
     90     }
     91   }
     92 
     93  private:
     94   typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
     95       StringSearch<PatternChar, SubjectChar>*,
     96       Vector<const SubjectChar>,
     97       int);
     98 
     99   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
    100                         Vector<const SubjectChar>,
    101                         int) {
    102     return -1;
    103   }
    104 
    105   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
    106                               Vector<const SubjectChar> subject,
    107                               int start_index);
    108 
    109   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
    110                           Vector<const SubjectChar> subject,
    111                           int start_index);
    112 
    113   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
    114                            Vector<const SubjectChar> subject,
    115                            int start_index);
    116 
    117   static int BoyerMooreHorspoolSearch(
    118       StringSearch<PatternChar, SubjectChar>* search,
    119       Vector<const SubjectChar> subject,
    120       int start_index);
    121 
    122   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
    123                               Vector<const SubjectChar> subject,
    124                               int start_index);
    125 
    126   void PopulateBoyerMooreHorspoolTable();
    127 
    128   void PopulateBoyerMooreTable();
    129 
    130   static inline bool exceedsOneByte(uint8_t c) {
    131     return false;
    132   }
    133 
    134   static inline bool exceedsOneByte(uint16_t c) {
    135     return c > String::kMaxOneByteCharCodeU;
    136   }
    137 
    138   static inline int CharOccurrence(int* bad_char_occurrence,
    139                                    SubjectChar char_code) {
    140     if (sizeof(SubjectChar) == 1) {
    141       return bad_char_occurrence[static_cast<int>(char_code)];
    142     }
    143     if (sizeof(PatternChar) == 1) {
    144       if (exceedsOneByte(char_code)) {
    145         return -1;
    146       }
    147       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
    148     }
    149     // Both pattern and subject are UC16. Reduce character to equivalence class.
    150     int equiv_class = char_code % kUC16AlphabetSize;
    151     return bad_char_occurrence[equiv_class];
    152   }
    153 
    154   // The following tables are shared by all searches.
    155   // TODO(lrn): Introduce a way for a pattern to keep its tables
    156   // between searches (e.g., for an Atom RegExp).
    157 
    158   // Store for the BoyerMoore(Horspool) bad char shift table.
    159   // Return a table covering the last kBMMaxShift+1 positions of
    160   // pattern.
    161   int* bad_char_table() {
    162     return isolate_->bad_char_shift_table();
    163   }
    164 
    165   // Store for the BoyerMoore good suffix shift table.
    166   int* good_suffix_shift_table() {
    167     // Return biased pointer that maps the range  [start_..pattern_.length()
    168     // to the kGoodSuffixShiftTable array.
    169     return isolate_->good_suffix_shift_table() - start_;
    170   }
    171 
    172   // Table used temporarily while building the BoyerMoore good suffix
    173   // shift table.
    174   int* suffix_table() {
    175     // Return biased pointer that maps the range  [start_..pattern_.length()
    176     // to the kSuffixTable array.
    177     return isolate_->suffix_table() - start_;
    178   }
    179 
    180   Isolate* isolate_;
    181   // The pattern to search for.
    182   Vector<const PatternChar> pattern_;
    183   // Pointer to implementation of the search.
    184   SearchFunction strategy_;
    185   // Cache value of Max(0, pattern_length() - kBMMaxShift)
    186   int start_;
    187 };
    188 
    189 
    190 //---------------------------------------------------------------------
    191 // Single Character Pattern Search Strategy
    192 //---------------------------------------------------------------------
    193 
    194 template <typename PatternChar, typename SubjectChar>
    195 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
    196     StringSearch<PatternChar, SubjectChar>* search,
    197     Vector<const SubjectChar> subject,
    198     int index) {
    199   ASSERT_EQ(1, search->pattern_.length());
    200   PatternChar pattern_first_char = search->pattern_[0];
    201   int i = index;
    202   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    203     const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    204         memchr(subject.start() + i,
    205                pattern_first_char,
    206                subject.length() - i));
    207     if (pos == NULL) return -1;
    208     return static_cast<int>(pos - subject.start());
    209   } else {
    210     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
    211       if (exceedsOneByte(pattern_first_char)) {
    212         return -1;
    213       }
    214     }
    215     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
    216     int n = subject.length();
    217     while (i < n) {
    218       if (subject[i++] == search_char) return i - 1;
    219     }
    220     return -1;
    221   }
    222 }
    223 
    224 //---------------------------------------------------------------------
    225 // Linear Search Strategy
    226 //---------------------------------------------------------------------
    227 
    228 
    229 template <typename PatternChar, typename SubjectChar>
    230 inline bool CharCompare(const PatternChar* pattern,
    231                         const SubjectChar* subject,
    232                         int length) {
    233   ASSERT(length > 0);
    234   int pos = 0;
    235   do {
    236     if (pattern[pos] != subject[pos]) {
    237       return false;
    238     }
    239     pos++;
    240   } while (pos < length);
    241   return true;
    242 }
    243 
    244 
    245 // Simple linear search for short patterns. Never bails out.
    246 template <typename PatternChar, typename SubjectChar>
    247 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    248     StringSearch<PatternChar, SubjectChar>* search,
    249     Vector<const SubjectChar> subject,
    250     int index) {
    251   Vector<const PatternChar> pattern = search->pattern_;
    252   ASSERT(pattern.length() > 1);
    253   int pattern_length = pattern.length();
    254   PatternChar pattern_first_char = pattern[0];
    255   int i = index;
    256   int n = subject.length() - pattern_length;
    257   while (i <= n) {
    258     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    259       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    260           memchr(subject.start() + i,
    261                  pattern_first_char,
    262                  n - i + 1));
    263       if (pos == NULL) return -1;
    264       i = static_cast<int>(pos - subject.start()) + 1;
    265     } else {
    266       if (subject[i++] != pattern_first_char) continue;
    267     }
    268     // Loop extracted to separate function to allow using return to do
    269     // a deeper break.
    270     if (CharCompare(pattern.start() + 1,
    271                     subject.start() + i,
    272                     pattern_length - 1)) {
    273       return i - 1;
    274     }
    275   }
    276   return -1;
    277 }
    278 
    279 //---------------------------------------------------------------------
    280 // Boyer-Moore string search
    281 //---------------------------------------------------------------------
    282 
    283 template <typename PatternChar, typename SubjectChar>
    284 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
    285     StringSearch<PatternChar, SubjectChar>* search,
    286     Vector<const SubjectChar> subject,
    287     int start_index) {
    288   Vector<const PatternChar> pattern = search->pattern_;
    289   int subject_length = subject.length();
    290   int pattern_length = pattern.length();
    291   // Only preprocess at most kBMMaxShift last characters of pattern.
    292   int start = search->start_;
    293 
    294   int* bad_char_occurence = search->bad_char_table();
    295   int* good_suffix_shift = search->good_suffix_shift_table();
    296 
    297   PatternChar last_char = pattern[pattern_length - 1];
    298   int index = start_index;
    299   // Continue search from i.
    300   while (index <= subject_length - pattern_length) {
    301     int j = pattern_length - 1;
    302     int c;
    303     while (last_char != (c = subject[index + j])) {
    304       int shift =
    305           j - CharOccurrence(bad_char_occurence, c);
    306       index += shift;
    307       if (index > subject_length - pattern_length) {
    308         return -1;
    309       }
    310     }
    311     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
    312     if (j < 0) {
    313       return index;
    314     } else if (j < start) {
    315       // we have matched more than our tables allow us to be smart about.
    316       // Fall back on BMH shift.
    317       index += pattern_length - 1
    318           - CharOccurrence(bad_char_occurence,
    319                            static_cast<SubjectChar>(last_char));
    320     } else {
    321       int gs_shift = good_suffix_shift[j + 1];
    322       int bc_occ =
    323           CharOccurrence(bad_char_occurence, c);
    324       int shift = j - bc_occ;
    325       if (gs_shift > shift) {
    326         shift = gs_shift;
    327       }
    328       index += shift;
    329     }
    330   }
    331 
    332   return -1;
    333 }
    334 
    335 
    336 template <typename PatternChar, typename SubjectChar>
    337 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
    338   int pattern_length = pattern_.length();
    339   const PatternChar* pattern = pattern_.start();
    340   // Only look at the last kBMMaxShift characters of pattern (from start_
    341   // to pattern_length).
    342   int start = start_;
    343   int length = pattern_length - start;
    344 
    345   // Biased tables so that we can use pattern indices as table indices,
    346   // even if we only cover the part of the pattern from offset start.
    347   int* shift_table = good_suffix_shift_table();
    348   int* suffix_table = this->suffix_table();
    349 
    350   // Initialize table.
    351   for (int i = start; i < pattern_length; i++) {
    352     shift_table[i] = length;
    353   }
    354   shift_table[pattern_length] = 1;
    355   suffix_table[pattern_length] = pattern_length + 1;
    356 
    357   if (pattern_length <= start) {
    358     return;
    359   }
    360 
    361   // Find suffixes.
    362   PatternChar last_char = pattern[pattern_length - 1];
    363   int suffix = pattern_length + 1;
    364   {
    365     int i = pattern_length;
    366     while (i > start) {
    367       PatternChar c = pattern[i - 1];
    368       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
    369         if (shift_table[suffix] == length) {
    370           shift_table[suffix] = suffix - i;
    371         }
    372         suffix = suffix_table[suffix];
    373       }
    374       suffix_table[--i] = --suffix;
    375       if (suffix == pattern_length) {
    376         // No suffix to extend, so we check against last_char only.
    377         while ((i > start) && (pattern[i - 1] != last_char)) {
    378           if (shift_table[pattern_length] == length) {
    379             shift_table[pattern_length] = pattern_length - i;
    380           }
    381           suffix_table[--i] = pattern_length;
    382         }
    383         if (i > start) {
    384           suffix_table[--i] = --suffix;
    385         }
    386       }
    387     }
    388   }
    389   // Build shift table using suffixes.
    390   if (suffix < pattern_length) {
    391     for (int i = start; i <= pattern_length; i++) {
    392       if (shift_table[i] == length) {
    393         shift_table[i] = suffix - start;
    394       }
    395       if (i == suffix) {
    396         suffix = suffix_table[suffix];
    397       }
    398     }
    399   }
    400 }
    401 
    402 //---------------------------------------------------------------------
    403 // Boyer-Moore-Horspool string search.
    404 //---------------------------------------------------------------------
    405 
    406 template <typename PatternChar, typename SubjectChar>
    407 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
    408     StringSearch<PatternChar, SubjectChar>* search,
    409     Vector<const SubjectChar> subject,
    410     int start_index) {
    411   Vector<const PatternChar> pattern = search->pattern_;
    412   int subject_length = subject.length();
    413   int pattern_length = pattern.length();
    414   int* char_occurrences = search->bad_char_table();
    415   int badness = -pattern_length;
    416 
    417   // How bad we are doing without a good-suffix table.
    418   PatternChar last_char = pattern[pattern_length - 1];
    419   int last_char_shift = pattern_length - 1 -
    420       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
    421   // Perform search
    422   int index = start_index;  // No matches found prior to this index.
    423   while (index <= subject_length - pattern_length) {
    424     int j = pattern_length - 1;
    425     int subject_char;
    426     while (last_char != (subject_char = subject[index + j])) {
    427       int bc_occ = CharOccurrence(char_occurrences, subject_char);
    428       int shift = j - bc_occ;
    429       index += shift;
    430       badness += 1 - shift;  // at most zero, so badness cannot increase.
    431       if (index > subject_length - pattern_length) {
    432         return -1;
    433       }
    434     }
    435     j--;
    436     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
    437     if (j < 0) {
    438       return index;
    439     } else {
    440       index += last_char_shift;
    441       // Badness increases by the number of characters we have
    442       // checked, and decreases by the number of characters we
    443       // can skip by shifting. It's a measure of how we are doing
    444       // compared to reading each character exactly once.
    445       badness += (pattern_length - j) - last_char_shift;
    446       if (badness > 0) {
    447         search->PopulateBoyerMooreTable();
    448         search->strategy_ = &BoyerMooreSearch;
    449         return BoyerMooreSearch(search, subject, index);
    450       }
    451     }
    452   }
    453   return -1;
    454 }
    455 
    456 
    457 template <typename PatternChar, typename SubjectChar>
    458 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
    459   int pattern_length = pattern_.length();
    460 
    461   int* bad_char_occurrence = bad_char_table();
    462 
    463   // Only preprocess at most kBMMaxShift last characters of pattern.
    464   int start = start_;
    465   // Run forwards to populate bad_char_table, so that *last* instance
    466   // of character equivalence class is the one registered.
    467   // Notice: Doesn't include the last character.
    468   int table_size = AlphabetSize();
    469   if (start == 0) {  // All patterns less than kBMMaxShift in length.
    470     memset(bad_char_occurrence,
    471            -1,
    472            table_size * sizeof(*bad_char_occurrence));
    473   } else {
    474     for (int i = 0; i < table_size; i++) {
    475       bad_char_occurrence[i] = start - 1;
    476     }
    477   }
    478   for (int i = start; i < pattern_length - 1; i++) {
    479     PatternChar c = pattern_[i];
    480     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
    481     bad_char_occurrence[bucket] = i;
    482   }
    483 }
    484 
    485 //---------------------------------------------------------------------
    486 // Linear string search with bailout to BMH.
    487 //---------------------------------------------------------------------
    488 
    489 // Simple linear search for short patterns, which bails out if the string
    490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
    491 template <typename PatternChar, typename SubjectChar>
    492 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
    493     StringSearch<PatternChar, SubjectChar>* search,
    494     Vector<const SubjectChar> subject,
    495     int index) {
    496   Vector<const PatternChar> pattern = search->pattern_;
    497   int pattern_length = pattern.length();
    498   // Badness is a count of how much work we have done.  When we have
    499   // done enough work we decide it's probably worth switching to a better
    500   // algorithm.
    501   int badness = -10 - (pattern_length << 2);
    502 
    503   // We know our pattern is at least 2 characters, we cache the first so
    504   // the common case of the first character not matching is faster.
    505   PatternChar pattern_first_char = pattern[0];
    506   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
    507     badness++;
    508     if (badness <= 0) {
    509       if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    510         const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    511             memchr(subject.start() + i,
    512                    pattern_first_char,
    513                    n - i + 1));
    514         if (pos == NULL) {
    515           return -1;
    516         }
    517         i = static_cast<int>(pos - subject.start());
    518       } else {
    519         if (subject[i] != pattern_first_char) continue;
    520       }
    521       int j = 1;
    522       do {
    523         if (pattern[j] != subject[i + j]) {
    524           break;
    525         }
    526         j++;
    527       } while (j < pattern_length);
    528       if (j == pattern_length) {
    529         return i;
    530       }
    531       badness += j;
    532     } else {
    533       search->PopulateBoyerMooreHorspoolTable();
    534       search->strategy_ = &BoyerMooreHorspoolSearch;
    535       return BoyerMooreHorspoolSearch(search, subject, i);
    536     }
    537   }
    538   return -1;
    539 }
    540 
    541 
    542 // Perform a a single stand-alone search.
    543 // If searching multiple times for the same pattern, a search
    544 // object should be constructed once and the Search function then called
    545 // for each search.
    546 template <typename SubjectChar, typename PatternChar>
    547 int SearchString(Isolate* isolate,
    548                  Vector<const SubjectChar> subject,
    549                  Vector<const PatternChar> pattern,
    550                  int start_index) {
    551   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
    552   return search.Search(subject, start_index);
    553 }
    554 
    555 }}  // namespace v8::internal
    556 
    557 #endif  // V8_STRING_SEARCH_H_
    558