Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef V8_STRING_SEARCH_H_
      6 #define V8_STRING_SEARCH_H_
      7 
      8 #include "src/isolate.h"
      9 #include "src/vector.h"
     10 
     11 namespace v8 {
     12 namespace internal {
     13 
     14 
     15 //---------------------------------------------------------------------
     16 // String Search object.
     17 //---------------------------------------------------------------------
     18 
     19 // Class holding constants and methods that apply to all string search variants,
     20 // independently of subject and pattern char size.
     21 class StringSearchBase {
     22  protected:
     23   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
     24   // limit, we can fix the size of tables. For a needle longer than this limit,
     25   // search will not be optimal, since we only build tables for a suffix
     26   // of the string, but it is a safe approximation.
     27   static const int kBMMaxShift = Isolate::kBMMaxShift;
     28 
     29   // Reduce alphabet to this size.
     30   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
     31   // proportional to the input alphabet. We reduce the alphabet size by
     32   // equating input characters modulo a smaller alphabet size. This gives
     33   // a potentially less efficient searching, but is a safe approximation.
     34   // For needles using only characters in the same Unicode 256-code point page,
     35   // there is no search speed degradation.
     36   static const int kLatin1AlphabetSize = 256;
     37   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
     38 
     39   // Bad-char shift table stored in the state. It's length is the alphabet size.
     40   // For patterns below this length, the skip length of Boyer-Moore is too short
     41   // to compensate for the algorithmic overhead compared to simple brute force.
     42   static const int kBMMinPatternLength = 7;
     43 
     44   static inline bool IsOneByteString(Vector<const uint8_t> string) {
     45     return true;
     46   }
     47 
     48   static inline bool IsOneByteString(Vector<const uc16> string) {
     49     return String::IsOneByte(string.start(), string.length());
     50   }
     51 
     52   friend class Isolate;
     53 };
     54 
     55 
     56 template <typename PatternChar, typename SubjectChar>
     57 class StringSearch : private StringSearchBase {
     58  public:
     59   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
     60       : isolate_(isolate),
     61         pattern_(pattern),
     62         start_(Max(0, pattern.length() - kBMMaxShift)) {
     63     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
     64       if (!IsOneByteString(pattern_)) {
     65         strategy_ = &FailSearch;
     66         return;
     67       }
     68     }
     69     int pattern_length = pattern_.length();
     70     if (pattern_length < kBMMinPatternLength) {
     71       if (pattern_length == 1) {
     72         strategy_ = &SingleCharSearch;
     73         return;
     74       }
     75       strategy_ = &LinearSearch;
     76       return;
     77     }
     78     strategy_ = &InitialSearch;
     79   }
     80 
     81   int Search(Vector<const SubjectChar> subject, int index) {
     82     return strategy_(this, subject, index);
     83   }
     84 
     85   static inline int AlphabetSize() {
     86     if (sizeof(PatternChar) == 1) {
     87       // Latin1 needle.
     88       return kLatin1AlphabetSize;
     89     } else {
     90       DCHECK(sizeof(PatternChar) == 2);
     91       // UC16 needle.
     92       return kUC16AlphabetSize;
     93     }
     94   }
     95 
     96  private:
     97   typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
     98       StringSearch<PatternChar, SubjectChar>*,
     99       Vector<const SubjectChar>,
    100       int);
    101 
    102   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
    103                         Vector<const SubjectChar>,
    104                         int) {
    105     return -1;
    106   }
    107 
    108   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
    109                               Vector<const SubjectChar> subject,
    110                               int start_index);
    111 
    112   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
    113                           Vector<const SubjectChar> subject,
    114                           int start_index);
    115 
    116   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
    117                            Vector<const SubjectChar> subject,
    118                            int start_index);
    119 
    120   static int BoyerMooreHorspoolSearch(
    121       StringSearch<PatternChar, SubjectChar>* search,
    122       Vector<const SubjectChar> subject,
    123       int start_index);
    124 
    125   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
    126                               Vector<const SubjectChar> subject,
    127                               int start_index);
    128 
    129   void PopulateBoyerMooreHorspoolTable();
    130 
    131   void PopulateBoyerMooreTable();
    132 
    133   static inline bool exceedsOneByte(uint8_t c) {
    134     return false;
    135   }
    136 
    137   static inline bool exceedsOneByte(uint16_t c) {
    138     return c > String::kMaxOneByteCharCodeU;
    139   }
    140 
    141   static inline int CharOccurrence(int* bad_char_occurrence,
    142                                    SubjectChar char_code) {
    143     if (sizeof(SubjectChar) == 1) {
    144       return bad_char_occurrence[static_cast<int>(char_code)];
    145     }
    146     if (sizeof(PatternChar) == 1) {
    147       if (exceedsOneByte(char_code)) {
    148         return -1;
    149       }
    150       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
    151     }
    152     // Both pattern and subject are UC16. Reduce character to equivalence class.
    153     int equiv_class = char_code % kUC16AlphabetSize;
    154     return bad_char_occurrence[equiv_class];
    155   }
    156 
    157   // The following tables are shared by all searches.
    158   // TODO(lrn): Introduce a way for a pattern to keep its tables
    159   // between searches (e.g., for an Atom RegExp).
    160 
    161   // Store for the BoyerMoore(Horspool) bad char shift table.
    162   // Return a table covering the last kBMMaxShift+1 positions of
    163   // pattern.
    164   int* bad_char_table() {
    165     return isolate_->bad_char_shift_table();
    166   }
    167 
    168   // Store for the BoyerMoore good suffix shift table.
    169   int* good_suffix_shift_table() {
    170     // Return biased pointer that maps the range  [start_..pattern_.length()
    171     // to the kGoodSuffixShiftTable array.
    172     return isolate_->good_suffix_shift_table() - start_;
    173   }
    174 
    175   // Table used temporarily while building the BoyerMoore good suffix
    176   // shift table.
    177   int* suffix_table() {
    178     // Return biased pointer that maps the range  [start_..pattern_.length()
    179     // to the kSuffixTable array.
    180     return isolate_->suffix_table() - start_;
    181   }
    182 
    183   Isolate* isolate_;
    184   // The pattern to search for.
    185   Vector<const PatternChar> pattern_;
    186   // Pointer to implementation of the search.
    187   SearchFunction strategy_;
    188   // Cache value of Max(0, pattern_length() - kBMMaxShift)
    189   int start_;
    190 };
    191 
    192 
    193 template <typename T, typename U>
    194 inline T AlignDown(T value, U alignment) {
    195   return reinterpret_cast<T>(
    196       (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
    197 }
    198 
    199 
    200 inline uint8_t GetHighestValueByte(uc16 character) {
    201   return Max(static_cast<uint8_t>(character & 0xFF),
    202              static_cast<uint8_t>(character >> 8));
    203 }
    204 
    205 
    206 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
    207 
    208 
    209 template <typename PatternChar, typename SubjectChar>
    210 inline int FindFirstCharacter(Vector<const PatternChar> pattern,
    211                               Vector<const SubjectChar> subject, int index) {
    212   const PatternChar pattern_first_char = pattern[0];
    213   const int max_n = (subject.length() - pattern.length() + 1);
    214 
    215   const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
    216   const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
    217   int pos = index;
    218   do {
    219     DCHECK_GE(max_n - pos, 0);
    220     const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
    221         memchr(subject.start() + pos, search_byte,
    222                (max_n - pos) * sizeof(SubjectChar)));
    223     if (char_pos == NULL) return -1;
    224     char_pos = AlignDown(char_pos, sizeof(SubjectChar));
    225     pos = static_cast<int>(char_pos - subject.start());
    226     if (subject[pos] == search_char) return pos;
    227   } while (++pos < max_n);
    228 
    229   return -1;
    230 }
    231 
    232 
    233 //---------------------------------------------------------------------
    234 // Single Character Pattern Search Strategy
    235 //---------------------------------------------------------------------
    236 
    237 template <typename PatternChar, typename SubjectChar>
    238 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
    239     StringSearch<PatternChar, SubjectChar>* search,
    240     Vector<const SubjectChar> subject,
    241     int index) {
    242   DCHECK_EQ(1, search->pattern_.length());
    243   PatternChar pattern_first_char = search->pattern_[0];
    244   if (sizeof(PatternChar) > sizeof(SubjectChar)) {
    245     if (exceedsOneByte(pattern_first_char)) {
    246       return -1;
    247     }
    248   }
    249   return FindFirstCharacter(search->pattern_, subject, index);
    250 }
    251 
    252 //---------------------------------------------------------------------
    253 // Linear Search Strategy
    254 //---------------------------------------------------------------------
    255 
    256 
    257 template <typename PatternChar, typename SubjectChar>
    258 inline bool CharCompare(const PatternChar* pattern,
    259                         const SubjectChar* subject,
    260                         int length) {
    261   DCHECK(length > 0);
    262   int pos = 0;
    263   do {
    264     if (pattern[pos] != subject[pos]) {
    265       return false;
    266     }
    267     pos++;
    268   } while (pos < length);
    269   return true;
    270 }
    271 
    272 
    273 // Simple linear search for short patterns. Never bails out.
    274 template <typename PatternChar, typename SubjectChar>
    275 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    276     StringSearch<PatternChar, SubjectChar>* search,
    277     Vector<const SubjectChar> subject,
    278     int index) {
    279   Vector<const PatternChar> pattern = search->pattern_;
    280   DCHECK(pattern.length() > 1);
    281   int pattern_length = pattern.length();
    282   int i = index;
    283   int n = subject.length() - pattern_length;
    284   while (i <= n) {
    285     i = FindFirstCharacter(pattern, subject, i);
    286     if (i == -1) return -1;
    287     DCHECK_LE(i, n);
    288     i++;
    289     // Loop extracted to separate function to allow using return to do
    290     // a deeper break.
    291     if (CharCompare(pattern.start() + 1,
    292                     subject.start() + i,
    293                     pattern_length - 1)) {
    294       return i - 1;
    295     }
    296   }
    297   return -1;
    298 }
    299 
    300 //---------------------------------------------------------------------
    301 // Boyer-Moore string search
    302 //---------------------------------------------------------------------
    303 
    304 template <typename PatternChar, typename SubjectChar>
    305 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
    306     StringSearch<PatternChar, SubjectChar>* search,
    307     Vector<const SubjectChar> subject,
    308     int start_index) {
    309   Vector<const PatternChar> pattern = search->pattern_;
    310   int subject_length = subject.length();
    311   int pattern_length = pattern.length();
    312   // Only preprocess at most kBMMaxShift last characters of pattern.
    313   int start = search->start_;
    314 
    315   int* bad_char_occurence = search->bad_char_table();
    316   int* good_suffix_shift = search->good_suffix_shift_table();
    317 
    318   PatternChar last_char = pattern[pattern_length - 1];
    319   int index = start_index;
    320   // Continue search from i.
    321   while (index <= subject_length - pattern_length) {
    322     int j = pattern_length - 1;
    323     int c;
    324     while (last_char != (c = subject[index + j])) {
    325       int shift =
    326           j - CharOccurrence(bad_char_occurence, c);
    327       index += shift;
    328       if (index > subject_length - pattern_length) {
    329         return -1;
    330       }
    331     }
    332     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
    333     if (j < 0) {
    334       return index;
    335     } else if (j < start) {
    336       // we have matched more than our tables allow us to be smart about.
    337       // Fall back on BMH shift.
    338       index += pattern_length - 1
    339           - CharOccurrence(bad_char_occurence,
    340                            static_cast<SubjectChar>(last_char));
    341     } else {
    342       int gs_shift = good_suffix_shift[j + 1];
    343       int bc_occ =
    344           CharOccurrence(bad_char_occurence, c);
    345       int shift = j - bc_occ;
    346       if (gs_shift > shift) {
    347         shift = gs_shift;
    348       }
    349       index += shift;
    350     }
    351   }
    352 
    353   return -1;
    354 }
    355 
    356 
    357 template <typename PatternChar, typename SubjectChar>
    358 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
    359   int pattern_length = pattern_.length();
    360   const PatternChar* pattern = pattern_.start();
    361   // Only look at the last kBMMaxShift characters of pattern (from start_
    362   // to pattern_length).
    363   int start = start_;
    364   int length = pattern_length - start;
    365 
    366   // Biased tables so that we can use pattern indices as table indices,
    367   // even if we only cover the part of the pattern from offset start.
    368   int* shift_table = good_suffix_shift_table();
    369   int* suffix_table = this->suffix_table();
    370 
    371   // Initialize table.
    372   for (int i = start; i < pattern_length; i++) {
    373     shift_table[i] = length;
    374   }
    375   shift_table[pattern_length] = 1;
    376   suffix_table[pattern_length] = pattern_length + 1;
    377 
    378   if (pattern_length <= start) {
    379     return;
    380   }
    381 
    382   // Find suffixes.
    383   PatternChar last_char = pattern[pattern_length - 1];
    384   int suffix = pattern_length + 1;
    385   {
    386     int i = pattern_length;
    387     while (i > start) {
    388       PatternChar c = pattern[i - 1];
    389       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
    390         if (shift_table[suffix] == length) {
    391           shift_table[suffix] = suffix - i;
    392         }
    393         suffix = suffix_table[suffix];
    394       }
    395       suffix_table[--i] = --suffix;
    396       if (suffix == pattern_length) {
    397         // No suffix to extend, so we check against last_char only.
    398         while ((i > start) && (pattern[i - 1] != last_char)) {
    399           if (shift_table[pattern_length] == length) {
    400             shift_table[pattern_length] = pattern_length - i;
    401           }
    402           suffix_table[--i] = pattern_length;
    403         }
    404         if (i > start) {
    405           suffix_table[--i] = --suffix;
    406         }
    407       }
    408     }
    409   }
    410   // Build shift table using suffixes.
    411   if (suffix < pattern_length) {
    412     for (int i = start; i <= pattern_length; i++) {
    413       if (shift_table[i] == length) {
    414         shift_table[i] = suffix - start;
    415       }
    416       if (i == suffix) {
    417         suffix = suffix_table[suffix];
    418       }
    419     }
    420   }
    421 }
    422 
    423 //---------------------------------------------------------------------
    424 // Boyer-Moore-Horspool string search.
    425 //---------------------------------------------------------------------
    426 
    427 template <typename PatternChar, typename SubjectChar>
    428 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
    429     StringSearch<PatternChar, SubjectChar>* search,
    430     Vector<const SubjectChar> subject,
    431     int start_index) {
    432   Vector<const PatternChar> pattern = search->pattern_;
    433   int subject_length = subject.length();
    434   int pattern_length = pattern.length();
    435   int* char_occurrences = search->bad_char_table();
    436   int badness = -pattern_length;
    437 
    438   // How bad we are doing without a good-suffix table.
    439   PatternChar last_char = pattern[pattern_length - 1];
    440   int last_char_shift = pattern_length - 1 -
    441       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
    442   // Perform search
    443   int index = start_index;  // No matches found prior to this index.
    444   while (index <= subject_length - pattern_length) {
    445     int j = pattern_length - 1;
    446     int subject_char;
    447     while (last_char != (subject_char = subject[index + j])) {
    448       int bc_occ = CharOccurrence(char_occurrences, subject_char);
    449       int shift = j - bc_occ;
    450       index += shift;
    451       badness += 1 - shift;  // at most zero, so badness cannot increase.
    452       if (index > subject_length - pattern_length) {
    453         return -1;
    454       }
    455     }
    456     j--;
    457     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
    458     if (j < 0) {
    459       return index;
    460     } else {
    461       index += last_char_shift;
    462       // Badness increases by the number of characters we have
    463       // checked, and decreases by the number of characters we
    464       // can skip by shifting. It's a measure of how we are doing
    465       // compared to reading each character exactly once.
    466       badness += (pattern_length - j) - last_char_shift;
    467       if (badness > 0) {
    468         search->PopulateBoyerMooreTable();
    469         search->strategy_ = &BoyerMooreSearch;
    470         return BoyerMooreSearch(search, subject, index);
    471       }
    472     }
    473   }
    474   return -1;
    475 }
    476 
    477 
    478 template <typename PatternChar, typename SubjectChar>
    479 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
    480   int pattern_length = pattern_.length();
    481 
    482   int* bad_char_occurrence = bad_char_table();
    483 
    484   // Only preprocess at most kBMMaxShift last characters of pattern.
    485   int start = start_;
    486   // Run forwards to populate bad_char_table, so that *last* instance
    487   // of character equivalence class is the one registered.
    488   // Notice: Doesn't include the last character.
    489   int table_size = AlphabetSize();
    490   if (start == 0) {  // All patterns less than kBMMaxShift in length.
    491     memset(bad_char_occurrence,
    492            -1,
    493            table_size * sizeof(*bad_char_occurrence));
    494   } else {
    495     for (int i = 0; i < table_size; i++) {
    496       bad_char_occurrence[i] = start - 1;
    497     }
    498   }
    499   for (int i = start; i < pattern_length - 1; i++) {
    500     PatternChar c = pattern_[i];
    501     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
    502     bad_char_occurrence[bucket] = i;
    503   }
    504 }
    505 
    506 //---------------------------------------------------------------------
    507 // Linear string search with bailout to BMH.
    508 //---------------------------------------------------------------------
    509 
    510 // Simple linear search for short patterns, which bails out if the string
    511 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
    512 template <typename PatternChar, typename SubjectChar>
    513 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
    514     StringSearch<PatternChar, SubjectChar>* search,
    515     Vector<const SubjectChar> subject,
    516     int index) {
    517   Vector<const PatternChar> pattern = search->pattern_;
    518   int pattern_length = pattern.length();
    519   // Badness is a count of how much work we have done.  When we have
    520   // done enough work we decide it's probably worth switching to a better
    521   // algorithm.
    522   int badness = -10 - (pattern_length << 2);
    523 
    524   // We know our pattern is at least 2 characters, we cache the first so
    525   // the common case of the first character not matching is faster.
    526   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
    527     badness++;
    528     if (badness <= 0) {
    529       i = FindFirstCharacter(pattern, subject, i);
    530       if (i == -1) return -1;
    531       DCHECK_LE(i, n);
    532       int j = 1;
    533       do {
    534         if (pattern[j] != subject[i + j]) {
    535           break;
    536         }
    537         j++;
    538       } while (j < pattern_length);
    539       if (j == pattern_length) {
    540         return i;
    541       }
    542       badness += j;
    543     } else {
    544       search->PopulateBoyerMooreHorspoolTable();
    545       search->strategy_ = &BoyerMooreHorspoolSearch;
    546       return BoyerMooreHorspoolSearch(search, subject, i);
    547     }
    548   }
    549   return -1;
    550 }
    551 
    552 
    553 // Perform a a single stand-alone search.
    554 // If searching multiple times for the same pattern, a search
    555 // object should be constructed once and the Search function then called
    556 // for each search.
    557 template <typename SubjectChar, typename PatternChar>
    558 int SearchString(Isolate* isolate,
    559                  Vector<const SubjectChar> subject,
    560                  Vector<const PatternChar> pattern,
    561                  int start_index) {
    562   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
    563   return search.Search(subject, start_index);
    564 }
    565 
    566 }  // namespace internal
    567 }  // namespace v8
    568 
    569 #endif  // V8_STRING_SEARCH_H_
    570