Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #ifndef V8_STRING_SEARCH_H_
     29 #define V8_STRING_SEARCH_H_
     30 
     31 namespace v8 {
     32 namespace internal {
     33 
     34 
     35 //---------------------------------------------------------------------
     36 // String Search object.
     37 //---------------------------------------------------------------------
     38 
     39 // Class holding constants and methods that apply to all string search variants,
     40 // independently of subject and pattern char size.
     41 class StringSearchBase {
     42  protected:
     43   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
     44   // limit, we can fix the size of tables. For a needle longer than this limit,
     45   // search will not be optimal, since we only build tables for a suffix
     46   // of the string, but it is a safe approximation.
     47   static const int kBMMaxShift = Isolate::kBMMaxShift;
     48 
     49   // Reduce alphabet to this size.
     50   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
     51   // proportional to the input alphabet. We reduce the alphabet size by
     52   // equating input characters modulo a smaller alphabet size. This gives
     53   // a potentially less efficient searching, but is a safe approximation.
     54   // For needles using only characters in the same Unicode 256-code point page,
     55   // there is no search speed degradation.
     56   static const int kAsciiAlphabetSize = 128;
     57   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
     58 
     59   // Bad-char shift table stored in the state. It's length is the alphabet size.
     60   // For patterns below this length, the skip length of Boyer-Moore is too short
     61   // to compensate for the algorithmic overhead compared to simple brute force.
     62   static const int kBMMinPatternLength = 7;
     63 
     64   static inline bool IsAsciiString(Vector<const char>) {
     65     return true;
     66   }
     67 
     68   static inline bool IsAsciiString(Vector<const uc16> string) {
     69     return String::IsAscii(string.start(), string.length());
     70   }
     71 
     72   friend class Isolate;
     73 };
     74 
     75 
     76 template <typename PatternChar, typename SubjectChar>
     77 class StringSearch : private StringSearchBase {
     78  public:
     79   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
     80       : isolate_(isolate),
     81         pattern_(pattern),
     82         start_(Max(0, pattern.length() - kBMMaxShift)) {
     83     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
     84       if (!IsAsciiString(pattern_)) {
     85         strategy_ = &FailSearch;
     86         return;
     87       }
     88     }
     89     int pattern_length = pattern_.length();
     90     if (pattern_length < kBMMinPatternLength) {
     91       if (pattern_length == 1) {
     92         strategy_ = &SingleCharSearch;
     93         return;
     94       }
     95       strategy_ = &LinearSearch;
     96       return;
     97     }
     98     strategy_ = &InitialSearch;
     99   }
    100 
    101   int Search(Vector<const SubjectChar> subject, int index) {
    102     return strategy_(this, subject, index);
    103   }
    104 
    105   static inline int AlphabetSize() {
    106     if (sizeof(PatternChar) == 1) {
    107       // ASCII needle.
    108       return kAsciiAlphabetSize;
    109     } else {
    110       ASSERT(sizeof(PatternChar) == 2);
    111       // UC16 needle.
    112       return kUC16AlphabetSize;
    113     }
    114   }
    115 
    116  private:
    117   typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
    118       StringSearch<PatternChar, SubjectChar>*,
    119       Vector<const SubjectChar>,
    120       int);
    121 
    122   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
    123                         Vector<const SubjectChar>,
    124                         int) {
    125     return -1;
    126   }
    127 
    128   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
    129                               Vector<const SubjectChar> subject,
    130                               int start_index);
    131 
    132   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
    133                           Vector<const SubjectChar> subject,
    134                           int start_index);
    135 
    136   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
    137                            Vector<const SubjectChar> subject,
    138                            int start_index);
    139 
    140   static int BoyerMooreHorspoolSearch(
    141       StringSearch<PatternChar, SubjectChar>* search,
    142       Vector<const SubjectChar> subject,
    143       int start_index);
    144 
    145   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
    146                               Vector<const SubjectChar> subject,
    147                               int start_index);
    148 
    149   void PopulateBoyerMooreHorspoolTable();
    150 
    151   void PopulateBoyerMooreTable();
    152 
    153   static inline int CharOccurrence(int* bad_char_occurrence,
    154                                    SubjectChar char_code) {
    155     if (sizeof(SubjectChar) == 1) {
    156       return bad_char_occurrence[static_cast<int>(char_code)];
    157     }
    158     if (sizeof(PatternChar) == 1) {
    159       if (static_cast<unsigned int>(char_code) > String::kMaxAsciiCharCodeU) {
    160         return -1;
    161       }
    162       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
    163     }
    164     // Both pattern and subject are UC16. Reduce character to equivalence class.
    165     int equiv_class = char_code % kUC16AlphabetSize;
    166     return bad_char_occurrence[equiv_class];
    167   }
    168 
    169   // The following tables are shared by all searches.
    170   // TODO(lrn): Introduce a way for a pattern to keep its tables
    171   // between searches (e.g., for an Atom RegExp).
    172 
    173   // Store for the BoyerMoore(Horspool) bad char shift table.
    174   // Return a table covering the last kBMMaxShift+1 positions of
    175   // pattern.
    176   int* bad_char_table() {
    177     return isolate_->bad_char_shift_table();
    178   }
    179 
    180   // Store for the BoyerMoore good suffix shift table.
    181   int* good_suffix_shift_table() {
    182     // Return biased pointer that maps the range  [start_..pattern_.length()
    183     // to the kGoodSuffixShiftTable array.
    184     return isolate_->good_suffix_shift_table() - start_;
    185   }
    186 
    187   // Table used temporarily while building the BoyerMoore good suffix
    188   // shift table.
    189   int* suffix_table() {
    190     // Return biased pointer that maps the range  [start_..pattern_.length()
    191     // to the kSuffixTable array.
    192     return isolate_->suffix_table() - start_;
    193   }
    194 
    195   Isolate* isolate_;
    196   // The pattern to search for.
    197   Vector<const PatternChar> pattern_;
    198   // Pointer to implementation of the search.
    199   SearchFunction strategy_;
    200   // Cache value of Max(0, pattern_length() - kBMMaxShift)
    201   int start_;
    202 };
    203 
    204 
    205 //---------------------------------------------------------------------
    206 // Single Character Pattern Search Strategy
    207 //---------------------------------------------------------------------
    208 
    209 template <typename PatternChar, typename SubjectChar>
    210 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
    211     StringSearch<PatternChar, SubjectChar>* search,
    212     Vector<const SubjectChar> subject,
    213     int index) {
    214   ASSERT_EQ(1, search->pattern_.length());
    215   PatternChar pattern_first_char = search->pattern_[0];
    216   int i = index;
    217   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    218     const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    219         memchr(subject.start() + i,
    220                pattern_first_char,
    221                subject.length() - i));
    222     if (pos == NULL) return -1;
    223     return static_cast<int>(pos - subject.start());
    224   } else {
    225     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
    226       if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
    227         return -1;
    228       }
    229     }
    230     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
    231     int n = subject.length();
    232     while (i < n) {
    233       if (subject[i++] == search_char) return i - 1;
    234     }
    235     return -1;
    236   }
    237 }
    238 
    239 //---------------------------------------------------------------------
    240 // Linear Search Strategy
    241 //---------------------------------------------------------------------
    242 
    243 
    244 template <typename PatternChar, typename SubjectChar>
    245 inline bool CharCompare(const PatternChar* pattern,
    246                         const SubjectChar* subject,
    247                         int length) {
    248   ASSERT(length > 0);
    249   int pos = 0;
    250   do {
    251     if (pattern[pos] != subject[pos]) {
    252       return false;
    253     }
    254     pos++;
    255   } while (pos < length);
    256   return true;
    257 }
    258 
    259 
    260 // Simple linear search for short patterns. Never bails out.
    261 template <typename PatternChar, typename SubjectChar>
    262 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    263     StringSearch<PatternChar, SubjectChar>* search,
    264     Vector<const SubjectChar> subject,
    265     int index) {
    266   Vector<const PatternChar> pattern = search->pattern_;
    267   ASSERT(pattern.length() > 1);
    268   int pattern_length = pattern.length();
    269   PatternChar pattern_first_char = pattern[0];
    270   int i = index;
    271   int n = subject.length() - pattern_length;
    272   while (i <= n) {
    273     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    274       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    275           memchr(subject.start() + i,
    276                  pattern_first_char,
    277                  n - i + 1));
    278       if (pos == NULL) return -1;
    279       i = static_cast<int>(pos - subject.start()) + 1;
    280     } else {
    281       if (subject[i++] != pattern_first_char) continue;
    282     }
    283     // Loop extracted to separate function to allow using return to do
    284     // a deeper break.
    285     if (CharCompare(pattern.start() + 1,
    286                     subject.start() + i,
    287                     pattern_length - 1)) {
    288       return i - 1;
    289     }
    290   }
    291   return -1;
    292 }
    293 
    294 //---------------------------------------------------------------------
    295 // Boyer-Moore string search
    296 //---------------------------------------------------------------------
    297 
    298 template <typename PatternChar, typename SubjectChar>
    299 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
    300     StringSearch<PatternChar, SubjectChar>* search,
    301     Vector<const SubjectChar> subject,
    302     int start_index) {
    303   Vector<const PatternChar> pattern = search->pattern_;
    304   int subject_length = subject.length();
    305   int pattern_length = pattern.length();
    306   // Only preprocess at most kBMMaxShift last characters of pattern.
    307   int start = search->start_;
    308 
    309   int* bad_char_occurence = search->bad_char_table();
    310   int* good_suffix_shift = search->good_suffix_shift_table();
    311 
    312   PatternChar last_char = pattern[pattern_length - 1];
    313   int index = start_index;
    314   // Continue search from i.
    315   while (index <= subject_length - pattern_length) {
    316     int j = pattern_length - 1;
    317     int c;
    318     while (last_char != (c = subject[index + j])) {
    319       int shift =
    320           j - CharOccurrence(bad_char_occurence, c);
    321       index += shift;
    322       if (index > subject_length - pattern_length) {
    323         return -1;
    324       }
    325     }
    326     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
    327     if (j < 0) {
    328       return index;
    329     } else if (j < start) {
    330       // we have matched more than our tables allow us to be smart about.
    331       // Fall back on BMH shift.
    332       index += pattern_length - 1
    333           - CharOccurrence(bad_char_occurence,
    334                            static_cast<SubjectChar>(last_char));
    335     } else {
    336       int gs_shift = good_suffix_shift[j + 1];
    337       int bc_occ =
    338           CharOccurrence(bad_char_occurence, c);
    339       int shift = j - bc_occ;
    340       if (gs_shift > shift) {
    341         shift = gs_shift;
    342       }
    343       index += shift;
    344     }
    345   }
    346 
    347   return -1;
    348 }
    349 
    350 
    351 template <typename PatternChar, typename SubjectChar>
    352 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
    353   int pattern_length = pattern_.length();
    354   const PatternChar* pattern = pattern_.start();
    355   // Only look at the last kBMMaxShift characters of pattern (from start_
    356   // to pattern_length).
    357   int start = start_;
    358   int length = pattern_length - start;
    359 
    360   // Biased tables so that we can use pattern indices as table indices,
    361   // even if we only cover the part of the pattern from offset start.
    362   int* shift_table = good_suffix_shift_table();
    363   int* suffix_table = this->suffix_table();
    364 
    365   // Initialize table.
    366   for (int i = start; i < pattern_length; i++) {
    367     shift_table[i] = length;
    368   }
    369   shift_table[pattern_length] = 1;
    370   suffix_table[pattern_length] = pattern_length + 1;
    371 
    372   if (pattern_length <= start) {
    373     return;
    374   }
    375 
    376   // Find suffixes.
    377   PatternChar last_char = pattern[pattern_length - 1];
    378   int suffix = pattern_length + 1;
    379   {
    380     int i = pattern_length;
    381     while (i > start) {
    382       PatternChar c = pattern[i - 1];
    383       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
    384         if (shift_table[suffix] == length) {
    385           shift_table[suffix] = suffix - i;
    386         }
    387         suffix = suffix_table[suffix];
    388       }
    389       suffix_table[--i] = --suffix;
    390       if (suffix == pattern_length) {
    391         // No suffix to extend, so we check against last_char only.
    392         while ((i > start) && (pattern[i - 1] != last_char)) {
    393           if (shift_table[pattern_length] == length) {
    394             shift_table[pattern_length] = pattern_length - i;
    395           }
    396           suffix_table[--i] = pattern_length;
    397         }
    398         if (i > start) {
    399           suffix_table[--i] = --suffix;
    400         }
    401       }
    402     }
    403   }
    404   // Build shift table using suffixes.
    405   if (suffix < pattern_length) {
    406     for (int i = start; i <= pattern_length; i++) {
    407       if (shift_table[i] == length) {
    408         shift_table[i] = suffix - start;
    409       }
    410       if (i == suffix) {
    411         suffix = suffix_table[suffix];
    412       }
    413     }
    414   }
    415 }
    416 
    417 //---------------------------------------------------------------------
    418 // Boyer-Moore-Horspool string search.
    419 //---------------------------------------------------------------------
    420 
    421 template <typename PatternChar, typename SubjectChar>
    422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
    423     StringSearch<PatternChar, SubjectChar>* search,
    424     Vector<const SubjectChar> subject,
    425     int start_index) {
    426   Vector<const PatternChar> pattern = search->pattern_;
    427   int subject_length = subject.length();
    428   int pattern_length = pattern.length();
    429   int* char_occurrences = search->bad_char_table();
    430   int badness = -pattern_length;
    431 
    432   // How bad we are doing without a good-suffix table.
    433   PatternChar last_char = pattern[pattern_length - 1];
    434   int last_char_shift = pattern_length - 1 -
    435       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
    436   // Perform search
    437   int index = start_index;  // No matches found prior to this index.
    438   while (index <= subject_length - pattern_length) {
    439     int j = pattern_length - 1;
    440     int subject_char;
    441     while (last_char != (subject_char = subject[index + j])) {
    442       int bc_occ = CharOccurrence(char_occurrences, subject_char);
    443       int shift = j - bc_occ;
    444       index += shift;
    445       badness += 1 - shift;  // at most zero, so badness cannot increase.
    446       if (index > subject_length - pattern_length) {
    447         return -1;
    448       }
    449     }
    450     j--;
    451     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
    452     if (j < 0) {
    453       return index;
    454     } else {
    455       index += last_char_shift;
    456       // Badness increases by the number of characters we have
    457       // checked, and decreases by the number of characters we
    458       // can skip by shifting. It's a measure of how we are doing
    459       // compared to reading each character exactly once.
    460       badness += (pattern_length - j) - last_char_shift;
    461       if (badness > 0) {
    462         search->PopulateBoyerMooreTable();
    463         search->strategy_ = &BoyerMooreSearch;
    464         return BoyerMooreSearch(search, subject, index);
    465       }
    466     }
    467   }
    468   return -1;
    469 }
    470 
    471 
    472 template <typename PatternChar, typename SubjectChar>
    473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
    474   int pattern_length = pattern_.length();
    475 
    476   int* bad_char_occurrence = bad_char_table();
    477 
    478   // Only preprocess at most kBMMaxShift last characters of pattern.
    479   int start = start_;
    480   // Run forwards to populate bad_char_table, so that *last* instance
    481   // of character equivalence class is the one registered.
    482   // Notice: Doesn't include the last character.
    483   int table_size = AlphabetSize();
    484   if (start == 0) {  // All patterns less than kBMMaxShift in length.
    485     memset(bad_char_occurrence,
    486            -1,
    487            table_size * sizeof(*bad_char_occurrence));
    488   } else {
    489     for (int i = 0; i < table_size; i++) {
    490       bad_char_occurrence[i] = start - 1;
    491     }
    492   }
    493   for (int i = start; i < pattern_length - 1; i++) {
    494     PatternChar c = pattern_[i];
    495     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
    496     bad_char_occurrence[bucket] = i;
    497   }
    498 }
    499 
    500 //---------------------------------------------------------------------
    501 // Linear string search with bailout to BMH.
    502 //---------------------------------------------------------------------
    503 
    504 // Simple linear search for short patterns, which bails out if the string
    505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
    506 template <typename PatternChar, typename SubjectChar>
    507 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
    508     StringSearch<PatternChar, SubjectChar>* search,
    509     Vector<const SubjectChar> subject,
    510     int index) {
    511   Vector<const PatternChar> pattern = search->pattern_;
    512   int pattern_length = pattern.length();
    513   // Badness is a count of how much work we have done.  When we have
    514   // done enough work we decide it's probably worth switching to a better
    515   // algorithm.
    516   int badness = -10 - (pattern_length << 2);
    517 
    518   // We know our pattern is at least 2 characters, we cache the first so
    519   // the common case of the first character not matching is faster.
    520   PatternChar pattern_first_char = pattern[0];
    521   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
    522     badness++;
    523     if (badness <= 0) {
    524       if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    525         const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    526             memchr(subject.start() + i,
    527                    pattern_first_char,
    528                    n - i + 1));
    529         if (pos == NULL) {
    530           return -1;
    531         }
    532         i = static_cast<int>(pos - subject.start());
    533       } else {
    534         if (subject[i] != pattern_first_char) continue;
    535       }
    536       int j = 1;
    537       do {
    538         if (pattern[j] != subject[i + j]) {
    539           break;
    540         }
    541         j++;
    542       } while (j < pattern_length);
    543       if (j == pattern_length) {
    544         return i;
    545       }
    546       badness += j;
    547     } else {
    548       search->PopulateBoyerMooreHorspoolTable();
    549       search->strategy_ = &BoyerMooreHorspoolSearch;
    550       return BoyerMooreHorspoolSearch(search, subject, i);
    551     }
    552   }
    553   return -1;
    554 }
    555 
    556 
    557 // Perform a a single stand-alone search.
    558 // If searching multiple times for the same pattern, a search
    559 // object should be constructed once and the Search function then called
    560 // for each search.
    561 template <typename SubjectChar, typename PatternChar>
    562 int SearchString(Isolate* isolate,
    563                  Vector<const SubjectChar> subject,
    564                  Vector<const PatternChar> pattern,
    565                  int start_index) {
    566   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
    567   return search.Search(subject, start_index);
    568 }
    569 
    570 }}  // namespace v8::internal
    571 
    572 #endif  // V8_STRING_SEARCH_H_
    573