Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Redistribution and use in source and binary forms, with or without
      3 // modification, are permitted provided that the following conditions are
      4 // met:
      5 //
      6 //     * Redistributions of source code must retain the above copyright
      7 //       notice, this list of conditions and the following disclaimer.
      8 //     * Redistributions in binary form must reproduce the above
      9 //       copyright notice, this list of conditions and the following
     10 //       disclaimer in the documentation and/or other materials provided
     11 //       with the distribution.
     12 //     * Neither the name of Google Inc. nor the names of its
     13 //       contributors may be used to endorse or promote products derived
     14 //       from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 #ifndef V8_STRING_SEARCH_H_
     29 #define V8_STRING_SEARCH_H_
     30 
     31 namespace v8 {
     32 namespace internal {
     33 
     34 
     35 //---------------------------------------------------------------------
     36 // String Search object.
     37 //---------------------------------------------------------------------
     38 
     39 // Class holding constants and methods that apply to all string search variants,
     40 // independently of subject and pattern char size.
     41 class StringSearchBase {
     42  protected:
     43   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
     44   // limit, we can fix the size of tables. For a needle longer than this limit,
     45   // search will not be optimal, since we only build tables for a suffix
     46   // of the string, but it is a safe approximation.
     47   static const int kBMMaxShift = Isolate::kBMMaxShift;
     48 
     49   // Reduce alphabet to this size.
     50   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
     51   // proportional to the input alphabet. We reduce the alphabet size by
     52   // equating input characters modulo a smaller alphabet size. This gives
     53   // a potentially less efficient searching, but is a safe approximation.
     54   // For needles using only characters in the same Unicode 256-code point page,
     55   // there is no search speed degradation.
     56   static const int kAsciiAlphabetSize = 256;
     57   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
     58 
     59   // Bad-char shift table stored in the state. It's length is the alphabet size.
     60   // For patterns below this length, the skip length of Boyer-Moore is too short
     61   // to compensate for the algorithmic overhead compared to simple brute force.
     62   static const int kBMMinPatternLength = 7;
     63 
     64   static inline bool IsOneByteString(Vector<const uint8_t> string) {
     65     return true;
     66   }
     67 
     68   static inline bool IsOneByteString(Vector<const uc16> string) {
     69     return String::IsOneByte(string.start(), string.length());
     70   }
     71 
     72   friend class Isolate;
     73 };
     74 
     75 
     76 template <typename PatternChar, typename SubjectChar>
     77 class StringSearch : private StringSearchBase {
     78  public:
     79   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
     80       : isolate_(isolate),
     81         pattern_(pattern),
     82         start_(Max(0, pattern.length() - kBMMaxShift)) {
     83     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
     84       if (!IsOneByteString(pattern_)) {
     85         strategy_ = &FailSearch;
     86         return;
     87       }
     88     }
     89     int pattern_length = pattern_.length();
     90     if (pattern_length < kBMMinPatternLength) {
     91       if (pattern_length == 1) {
     92         strategy_ = &SingleCharSearch;
     93         return;
     94       }
     95       strategy_ = &LinearSearch;
     96       return;
     97     }
     98     strategy_ = &InitialSearch;
     99   }
    100 
    101   int Search(Vector<const SubjectChar> subject, int index) {
    102     return strategy_(this, subject, index);
    103   }
    104 
    105   static inline int AlphabetSize() {
    106     if (sizeof(PatternChar) == 1) {
    107       // ASCII needle.
    108       return kAsciiAlphabetSize;
    109     } else {
    110       ASSERT(sizeof(PatternChar) == 2);
    111       // UC16 needle.
    112       return kUC16AlphabetSize;
    113     }
    114   }
    115 
    116  private:
    117   typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
    118       StringSearch<PatternChar, SubjectChar>*,
    119       Vector<const SubjectChar>,
    120       int);
    121 
    122   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
    123                         Vector<const SubjectChar>,
    124                         int) {
    125     return -1;
    126   }
    127 
    128   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
    129                               Vector<const SubjectChar> subject,
    130                               int start_index);
    131 
    132   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
    133                           Vector<const SubjectChar> subject,
    134                           int start_index);
    135 
    136   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
    137                            Vector<const SubjectChar> subject,
    138                            int start_index);
    139 
    140   static int BoyerMooreHorspoolSearch(
    141       StringSearch<PatternChar, SubjectChar>* search,
    142       Vector<const SubjectChar> subject,
    143       int start_index);
    144 
    145   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
    146                               Vector<const SubjectChar> subject,
    147                               int start_index);
    148 
    149   void PopulateBoyerMooreHorspoolTable();
    150 
    151   void PopulateBoyerMooreTable();
    152 
    153   static inline bool exceedsOneByte(uint8_t c) {
    154     return false;
    155   }
    156 
    157   static inline bool exceedsOneByte(uint16_t c) {
    158     return c > String::kMaxOneByteCharCodeU;
    159   }
    160 
    161   static inline int CharOccurrence(int* bad_char_occurrence,
    162                                    SubjectChar char_code) {
    163     if (sizeof(SubjectChar) == 1) {
    164       return bad_char_occurrence[static_cast<int>(char_code)];
    165     }
    166     if (sizeof(PatternChar) == 1) {
    167       if (exceedsOneByte(char_code)) {
    168         return -1;
    169       }
    170       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
    171     }
    172     // Both pattern and subject are UC16. Reduce character to equivalence class.
    173     int equiv_class = char_code % kUC16AlphabetSize;
    174     return bad_char_occurrence[equiv_class];
    175   }
    176 
    177   // The following tables are shared by all searches.
    178   // TODO(lrn): Introduce a way for a pattern to keep its tables
    179   // between searches (e.g., for an Atom RegExp).
    180 
    181   // Store for the BoyerMoore(Horspool) bad char shift table.
    182   // Return a table covering the last kBMMaxShift+1 positions of
    183   // pattern.
    184   int* bad_char_table() {
    185     return isolate_->bad_char_shift_table();
    186   }
    187 
    188   // Store for the BoyerMoore good suffix shift table.
    189   int* good_suffix_shift_table() {
    190     // Return biased pointer that maps the range  [start_..pattern_.length()
    191     // to the kGoodSuffixShiftTable array.
    192     return isolate_->good_suffix_shift_table() - start_;
    193   }
    194 
    195   // Table used temporarily while building the BoyerMoore good suffix
    196   // shift table.
    197   int* suffix_table() {
    198     // Return biased pointer that maps the range  [start_..pattern_.length()
    199     // to the kSuffixTable array.
    200     return isolate_->suffix_table() - start_;
    201   }
    202 
    203   Isolate* isolate_;
    204   // The pattern to search for.
    205   Vector<const PatternChar> pattern_;
    206   // Pointer to implementation of the search.
    207   SearchFunction strategy_;
    208   // Cache value of Max(0, pattern_length() - kBMMaxShift)
    209   int start_;
    210 };
    211 
    212 
    213 //---------------------------------------------------------------------
    214 // Single Character Pattern Search Strategy
    215 //---------------------------------------------------------------------
    216 
    217 template <typename PatternChar, typename SubjectChar>
    218 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
    219     StringSearch<PatternChar, SubjectChar>* search,
    220     Vector<const SubjectChar> subject,
    221     int index) {
    222   ASSERT_EQ(1, search->pattern_.length());
    223   PatternChar pattern_first_char = search->pattern_[0];
    224   int i = index;
    225   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    226     const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    227         memchr(subject.start() + i,
    228                pattern_first_char,
    229                subject.length() - i));
    230     if (pos == NULL) return -1;
    231     return static_cast<int>(pos - subject.start());
    232   } else {
    233     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
    234       if (exceedsOneByte(pattern_first_char)) {
    235         return -1;
    236       }
    237     }
    238     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
    239     int n = subject.length();
    240     while (i < n) {
    241       if (subject[i++] == search_char) return i - 1;
    242     }
    243     return -1;
    244   }
    245 }
    246 
    247 //---------------------------------------------------------------------
    248 // Linear Search Strategy
    249 //---------------------------------------------------------------------
    250 
    251 
    252 template <typename PatternChar, typename SubjectChar>
    253 inline bool CharCompare(const PatternChar* pattern,
    254                         const SubjectChar* subject,
    255                         int length) {
    256   ASSERT(length > 0);
    257   int pos = 0;
    258   do {
    259     if (pattern[pos] != subject[pos]) {
    260       return false;
    261     }
    262     pos++;
    263   } while (pos < length);
    264   return true;
    265 }
    266 
    267 
    268 // Simple linear search for short patterns. Never bails out.
    269 template <typename PatternChar, typename SubjectChar>
    270 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    271     StringSearch<PatternChar, SubjectChar>* search,
    272     Vector<const SubjectChar> subject,
    273     int index) {
    274   Vector<const PatternChar> pattern = search->pattern_;
    275   ASSERT(pattern.length() > 1);
    276   int pattern_length = pattern.length();
    277   PatternChar pattern_first_char = pattern[0];
    278   int i = index;
    279   int n = subject.length() - pattern_length;
    280   while (i <= n) {
    281     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    282       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    283           memchr(subject.start() + i,
    284                  pattern_first_char,
    285                  n - i + 1));
    286       if (pos == NULL) return -1;
    287       i = static_cast<int>(pos - subject.start()) + 1;
    288     } else {
    289       if (subject[i++] != pattern_first_char) continue;
    290     }
    291     // Loop extracted to separate function to allow using return to do
    292     // a deeper break.
    293     if (CharCompare(pattern.start() + 1,
    294                     subject.start() + i,
    295                     pattern_length - 1)) {
    296       return i - 1;
    297     }
    298   }
    299   return -1;
    300 }
    301 
    302 //---------------------------------------------------------------------
    303 // Boyer-Moore string search
    304 //---------------------------------------------------------------------
    305 
    306 template <typename PatternChar, typename SubjectChar>
    307 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
    308     StringSearch<PatternChar, SubjectChar>* search,
    309     Vector<const SubjectChar> subject,
    310     int start_index) {
    311   Vector<const PatternChar> pattern = search->pattern_;
    312   int subject_length = subject.length();
    313   int pattern_length = pattern.length();
    314   // Only preprocess at most kBMMaxShift last characters of pattern.
    315   int start = search->start_;
    316 
    317   int* bad_char_occurence = search->bad_char_table();
    318   int* good_suffix_shift = search->good_suffix_shift_table();
    319 
    320   PatternChar last_char = pattern[pattern_length - 1];
    321   int index = start_index;
    322   // Continue search from i.
    323   while (index <= subject_length - pattern_length) {
    324     int j = pattern_length - 1;
    325     int c;
    326     while (last_char != (c = subject[index + j])) {
    327       int shift =
    328           j - CharOccurrence(bad_char_occurence, c);
    329       index += shift;
    330       if (index > subject_length - pattern_length) {
    331         return -1;
    332       }
    333     }
    334     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
    335     if (j < 0) {
    336       return index;
    337     } else if (j < start) {
    338       // we have matched more than our tables allow us to be smart about.
    339       // Fall back on BMH shift.
    340       index += pattern_length - 1
    341           - CharOccurrence(bad_char_occurence,
    342                            static_cast<SubjectChar>(last_char));
    343     } else {
    344       int gs_shift = good_suffix_shift[j + 1];
    345       int bc_occ =
    346           CharOccurrence(bad_char_occurence, c);
    347       int shift = j - bc_occ;
    348       if (gs_shift > shift) {
    349         shift = gs_shift;
    350       }
    351       index += shift;
    352     }
    353   }
    354 
    355   return -1;
    356 }
    357 
    358 
    359 template <typename PatternChar, typename SubjectChar>
    360 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
    361   int pattern_length = pattern_.length();
    362   const PatternChar* pattern = pattern_.start();
    363   // Only look at the last kBMMaxShift characters of pattern (from start_
    364   // to pattern_length).
    365   int start = start_;
    366   int length = pattern_length - start;
    367 
    368   // Biased tables so that we can use pattern indices as table indices,
    369   // even if we only cover the part of the pattern from offset start.
    370   int* shift_table = good_suffix_shift_table();
    371   int* suffix_table = this->suffix_table();
    372 
    373   // Initialize table.
    374   for (int i = start; i < pattern_length; i++) {
    375     shift_table[i] = length;
    376   }
    377   shift_table[pattern_length] = 1;
    378   suffix_table[pattern_length] = pattern_length + 1;
    379 
    380   if (pattern_length <= start) {
    381     return;
    382   }
    383 
    384   // Find suffixes.
    385   PatternChar last_char = pattern[pattern_length - 1];
    386   int suffix = pattern_length + 1;
    387   {
    388     int i = pattern_length;
    389     while (i > start) {
    390       PatternChar c = pattern[i - 1];
    391       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
    392         if (shift_table[suffix] == length) {
    393           shift_table[suffix] = suffix - i;
    394         }
    395         suffix = suffix_table[suffix];
    396       }
    397       suffix_table[--i] = --suffix;
    398       if (suffix == pattern_length) {
    399         // No suffix to extend, so we check against last_char only.
    400         while ((i > start) && (pattern[i - 1] != last_char)) {
    401           if (shift_table[pattern_length] == length) {
    402             shift_table[pattern_length] = pattern_length - i;
    403           }
    404           suffix_table[--i] = pattern_length;
    405         }
    406         if (i > start) {
    407           suffix_table[--i] = --suffix;
    408         }
    409       }
    410     }
    411   }
    412   // Build shift table using suffixes.
    413   if (suffix < pattern_length) {
    414     for (int i = start; i <= pattern_length; i++) {
    415       if (shift_table[i] == length) {
    416         shift_table[i] = suffix - start;
    417       }
    418       if (i == suffix) {
    419         suffix = suffix_table[suffix];
    420       }
    421     }
    422   }
    423 }
    424 
    425 //---------------------------------------------------------------------
    426 // Boyer-Moore-Horspool string search.
    427 //---------------------------------------------------------------------
    428 
    429 template <typename PatternChar, typename SubjectChar>
    430 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
    431     StringSearch<PatternChar, SubjectChar>* search,
    432     Vector<const SubjectChar> subject,
    433     int start_index) {
    434   Vector<const PatternChar> pattern = search->pattern_;
    435   int subject_length = subject.length();
    436   int pattern_length = pattern.length();
    437   int* char_occurrences = search->bad_char_table();
    438   int badness = -pattern_length;
    439 
    440   // How bad we are doing without a good-suffix table.
    441   PatternChar last_char = pattern[pattern_length - 1];
    442   int last_char_shift = pattern_length - 1 -
    443       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
    444   // Perform search
    445   int index = start_index;  // No matches found prior to this index.
    446   while (index <= subject_length - pattern_length) {
    447     int j = pattern_length - 1;
    448     int subject_char;
    449     while (last_char != (subject_char = subject[index + j])) {
    450       int bc_occ = CharOccurrence(char_occurrences, subject_char);
    451       int shift = j - bc_occ;
    452       index += shift;
    453       badness += 1 - shift;  // at most zero, so badness cannot increase.
    454       if (index > subject_length - pattern_length) {
    455         return -1;
    456       }
    457     }
    458     j--;
    459     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
    460     if (j < 0) {
    461       return index;
    462     } else {
    463       index += last_char_shift;
    464       // Badness increases by the number of characters we have
    465       // checked, and decreases by the number of characters we
    466       // can skip by shifting. It's a measure of how we are doing
    467       // compared to reading each character exactly once.
    468       badness += (pattern_length - j) - last_char_shift;
    469       if (badness > 0) {
    470         search->PopulateBoyerMooreTable();
    471         search->strategy_ = &BoyerMooreSearch;
    472         return BoyerMooreSearch(search, subject, index);
    473       }
    474     }
    475   }
    476   return -1;
    477 }
    478 
    479 
    480 template <typename PatternChar, typename SubjectChar>
    481 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
    482   int pattern_length = pattern_.length();
    483 
    484   int* bad_char_occurrence = bad_char_table();
    485 
    486   // Only preprocess at most kBMMaxShift last characters of pattern.
    487   int start = start_;
    488   // Run forwards to populate bad_char_table, so that *last* instance
    489   // of character equivalence class is the one registered.
    490   // Notice: Doesn't include the last character.
    491   int table_size = AlphabetSize();
    492   if (start == 0) {  // All patterns less than kBMMaxShift in length.
    493     memset(bad_char_occurrence,
    494            -1,
    495            table_size * sizeof(*bad_char_occurrence));
    496   } else {
    497     for (int i = 0; i < table_size; i++) {
    498       bad_char_occurrence[i] = start - 1;
    499     }
    500   }
    501   for (int i = start; i < pattern_length - 1; i++) {
    502     PatternChar c = pattern_[i];
    503     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
    504     bad_char_occurrence[bucket] = i;
    505   }
    506 }
    507 
    508 //---------------------------------------------------------------------
    509 // Linear string search with bailout to BMH.
    510 //---------------------------------------------------------------------
    511 
    512 // Simple linear search for short patterns, which bails out if the string
    513 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
    514 template <typename PatternChar, typename SubjectChar>
    515 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
    516     StringSearch<PatternChar, SubjectChar>* search,
    517     Vector<const SubjectChar> subject,
    518     int index) {
    519   Vector<const PatternChar> pattern = search->pattern_;
    520   int pattern_length = pattern.length();
    521   // Badness is a count of how much work we have done.  When we have
    522   // done enough work we decide it's probably worth switching to a better
    523   // algorithm.
    524   int badness = -10 - (pattern_length << 2);
    525 
    526   // We know our pattern is at least 2 characters, we cache the first so
    527   // the common case of the first character not matching is faster.
    528   PatternChar pattern_first_char = pattern[0];
    529   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
    530     badness++;
    531     if (badness <= 0) {
    532       if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
    533         const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
    534             memchr(subject.start() + i,
    535                    pattern_first_char,
    536                    n - i + 1));
    537         if (pos == NULL) {
    538           return -1;
    539         }
    540         i = static_cast<int>(pos - subject.start());
    541       } else {
    542         if (subject[i] != pattern_first_char) continue;
    543       }
    544       int j = 1;
    545       do {
    546         if (pattern[j] != subject[i + j]) {
    547           break;
    548         }
    549         j++;
    550       } while (j < pattern_length);
    551       if (j == pattern_length) {
    552         return i;
    553       }
    554       badness += j;
    555     } else {
    556       search->PopulateBoyerMooreHorspoolTable();
    557       search->strategy_ = &BoyerMooreHorspoolSearch;
    558       return BoyerMooreHorspoolSearch(search, subject, i);
    559     }
    560   }
    561   return -1;
    562 }
    563 
    564 
    565 // Perform a a single stand-alone search.
    566 // If searching multiple times for the same pattern, a search
    567 // object should be constructed once and the Search function then called
    568 // for each search.
    569 template <typename SubjectChar, typename PatternChar>
    570 int SearchString(Isolate* isolate,
    571                  Vector<const SubjectChar> subject,
    572                  Vector<const PatternChar> pattern,
    573                  int start_index) {
    574   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
    575   return search.Search(subject, start_index);
    576 }
    577 
    578 }}  // namespace v8::internal
    579 
    580 #endif  // V8_STRING_SEARCH_H_
    581