Home | History | Annotate | Download | only in strings
      1 // Copyright 2015 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/strings/pattern.h"
      6 
      7 #include "base/third_party/icu/icu_utf.h"
      8 
      9 namespace base {
     10 
     11 namespace {
     12 
     13 constexpr bool IsWildcard(base_icu::UChar32 character) {
     14   return character == '*' || character == '?';
     15 }
     16 
     17 // Searches for the next subpattern of |pattern| in |string|, up to the given
     18 // |maximum_distance|. The subpattern extends from the start of |pattern| up to
     19 // the first wildcard character (or the end of the string). If the value of
     20 // |maximum_distance| is negative, the maximum distance is considered infinite.
     21 template <typename CHAR, typename NEXT>
     22 constexpr bool SearchForChars(const CHAR** pattern,
     23                               const CHAR* pattern_end,
     24                               const CHAR** string,
     25                               const CHAR* string_end,
     26                               int maximum_distance,
     27                               NEXT next) {
     28   const CHAR* pattern_start = *pattern;
     29   const CHAR* string_start = *string;
     30   bool escape = false;
     31   while (true) {
     32     if (*pattern == pattern_end) {
     33       // If this is the end of the pattern, only accept the end of the string;
     34       // anything else falls through to the mismatch case.
     35       if (*string == string_end)
     36         return true;
     37     } else {
     38       // If we have found a wildcard, we're done.
     39       if (!escape && IsWildcard(**pattern))
     40         return true;
     41 
     42       // Check if the escape character is found. If so, skip it and move to the
     43       // next character.
     44       if (!escape && **pattern == '\\') {
     45         escape = true;
     46         next(pattern, pattern_end);
     47         continue;
     48       }
     49 
     50       escape = false;
     51 
     52       if (*string == string_end)
     53         return false;
     54 
     55       // Check if the chars match, if so, increment the ptrs.
     56       const CHAR* pattern_next = *pattern;
     57       const CHAR* string_next = *string;
     58       base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
     59       if (pattern_char == next(&string_next, string_end) &&
     60           pattern_char != CBU_SENTINEL) {
     61         *pattern = pattern_next;
     62         *string = string_next;
     63         continue;
     64       }
     65     }
     66 
     67     // Mismatch. If we have reached the maximum distance, return false,
     68     // otherwise restart at the beginning of the pattern with the next character
     69     // in the string.
     70     // TODO(bauerb): This is a naive implementation of substring search, which
     71     // could be implemented with a more efficient algorithm, e.g.
     72     // Knuth-Morris-Pratt (at the expense of requiring preprocessing).
     73     if (maximum_distance == 0)
     74       return false;
     75 
     76     // Because unlimited distance is represented as -1, this will never reach 0
     77     // and therefore fail the match above.
     78     maximum_distance--;
     79     *pattern = pattern_start;
     80     next(&string_start, string_end);
     81     *string = string_start;
     82   }
     83 }
     84 
     85 // Consumes consecutive wildcard characters (? or *). Returns the maximum number
     86 // of characters matched by the sequence of wildcards, or -1 if the wildcards
     87 // match an arbitrary number of characters (which is the case if it contains at
     88 // least one *).
     89 template <typename CHAR, typename NEXT>
     90 constexpr int EatWildcards(const CHAR** pattern, const CHAR* end, NEXT next) {
     91   int num_question_marks = 0;
     92   bool has_asterisk = false;
     93   while (*pattern != end) {
     94     if (**pattern == '?') {
     95       num_question_marks++;
     96     } else if (**pattern == '*') {
     97       has_asterisk = true;
     98     } else {
     99       break;
    100     }
    101 
    102     next(pattern, end);
    103   }
    104   return has_asterisk ? -1 : num_question_marks;
    105 }
    106 
    107 template <typename CHAR, typename NEXT>
    108 constexpr bool MatchPatternT(const CHAR* eval,
    109                              const CHAR* eval_end,
    110                              const CHAR* pattern,
    111                              const CHAR* pattern_end,
    112                              NEXT next) {
    113   do {
    114     int maximum_wildcard_length = EatWildcards(&pattern, pattern_end, next);
    115     if (!SearchForChars(&pattern, pattern_end, &eval, eval_end,
    116                         maximum_wildcard_length, next)) {
    117       return false;
    118     }
    119   } while (pattern != pattern_end);
    120   return true;
    121 }
    122 
    123 struct NextCharUTF8 {
    124   base_icu::UChar32 operator()(const char** p, const char* end) {
    125     base_icu::UChar32 c;
    126     int offset = 0;
    127     CBU8_NEXT(*p, offset, end - *p, c);
    128     *p += offset;
    129     return c;
    130   }
    131 };
    132 
    133 struct NextCharUTF16 {
    134   base_icu::UChar32 operator()(const char16** p, const char16* end) {
    135     base_icu::UChar32 c;
    136     int offset = 0;
    137     CBU16_NEXT(*p, offset, end - *p, c);
    138     *p += offset;
    139     return c;
    140   }
    141 };
    142 
    143 }  // namespace
    144 
    145 bool MatchPattern(StringPiece eval, StringPiece pattern) {
    146   return MatchPatternT(eval.data(), eval.data() + eval.size(), pattern.data(),
    147                        pattern.data() + pattern.size(), NextCharUTF8());
    148 }
    149 
    150 bool MatchPattern(StringPiece16 eval, StringPiece16 pattern) {
    151   return MatchPatternT(eval.data(), eval.data() + eval.size(), pattern.data(),
    152                        pattern.data() + pattern.size(), NextCharUTF16());
    153 }
    154 
    155 }  // namespace base
    156