Home | History | Annotate | Download | only in phonenumbers
      1 // Copyright (C) 2011 The Libphonenumber Authors
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // Author: George Yakovlev
     16 //         Philippe Liard
     17 
     18 // Note that we don't use features of ICU that depend on std::string (e.g.
     19 // UnicodeString::toUTF8String()) to support clients that build ICU without
     20 // -DU_HAVE_STD_STRING.
     21 
     22 #include "phonenumbers/regexp_adapter_icu.h"
     23 
     24 #include <stddef.h>
     25 #include <string>
     26 
     27 #include <unicode/regex.h>
     28 #include <unicode/stringpiece.h>
     29 #include <unicode/unistr.h>
     30 
     31 #include "phonenumbers/base/basictypes.h"
     32 #include "phonenumbers/base/logging.h"
     33 #include "phonenumbers/base/memory/scoped_ptr.h"
     34 #include "phonenumbers/default_logger.h"
     35 #include "phonenumbers/string_byte_sink.h"
     36 
     37 namespace i18n {
     38 namespace phonenumbers {
     39 
     40 using icu::RegexMatcher;
     41 using icu::RegexPattern;
     42 using icu::UnicodeString;
     43 
     44 namespace {
     45 
     46 // Converts UnicodeString 'source' to a UTF8-formatted std::string.
     47 string UnicodeStringToUtf8String(const UnicodeString& source) {
     48   string data;
     49   StringByteSink sink(&data);
     50   source.toUTF8(sink);
     51   return data;
     52 }
     53 
     54 // Converts UTF8-formatted std::string 'source' to a UnicodeString.
     55 UnicodeString Utf8StringToUnicodeString(const string& source) {
     56   // Note that we don't use icu::StringPiece(const string&).
     57   return UnicodeString::fromUTF8(
     58       icu::StringPiece(source.c_str(), source.size()));
     59 }
     60 
     61 }  // namespace
     62 
     63 // Implementation of the abstract classes RegExpInput and RegExp using ICU
     64 // regular expression capabilities.
     65 
     66 // ICU implementation of the RegExpInput abstract class.
     67 class IcuRegExpInput : public RegExpInput {
     68  public:
     69   explicit IcuRegExpInput(const string& utf8_input)
     70       : utf8_input_(Utf8StringToUnicodeString(utf8_input)),
     71         position_(0) {}
     72 
     73   virtual ~IcuRegExpInput() {}
     74 
     75   virtual string ToString() const {
     76     return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
     77   }
     78 
     79   UnicodeString* Data() {
     80     return &utf8_input_;
     81   }
     82 
     83   // The current start position. For a newly created input, position is 0. Each
     84   // call to ConsumeRegExp() or RegExp::Consume() advances the position in the
     85   // case of the successful match to be after the match.
     86   int position() const {
     87     return position_;
     88   }
     89 
     90   void set_position(int position) {
     91     DCHECK(position >= 0 && position <= utf8_input_.length());
     92     position_ = position;
     93   }
     94 
     95  private:
     96   UnicodeString utf8_input_;
     97   int position_;
     98 
     99   DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
    100 };
    101 
    102 // ICU implementation of the RegExp abstract class.
    103 class IcuRegExp : public RegExp {
    104  public:
    105   explicit IcuRegExp(const string& utf8_regexp) {
    106     UParseError parse_error;
    107     UErrorCode status = U_ZERO_ERROR;
    108     utf8_regexp_.reset(RegexPattern::compile(
    109         Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
    110     if (U_FAILURE(status)) {
    111       // The provided regular expressions should compile correctly.
    112       LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
    113       utf8_regexp_.reset(NULL);
    114     }
    115   }
    116 
    117   virtual ~IcuRegExp() {}
    118 
    119   virtual bool Consume(RegExpInput* input_string,
    120                        bool anchor_at_start,
    121                        string* matched_string1,
    122                        string* matched_string2,
    123                        string* matched_string3) const {
    124     DCHECK(input_string);
    125     if (!utf8_regexp_.get()) {
    126       return false;
    127     }
    128     IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
    129     UErrorCode status = U_ZERO_ERROR;
    130     const scoped_ptr<RegexMatcher> matcher(
    131         utf8_regexp_->matcher(*input->Data(), status));
    132     bool match_succeeded = anchor_at_start
    133         ? matcher->lookingAt(input->position(), status)
    134         : matcher->find(input->position(), status);
    135     if (!match_succeeded || U_FAILURE(status)) {
    136       return false;
    137     }
    138     string* const matched_strings[] = {
    139       matched_string1, matched_string2, matched_string3
    140     };
    141     // If less matches than expected - fail.
    142     for (size_t i = 0; i < arraysize(matched_strings); ++i) {
    143       if (matched_strings[i]) {
    144         // Groups are counted from 1 rather than 0.
    145         const int group_index = i + 1;
    146         if (group_index > matcher->groupCount()) {
    147           return false;
    148         }
    149         *matched_strings[i] =
    150             UnicodeStringToUtf8String(matcher->group(group_index, status));
    151       }
    152     }
    153     input->set_position(matcher->end(status));
    154     return !U_FAILURE(status);
    155   }
    156 
    157   bool Match(const string& input_string,
    158              bool full_match,
    159              string* matched_string) const {
    160     if (!utf8_regexp_.get()) {
    161       return false;
    162     }
    163     IcuRegExpInput input(input_string);
    164     UErrorCode status = U_ZERO_ERROR;
    165     const scoped_ptr<RegexMatcher> matcher(
    166         utf8_regexp_->matcher(*input.Data(), status));
    167     bool match_succeeded = full_match
    168         ? matcher->matches(input.position(), status)
    169         : matcher->find(input.position(), status);
    170     if (!match_succeeded || U_FAILURE(status)) {
    171       return false;
    172     }
    173     if (matcher->groupCount() > 0 && matched_string) {
    174       *matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
    175     }
    176     return !U_FAILURE(status);
    177   }
    178 
    179   bool Replace(string* string_to_process,
    180                bool global,
    181                const string& replacement_string) const {
    182     DCHECK(string_to_process);
    183     if (!utf8_regexp_.get()) {
    184       return false;
    185     }
    186     IcuRegExpInput input(*string_to_process);
    187     UErrorCode status = U_ZERO_ERROR;
    188     const scoped_ptr<RegexMatcher> matcher(
    189         utf8_regexp_->matcher(*input.Data(), status));
    190     if (U_FAILURE(status)) {
    191       return false;
    192     }
    193 
    194     UnicodeString output;
    195     // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
    196     // consistent with the RE2 reg-ex matcher.
    197     if (!matcher->find()) {
    198       return false;
    199     }
    200     matcher->appendReplacement(output,
    201                                Utf8StringToUnicodeString(replacement_string),
    202                                status);
    203     if (global) {
    204       // Continue and look for more matches.
    205       while (matcher->find()) {
    206         matcher->appendReplacement(
    207             output,
    208             Utf8StringToUnicodeString(replacement_string),
    209             status);
    210       }
    211     }
    212 
    213     matcher->appendTail(output);
    214     if (U_FAILURE(status)) {
    215       return false;
    216     }
    217     const string replaced_string = UnicodeStringToUtf8String(output);
    218     *string_to_process = replaced_string;
    219     return true;
    220   }
    221 
    222  private:
    223   scoped_ptr<RegexPattern> utf8_regexp_;
    224 
    225   DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
    226 };
    227 
    228 RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
    229   return new IcuRegExpInput(utf8_input);
    230 }
    231 
    232 RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
    233   return new IcuRegExp(utf8_regexp);
    234 }
    235 
    236 }  // namespace phonenumbers
    237 }  // namespace i18n
    238