1 // Copyright (C) 2011 The Libphonenumber Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Author: George Yakovlev 16 // Philippe Liard 17 18 // Note that we don't use features of ICU that depend on std::string (e.g. 19 // UnicodeString::toUTF8String()) to support clients that build ICU without 20 // -DU_HAVE_STD_STRING. 21 22 #include "phonenumbers/regexp_adapter_icu.h" 23 24 #include <stddef.h> 25 #include <string> 26 27 #include <unicode/regex.h> 28 #include <unicode/stringpiece.h> 29 #include <unicode/unistr.h> 30 31 #include "phonenumbers/base/basictypes.h" 32 #include "phonenumbers/base/logging.h" 33 #include "phonenumbers/base/memory/scoped_ptr.h" 34 #include "phonenumbers/default_logger.h" 35 #include "phonenumbers/string_byte_sink.h" 36 37 namespace i18n { 38 namespace phonenumbers { 39 40 using icu::RegexMatcher; 41 using icu::RegexPattern; 42 using icu::UnicodeString; 43 44 namespace { 45 46 // Converts UnicodeString 'source' to a UTF8-formatted std::string. 47 string UnicodeStringToUtf8String(const UnicodeString& source) { 48 string data; 49 StringByteSink sink(&data); 50 source.toUTF8(sink); 51 return data; 52 } 53 54 // Converts UTF8-formatted std::string 'source' to a UnicodeString. 55 UnicodeString Utf8StringToUnicodeString(const string& source) { 56 // Note that we don't use icu::StringPiece(const string&). 57 return UnicodeString::fromUTF8( 58 icu::StringPiece(source.c_str(), source.size())); 59 } 60 61 } // namespace 62 63 // Implementation of the abstract classes RegExpInput and RegExp using ICU 64 // regular expression capabilities. 65 66 // ICU implementation of the RegExpInput abstract class. 67 class IcuRegExpInput : public RegExpInput { 68 public: 69 explicit IcuRegExpInput(const string& utf8_input) 70 : utf8_input_(Utf8StringToUnicodeString(utf8_input)), 71 position_(0) {} 72 73 virtual ~IcuRegExpInput() {} 74 75 virtual string ToString() const { 76 return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); 77 } 78 79 UnicodeString* Data() { 80 return &utf8_input_; 81 } 82 83 // The current start position. For a newly created input, position is 0. Each 84 // call to ConsumeRegExp() or RegExp::Consume() advances the position in the 85 // case of the successful match to be after the match. 86 int position() const { 87 return position_; 88 } 89 90 void set_position(int position) { 91 DCHECK(position >= 0 && position <= utf8_input_.length()); 92 position_ = position; 93 } 94 95 private: 96 UnicodeString utf8_input_; 97 int position_; 98 99 DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); 100 }; 101 102 // ICU implementation of the RegExp abstract class. 103 class IcuRegExp : public RegExp { 104 public: 105 explicit IcuRegExp(const string& utf8_regexp) { 106 UParseError parse_error; 107 UErrorCode status = U_ZERO_ERROR; 108 utf8_regexp_.reset(RegexPattern::compile( 109 Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status)); 110 if (U_FAILURE(status)) { 111 // The provided regular expressions should compile correctly. 112 LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp; 113 utf8_regexp_.reset(NULL); 114 } 115 } 116 117 virtual ~IcuRegExp() {} 118 119 virtual bool Consume(RegExpInput* input_string, 120 bool anchor_at_start, 121 string* matched_string1, 122 string* matched_string2, 123 string* matched_string3) const { 124 DCHECK(input_string); 125 if (!utf8_regexp_.get()) { 126 return false; 127 } 128 IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); 129 UErrorCode status = U_ZERO_ERROR; 130 const scoped_ptr<RegexMatcher> matcher( 131 utf8_regexp_->matcher(*input->Data(), status)); 132 bool match_succeeded = anchor_at_start 133 ? matcher->lookingAt(input->position(), status) 134 : matcher->find(input->position(), status); 135 if (!match_succeeded || U_FAILURE(status)) { 136 return false; 137 } 138 string* const matched_strings[] = { 139 matched_string1, matched_string2, matched_string3 140 }; 141 // If less matches than expected - fail. 142 for (size_t i = 0; i < arraysize(matched_strings); ++i) { 143 if (matched_strings[i]) { 144 // Groups are counted from 1 rather than 0. 145 const int group_index = i + 1; 146 if (group_index > matcher->groupCount()) { 147 return false; 148 } 149 *matched_strings[i] = 150 UnicodeStringToUtf8String(matcher->group(group_index, status)); 151 } 152 } 153 input->set_position(matcher->end(status)); 154 return !U_FAILURE(status); 155 } 156 157 bool Match(const string& input_string, 158 bool full_match, 159 string* matched_string) const { 160 if (!utf8_regexp_.get()) { 161 return false; 162 } 163 IcuRegExpInput input(input_string); 164 UErrorCode status = U_ZERO_ERROR; 165 const scoped_ptr<RegexMatcher> matcher( 166 utf8_regexp_->matcher(*input.Data(), status)); 167 bool match_succeeded = full_match 168 ? matcher->matches(input.position(), status) 169 : matcher->find(input.position(), status); 170 if (!match_succeeded || U_FAILURE(status)) { 171 return false; 172 } 173 if (matcher->groupCount() > 0 && matched_string) { 174 *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); 175 } 176 return !U_FAILURE(status); 177 } 178 179 bool Replace(string* string_to_process, 180 bool global, 181 const string& replacement_string) const { 182 DCHECK(string_to_process); 183 if (!utf8_regexp_.get()) { 184 return false; 185 } 186 IcuRegExpInput input(*string_to_process); 187 UErrorCode status = U_ZERO_ERROR; 188 const scoped_ptr<RegexMatcher> matcher( 189 utf8_regexp_->matcher(*input.Data(), status)); 190 if (U_FAILURE(status)) { 191 return false; 192 } 193 194 UnicodeString output; 195 // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is 196 // consistent with the RE2 reg-ex matcher. 197 if (!matcher->find()) { 198 return false; 199 } 200 matcher->appendReplacement(output, 201 Utf8StringToUnicodeString(replacement_string), 202 status); 203 if (global) { 204 // Continue and look for more matches. 205 while (matcher->find()) { 206 matcher->appendReplacement( 207 output, 208 Utf8StringToUnicodeString(replacement_string), 209 status); 210 } 211 } 212 213 matcher->appendTail(output); 214 if (U_FAILURE(status)) { 215 return false; 216 } 217 const string replaced_string = UnicodeStringToUtf8String(output); 218 *string_to_process = replaced_string; 219 return true; 220 } 221 222 private: 223 scoped_ptr<RegexPattern> utf8_regexp_; 224 225 DISALLOW_COPY_AND_ASSIGN(IcuRegExp); 226 }; 227 228 RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const { 229 return new IcuRegExpInput(utf8_input); 230 } 231 232 RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const { 233 return new IcuRegExp(utf8_regexp); 234 } 235 236 } // namespace phonenumbers 237 } // namespace i18n 238