1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "util/utf8/unilib-icu.h" 18 19 #include <utility> 20 21 namespace libtextclassifier2 { 22 23 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const { 24 UErrorCode status = U_ZERO_ERROR; 25 UNumberFormat* format_alias = 26 unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status); 27 if (U_FAILURE(status)) { 28 return false; 29 } 30 icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8( 31 icu::StringPiece(text.data(), text.size_bytes())); 32 int parse_index = 0; 33 const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(), 34 utf8_string.length(), &parse_index, &status); 35 *result = integer; 36 unum_close(format_alias); 37 if (U_FAILURE(status) || parse_index != utf8_string.length()) { 38 return false; 39 } 40 return true; 41 } 42 43 bool UniLib::IsOpeningBracket(char32 codepoint) const { 44 return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) == 45 U_BPT_OPEN; 46 } 47 48 bool UniLib::IsClosingBracket(char32 codepoint) const { 49 return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) == 50 U_BPT_CLOSE; 51 } 52 53 bool UniLib::IsWhitespace(char32 codepoint) const { 54 return u_isWhitespace(codepoint); 55 } 56 57 bool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); } 58 59 bool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); } 60 61 char32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); } 62 63 char32 UniLib::GetPairedBracket(char32 codepoint) const { 64 return u_getBidiPairedBracket(codepoint); 65 } 66 67 UniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern, 68 icu::UnicodeString text) 69 : text_(std::move(text)), 70 last_find_offset_(0), 71 last_find_offset_codepoints_(0), 72 last_find_offset_dirty_(true) { 73 UErrorCode status = U_ZERO_ERROR; 74 matcher_.reset(pattern->matcher(text_, status)); 75 if (U_FAILURE(status)) { 76 matcher_.reset(nullptr); 77 } 78 } 79 80 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher( 81 const UnicodeText& input) const { 82 return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher( 83 pattern_.get(), icu::UnicodeString::fromUTF8( 84 icu::StringPiece(input.data(), input.size_bytes())))); 85 } 86 87 constexpr int UniLib::RegexMatcher::kError; 88 constexpr int UniLib::RegexMatcher::kNoError; 89 90 bool UniLib::RegexMatcher::Matches(int* status) const { 91 if (!matcher_) { 92 *status = kError; 93 return false; 94 } 95 96 UErrorCode icu_status = U_ZERO_ERROR; 97 const bool result = matcher_->matches(/*startIndex=*/0, icu_status); 98 if (U_FAILURE(icu_status)) { 99 *status = kError; 100 return false; 101 } 102 *status = kNoError; 103 return result; 104 } 105 106 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) { 107 if (!matcher_) { 108 *status = kError; 109 return false; 110 } 111 112 matcher_->reset(); 113 *status = kNoError; 114 if (!Find(status) || *status != kNoError) { 115 return false; 116 } 117 const int found_start = Start(status); 118 if (*status != kNoError) { 119 return false; 120 } 121 const int found_end = End(status); 122 if (*status != kNoError) { 123 return false; 124 } 125 if (found_start != 0 || found_end != text_.countChar32()) { 126 return false; 127 } 128 return true; 129 } 130 131 bool UniLib::RegexMatcher::UpdateLastFindOffset() const { 132 if (!last_find_offset_dirty_) { 133 return true; 134 } 135 136 // Update the position of the match. 137 UErrorCode icu_status = U_ZERO_ERROR; 138 const int find_offset = matcher_->start(0, icu_status); 139 if (U_FAILURE(icu_status)) { 140 return false; 141 } 142 last_find_offset_codepoints_ += 143 text_.countChar32(last_find_offset_, find_offset - last_find_offset_); 144 last_find_offset_ = find_offset; 145 last_find_offset_dirty_ = false; 146 147 return true; 148 } 149 150 bool UniLib::RegexMatcher::Find(int* status) { 151 if (!matcher_) { 152 *status = kError; 153 return false; 154 } 155 UErrorCode icu_status = U_ZERO_ERROR; 156 const bool result = matcher_->find(icu_status); 157 if (U_FAILURE(icu_status)) { 158 *status = kError; 159 return false; 160 } 161 162 last_find_offset_dirty_ = true; 163 *status = kNoError; 164 return result; 165 } 166 167 int UniLib::RegexMatcher::Start(int* status) const { 168 return Start(/*group_idx=*/0, status); 169 } 170 171 int UniLib::RegexMatcher::Start(int group_idx, int* status) const { 172 if (!matcher_ || !UpdateLastFindOffset()) { 173 *status = kError; 174 return kError; 175 } 176 177 UErrorCode icu_status = U_ZERO_ERROR; 178 const int result = matcher_->start(group_idx, icu_status); 179 if (U_FAILURE(icu_status)) { 180 *status = kError; 181 return kError; 182 } 183 *status = kNoError; 184 185 // If the group didn't participate in the match the result is -1 and is 186 // incompatible with the caching logic bellow. 187 if (result == -1) { 188 return -1; 189 } 190 191 return last_find_offset_codepoints_ + 192 text_.countChar32(/*start=*/last_find_offset_, 193 /*length=*/result - last_find_offset_); 194 } 195 196 int UniLib::RegexMatcher::End(int* status) const { 197 return End(/*group_idx=*/0, status); 198 } 199 200 int UniLib::RegexMatcher::End(int group_idx, int* status) const { 201 if (!matcher_ || !UpdateLastFindOffset()) { 202 *status = kError; 203 return kError; 204 } 205 UErrorCode icu_status = U_ZERO_ERROR; 206 const int result = matcher_->end(group_idx, icu_status); 207 if (U_FAILURE(icu_status)) { 208 *status = kError; 209 return kError; 210 } 211 *status = kNoError; 212 213 // If the group didn't participate in the match the result is -1 and is 214 // incompatible with the caching logic bellow. 215 if (result == -1) { 216 return -1; 217 } 218 219 return last_find_offset_codepoints_ + 220 text_.countChar32(/*start=*/last_find_offset_, 221 /*length=*/result - last_find_offset_); 222 } 223 224 UnicodeText UniLib::RegexMatcher::Group(int* status) const { 225 return Group(/*group_idx=*/0, status); 226 } 227 228 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const { 229 if (!matcher_) { 230 *status = kError; 231 return UTF8ToUnicodeText("", /*do_copy=*/false); 232 } 233 std::string result = ""; 234 UErrorCode icu_status = U_ZERO_ERROR; 235 const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status); 236 if (U_FAILURE(icu_status)) { 237 *status = kError; 238 return UTF8ToUnicodeText("", /*do_copy=*/false); 239 } 240 result_icu.toUTF8String(result); 241 *status = kNoError; 242 return UTF8ToUnicodeText(result, /*do_copy=*/true); 243 } 244 245 constexpr int UniLib::BreakIterator::kDone; 246 247 UniLib::BreakIterator::BreakIterator(const UnicodeText& text) 248 : text_(icu::UnicodeString::fromUTF8( 249 icu::StringPiece(text.data(), text.size_bytes()))), 250 last_break_index_(0), 251 last_unicode_index_(0) { 252 icu::ErrorCode status; 253 break_iterator_.reset( 254 icu::BreakIterator::createWordInstance(icu::Locale("en"), status)); 255 if (!status.isSuccess()) { 256 break_iterator_.reset(); 257 return; 258 } 259 break_iterator_->setText(text_); 260 } 261 262 int UniLib::BreakIterator::Next() { 263 const int break_index = break_iterator_->next(); 264 if (break_index == icu::BreakIterator::DONE) { 265 return BreakIterator::kDone; 266 } 267 last_unicode_index_ += 268 text_.countChar32(last_break_index_, break_index - last_break_index_); 269 last_break_index_ = break_index; 270 return last_unicode_index_; 271 } 272 273 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern( 274 const UnicodeText& regex) const { 275 UErrorCode status = U_ZERO_ERROR; 276 std::unique_ptr<icu::RegexPattern> pattern( 277 icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece( 278 regex.data(), regex.size_bytes())), 279 /*flags=*/UREGEX_MULTILINE, status)); 280 if (U_FAILURE(status) || !pattern) { 281 return nullptr; 282 } 283 return std::unique_ptr<UniLib::RegexPattern>( 284 new UniLib::RegexPattern(std::move(pattern))); 285 } 286 287 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator( 288 const UnicodeText& text) const { 289 return std::unique_ptr<UniLib::BreakIterator>( 290 new UniLib::BreakIterator(text)); 291 } 292 293 } // namespace libtextclassifier2 294