Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "util/utf8/unilib-icu.h"
     18 
     19 #include <utility>
     20 
     21 namespace libtextclassifier2 {
     22 
     23 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
     24   UErrorCode status = U_ZERO_ERROR;
     25   UNumberFormat* format_alias =
     26       unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status);
     27   if (U_FAILURE(status)) {
     28     return false;
     29   }
     30   icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8(
     31       icu::StringPiece(text.data(), text.size_bytes()));
     32   int parse_index = 0;
     33   const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(),
     34                                    utf8_string.length(), &parse_index, &status);
     35   *result = integer;
     36   unum_close(format_alias);
     37   if (U_FAILURE(status) || parse_index != utf8_string.length()) {
     38     return false;
     39   }
     40   return true;
     41 }
     42 
     43 bool UniLib::IsOpeningBracket(char32 codepoint) const {
     44   return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
     45          U_BPT_OPEN;
     46 }
     47 
     48 bool UniLib::IsClosingBracket(char32 codepoint) const {
     49   return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
     50          U_BPT_CLOSE;
     51 }
     52 
     53 bool UniLib::IsWhitespace(char32 codepoint) const {
     54   return u_isWhitespace(codepoint);
     55 }
     56 
     57 bool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); }
     58 
     59 bool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); }
     60 
     61 char32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); }
     62 
     63 char32 UniLib::GetPairedBracket(char32 codepoint) const {
     64   return u_getBidiPairedBracket(codepoint);
     65 }
     66 
     67 UniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern,
     68                                    icu::UnicodeString text)
     69     : text_(std::move(text)),
     70       last_find_offset_(0),
     71       last_find_offset_codepoints_(0),
     72       last_find_offset_dirty_(true) {
     73   UErrorCode status = U_ZERO_ERROR;
     74   matcher_.reset(pattern->matcher(text_, status));
     75   if (U_FAILURE(status)) {
     76     matcher_.reset(nullptr);
     77   }
     78 }
     79 
     80 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
     81     const UnicodeText& input) const {
     82   return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher(
     83       pattern_.get(), icu::UnicodeString::fromUTF8(
     84                           icu::StringPiece(input.data(), input.size_bytes()))));
     85 }
     86 
// Namespace-scope definitions of the RegexMatcher status constants.
// NOTE(review): presumably these are declared (with their values) as static
// constexpr members in the header and defined here so that they can be
// ODR-used before C++17 -- confirm against the header.
constexpr int UniLib::RegexMatcher::kError;
constexpr int UniLib::RegexMatcher::kNoError;
     89 
     90 bool UniLib::RegexMatcher::Matches(int* status) const {
     91   if (!matcher_) {
     92     *status = kError;
     93     return false;
     94   }
     95 
     96   UErrorCode icu_status = U_ZERO_ERROR;
     97   const bool result = matcher_->matches(/*startIndex=*/0, icu_status);
     98   if (U_FAILURE(icu_status)) {
     99     *status = kError;
    100     return false;
    101   }
    102   *status = kNoError;
    103   return result;
    104 }
    105 
    106 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
    107   if (!matcher_) {
    108     *status = kError;
    109     return false;
    110   }
    111 
    112   matcher_->reset();
    113   *status = kNoError;
    114   if (!Find(status) || *status != kNoError) {
    115     return false;
    116   }
    117   const int found_start = Start(status);
    118   if (*status != kNoError) {
    119     return false;
    120   }
    121   const int found_end = End(status);
    122   if (*status != kNoError) {
    123     return false;
    124   }
    125   if (found_start != 0 || found_end != text_.countChar32()) {
    126     return false;
    127   }
    128   return true;
    129 }
    130 
    131 bool UniLib::RegexMatcher::UpdateLastFindOffset() const {
    132   if (!last_find_offset_dirty_) {
    133     return true;
    134   }
    135 
    136   // Update the position of the match.
    137   UErrorCode icu_status = U_ZERO_ERROR;
    138   const int find_offset = matcher_->start(0, icu_status);
    139   if (U_FAILURE(icu_status)) {
    140     return false;
    141   }
    142   last_find_offset_codepoints_ +=
    143       text_.countChar32(last_find_offset_, find_offset - last_find_offset_);
    144   last_find_offset_ = find_offset;
    145   last_find_offset_dirty_ = false;
    146 
    147   return true;
    148 }
    149 
    150 bool UniLib::RegexMatcher::Find(int* status) {
    151   if (!matcher_) {
    152     *status = kError;
    153     return false;
    154   }
    155   UErrorCode icu_status = U_ZERO_ERROR;
    156   const bool result = matcher_->find(icu_status);
    157   if (U_FAILURE(icu_status)) {
    158     *status = kError;
    159     return false;
    160   }
    161 
    162   last_find_offset_dirty_ = true;
    163   *status = kNoError;
    164   return result;
    165 }
    166 
    167 int UniLib::RegexMatcher::Start(int* status) const {
    168   return Start(/*group_idx=*/0, status);
    169 }
    170 
    171 int UniLib::RegexMatcher::Start(int group_idx, int* status) const {
    172   if (!matcher_ || !UpdateLastFindOffset()) {
    173     *status = kError;
    174     return kError;
    175   }
    176 
    177   UErrorCode icu_status = U_ZERO_ERROR;
    178   const int result = matcher_->start(group_idx, icu_status);
    179   if (U_FAILURE(icu_status)) {
    180     *status = kError;
    181     return kError;
    182   }
    183   *status = kNoError;
    184 
    185   // If the group didn't participate in the match the result is -1 and is
    186   // incompatible with the caching logic bellow.
    187   if (result == -1) {
    188     return -1;
    189   }
    190 
    191   return last_find_offset_codepoints_ +
    192          text_.countChar32(/*start=*/last_find_offset_,
    193                            /*length=*/result - last_find_offset_);
    194 }
    195 
    196 int UniLib::RegexMatcher::End(int* status) const {
    197   return End(/*group_idx=*/0, status);
    198 }
    199 
    200 int UniLib::RegexMatcher::End(int group_idx, int* status) const {
    201   if (!matcher_ || !UpdateLastFindOffset()) {
    202     *status = kError;
    203     return kError;
    204   }
    205   UErrorCode icu_status = U_ZERO_ERROR;
    206   const int result = matcher_->end(group_idx, icu_status);
    207   if (U_FAILURE(icu_status)) {
    208     *status = kError;
    209     return kError;
    210   }
    211   *status = kNoError;
    212 
    213   // If the group didn't participate in the match the result is -1 and is
    214   // incompatible with the caching logic bellow.
    215   if (result == -1) {
    216     return -1;
    217   }
    218 
    219   return last_find_offset_codepoints_ +
    220          text_.countChar32(/*start=*/last_find_offset_,
    221                            /*length=*/result - last_find_offset_);
    222 }
    223 
    224 UnicodeText UniLib::RegexMatcher::Group(int* status) const {
    225   return Group(/*group_idx=*/0, status);
    226 }
    227 
    228 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
    229   if (!matcher_) {
    230     *status = kError;
    231     return UTF8ToUnicodeText("", /*do_copy=*/false);
    232   }
    233   std::string result = "";
    234   UErrorCode icu_status = U_ZERO_ERROR;
    235   const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status);
    236   if (U_FAILURE(icu_status)) {
    237     *status = kError;
    238     return UTF8ToUnicodeText("", /*do_copy=*/false);
    239   }
    240   result_icu.toUTF8String(result);
    241   *status = kNoError;
    242   return UTF8ToUnicodeText(result, /*do_copy=*/true);
    243 }
    244 
// Namespace-scope definition of the value Next() returns once ICU reports that
// iteration is done. NOTE(review): presumably declared (with its value) as a
// static constexpr member in the header -- confirm there.
constexpr int UniLib::BreakIterator::kDone;
    246 
    247 UniLib::BreakIterator::BreakIterator(const UnicodeText& text)
    248     : text_(icu::UnicodeString::fromUTF8(
    249           icu::StringPiece(text.data(), text.size_bytes()))),
    250       last_break_index_(0),
    251       last_unicode_index_(0) {
    252   icu::ErrorCode status;
    253   break_iterator_.reset(
    254       icu::BreakIterator::createWordInstance(icu::Locale("en"), status));
    255   if (!status.isSuccess()) {
    256     break_iterator_.reset();
    257     return;
    258   }
    259   break_iterator_->setText(text_);
    260 }
    261 
    262 int UniLib::BreakIterator::Next() {
    263   const int break_index = break_iterator_->next();
    264   if (break_index == icu::BreakIterator::DONE) {
    265     return BreakIterator::kDone;
    266   }
    267   last_unicode_index_ +=
    268       text_.countChar32(last_break_index_, break_index - last_break_index_);
    269   last_break_index_ = break_index;
    270   return last_unicode_index_;
    271 }
    272 
    273 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
    274     const UnicodeText& regex) const {
    275   UErrorCode status = U_ZERO_ERROR;
    276   std::unique_ptr<icu::RegexPattern> pattern(
    277       icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece(
    278                                      regex.data(), regex.size_bytes())),
    279                                  /*flags=*/UREGEX_MULTILINE, status));
    280   if (U_FAILURE(status) || !pattern) {
    281     return nullptr;
    282   }
    283   return std::unique_ptr<UniLib::RegexPattern>(
    284       new UniLib::RegexPattern(std::move(pattern)));
    285 }
    286 
    287 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
    288     const UnicodeText& text) const {
    289   return std::unique_ptr<UniLib::BreakIterator>(
    290       new UniLib::BreakIterator(text));
    291 }
    292 
    293 }  // namespace libtextclassifier2
    294