Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 // UniLib implementation with the help of ICU. UniLib is basically a wrapper
     18 // around the ICU functionality.
     19 
     20 #ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_
     21 #define LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_
     22 
     23 #include <memory>
     24 
     25 #include "util/base/integral_types.h"
     26 #include "util/utf8/unicodetext.h"
     27 #include "unicode/brkiter.h"
     28 #include "unicode/errorcode.h"
     29 #include "unicode/regex.h"
     30 #include "unicode/uchar.h"
     31 #include "unicode/unum.h"
     32 
     33 namespace libtextclassifier2 {
     34 
     35 class UniLib {
          // Declares the Unicode helpers used by this library: codepoint predicates
          // and conversions, plus the nested wrapper classes RegexMatcher,
          // RegexPattern and BreakIterator, each of which holds an ICU object.
          // Only declarations are visible in this header; the behavior of every
          // non-inline method is defined in the implementation file.
     36  public:
          // Presumably parses `text` as a 32-bit integer into `*result` and returns
          // whether parsing succeeded -- TODO confirm against the implementation.
     37   bool ParseInt32(const UnicodeText& text, int* result) const;
          // Codepoint predicates. NOTE(review): which Unicode properties each one
          // consults is not visible in this header -- confirm in the implementation.
     38   bool IsOpeningBracket(char32 codepoint) const;
     39   bool IsClosingBracket(char32 codepoint) const;
     40   bool IsWhitespace(char32 codepoint) const;
     41   bool IsDigit(char32 codepoint) const;
     42   bool IsUpper(char32 codepoint) const;
     43 
          // Codepoint conversions. NOTE(review): the result for a codepoint with no
          // lowercase form / no paired bracket is not visible here -- confirm.
     44   char32 ToLower(char32 codepoint) const;
     45   char32 GetPairedBracket(char32 codepoint) const;
     46 
     47   // Forward declaration so that RegexMatcher can name RegexPattern as a friend.
     48   class RegexPattern;
     49 
          // Matcher over one input text; holds an icu::RegexMatcher and the text.
          // Methods report failure through an `int* status` out-parameter using the
          // kError / kNoError constants below.
     50   class RegexMatcher {
     51    public:
            // Status values. The method comments below state when kError is set;
            // presumably kNoError is written on success -- TODO confirm.
     52     static constexpr int kError = -1;
     53     static constexpr int kNoError = 0;
     54 
     55     // Checks whether the input text matches the pattern exactly.
     56     bool Matches(int* status) const;
     57 
     58     // Approximate Matches() implementation implemented using Find(). It uses
     59     // the first Find() result and then checks that it spans the whole input.
     60     // NOTE: Unlike Matches() it can result in false negatives.
     61     // NOTE: Resets the matcher, so the current Find() state will be lost.
     62     bool ApproximatelyMatches(int* status);
     63 
     64     // Finds occurrences of the pattern in the input text.
     65     // Can be called repeatedly to find all occurrences. A call will update
     66     // internal state, so that 'Start', 'End' and 'Group' can be called to get
     67     // information about the match.
     68     // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
     69     // modify the state.
     70     bool Find(int* status);
     71 
     72     // Gets the start offset of the last match (from 'Find').
     73     // Sets status to 'kError' if 'Find'
     74     // was not called previously.
            // NOTE(review): the unit of the offset (presumably codepoints, given the
            // last_find_offset_codepoints_ member) is not stated here -- confirm.
     75     int Start(int* status) const;
     76 
     77     // Gets the start offset of the specified group of the last match
     78     // (from 'Find').
     79     // Sets status to 'kError' if an invalid group was specified or if 'Find'
     80     // was not called previously.
     81     int Start(int group_idx, int* status) const;
     82 
     83     // Gets the end offset of the last match (from 'Find').
     84     // Sets status to 'kError' if 'Find'
     85     // was not called previously.
     86     int End(int* status) const;
     87 
     88     // Gets the end offset of the specified group of the last match
     89     // (from 'Find').
     90     // Sets status to 'kError' if an invalid group was specified or if 'Find'
     91     // was not called previously.
     92     int End(int group_idx, int* status) const;
     93 
     94     // Gets the text of the last match (from 'Find').
     95     // Sets status to 'kError' if 'Find' was not called previously.
     96     UnicodeText Group(int* status) const;
     97 
     98     // Gets the text of the specified group of the last match (from 'Find').
     99     // Sets status to 'kError' if an invalid group was specified or if 'Find'
    100     // was not called previously.
    101     UnicodeText Group(int group_idx, int* status) const;
    102 
    103    protected:
            // The constructor is protected and RegexPattern is a friend, so ordinary
            // callers cannot construct a matcher directly; RegexPattern::Matcher()
            // is the declared way to obtain one.
            // NOTE(review): `pattern` is a raw pointer and its ownership/lifetime
            // contract is not visible here -- presumably it must outlive the
            // matcher; confirm in the implementation.
    104     friend class RegexPattern;
    105     explicit RegexMatcher(icu::RegexPattern* pattern, icu::UnicodeString text);
    106 
    107    private:
            // Const helper that may modify the mutable offset members below.
            // NOTE(review): the meaning of its bool result is not visible here.
    108     bool UpdateLastFindOffset() const;
    109 
    110     std::unique_ptr<icu::RegexMatcher> matcher_;
    111     icu::UnicodeString text_;
            // Declared mutable so that const member functions (such as
            // UpdateLastFindOffset()) can modify them. Presumably a cache that maps
            // the last match offset to a codepoint offset -- TODO confirm.
    112     mutable int last_find_offset_;
    113     mutable int last_find_offset_codepoints_;
    114     mutable bool last_find_offset_dirty_;
    115   };
    116 
          // Compiled pattern; owns an icu::RegexPattern and hands out matchers.
    117   class RegexPattern {
    118    public:
            // Returns a newly allocated RegexMatcher for `input`.
    119     std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& input) const;
    120 
    121    protected:
            // Protected constructor with UniLib as a friend: instances are obtained
            // through UniLib::CreateRegexPattern(). Takes ownership of `pattern`.
            // NOTE(review): std::move is declared in <utility>, which this header
            // does not include directly.
    122     friend class UniLib;
    123     explicit RegexPattern(std::unique_ptr<icu::RegexPattern> pattern)
    124         : pattern_(std::move(pattern)) {}
    125 
    126    private:
    127     std::unique_ptr<icu::RegexPattern> pattern_;
    128   };
    129 
          // Iterator over break positions in a text; holds an icu::BreakIterator.
    130   class BreakIterator {
    131    public:
            // Presumably advances to the next break and returns its index, or kDone
            // when there are no more breaks -- TODO confirm in the implementation.
    132     int Next();
    133 
    134     static constexpr int kDone = -1;
    135 
    136    protected:
            // Protected constructor with UniLib as a friend: instances are obtained
            // through UniLib::CreateBreakIterator().
    137     friend class UniLib;
    138     explicit BreakIterator(const UnicodeText& text);
    139 
    140    private:
    141     std::unique_ptr<icu::BreakIterator> break_iterator_;
    142     icu::UnicodeString text_;
            // NOTE(review): presumably track the last position in ICU units and in
            // codepoints respectively -- confirm in the implementation.
    143     int last_break_index_;
    144     int last_unicode_index_;
    145   };
    146 
          // Factory methods for the nested wrapper classes, whose constructors are
          // not publicly accessible.
          // NOTE(review): whether these can return null (e.g. for an invalid regex)
          // is not visible in this header -- confirm before dereferencing.
    147   std::unique_ptr<RegexPattern> CreateRegexPattern(
    148       const UnicodeText& regex) const;
    149   std::unique_ptr<BreakIterator> CreateBreakIterator(
    150       const UnicodeText& text) const;
    151 };
    152 
    153 }  // namespace libtextclassifier2
    154 
    155 #endif  // LIBTEXTCLASSIFIER_UTIL_UTF8_UNILIB_ICU_H_
    156