Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 // An implementation of Unilib that uses Android Java interfaces via JNI. The
     18 // performance critical ops have been re-implemented in C++.
     19 // Specifically, this class must be compatible with API level 14 (ICS).
     20 
     21 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
     22 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
     23 
     24 #include <jni.h>
     25 #include <memory>
     26 #include <mutex>  // NOLINT
     27 #include <string>
     28 
     29 #include "utils/base/integral_types.h"
     30 #include "utils/java/jni-cache.h"
     31 #include "utils/java/scoped_global_ref.h"
     32 #include "utils/java/scoped_local_ref.h"
     33 #include "utils/java/string_utils.h"
     34 #include "utils/utf8/unicodetext.h"
     35 
     36 namespace libtextclassifier3 {
     37 
     38 class UniLib {
     39  public:
     40   UniLib();
     41   explicit UniLib(const std::shared_ptr<JniCache>& jni_cache);
     42 
     43   bool ParseInt32(const UnicodeText& text, int* result) const;
     44   bool IsOpeningBracket(char32 codepoint) const;
     45   bool IsClosingBracket(char32 codepoint) const;
     46   bool IsWhitespace(char32 codepoint) const;
     47   bool IsDigit(char32 codepoint) const;
     48   bool IsUpper(char32 codepoint) const;
     49 
     50   char32 ToLower(char32 codepoint) const;
     51   char32 GetPairedBracket(char32 codepoint) const;
     52 
     53   // Forward declaration for friend.
     54   class RegexPattern;
     55 
     56   class RegexMatcher {
     57    public:
     58     static constexpr int kError = -1;
     59     static constexpr int kNoError = 0;
     60 
     61     // Checks whether the input text matches the pattern exactly.
     62     bool Matches(int* status) const;
     63 
     64     // Approximate Matches() implementation implemented using Find(). It uses
     65     // the first Find() result and then checks that it spans the whole input.
     66     // NOTE: Unlike Matches() it can result in false negatives.
     67     // NOTE: Resets the matcher, so the current Find() state will be lost.
     68     bool ApproximatelyMatches(int* status);
     69 
     70     // Finds occurrences of the pattern in the input text.
     71     // Can be called repeatedly to find all occurences. A call will update
     72     // internal state, so that 'Start', 'End' and 'Group' can be called to get
     73     // information about the match.
     74     // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
     75     // modify the state.
     76     bool Find(int* status);
     77 
     78     // Gets the start offset of the last match (from  'Find').
     79     // Sets status to 'kError' if 'Find'
     80     // was not called previously.
     81     int Start(int* status) const;
     82 
     83     // Gets the start offset of the specified group of the last match.
     84     // (from  'Find').
     85     // Sets status to 'kError' if an invalid group was specified or if 'Find'
     86     // was not called previously.
     87     int Start(int group_idx, int* status) const;
     88 
     89     // Gets the end offset of the last match (from  'Find').
     90     // Sets status to 'kError' if 'Find'
     91     // was not called previously.
     92     int End(int* status) const;
     93 
     94     // Gets the end offset of the specified group of the last match.
     95     // (from  'Find').
     96     // Sets status to 'kError' if an invalid group was specified or if 'Find'
     97     // was not called previously.
     98     int End(int group_idx, int* status) const;
     99 
    100     // Gets the text of the last match (from 'Find').
    101     // Sets status to 'kError' if 'Find' was not called previously.
    102     UnicodeText Group(int* status) const;
    103 
    104     // Gets the text of the specified group of the last match (from 'Find').
    105     // Sets status to 'kError' if an invalid group was specified or if 'Find'
    106     // was not called previously.
    107     UnicodeText Group(int group_idx, int* status) const;
    108 
    109     // Returns the matched text (the 0th capturing group).
    110     std::string Text() const {
    111       ScopedStringChars text_str =
    112           GetScopedStringChars(jni_cache_->GetEnv(), text_.get());
    113       return text_str.get();
    114     }
    115 
    116    private:
    117     friend class RegexPattern;
    118     RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
    119                  ScopedGlobalRef<jstring> text);
    120     bool UpdateLastFindOffset() const;
    121 
    122     const JniCache* jni_cache_;
    123     ScopedGlobalRef<jobject> matcher_;
    124     ScopedGlobalRef<jstring> text_;
    125     mutable int last_find_offset_ = 0;
    126     mutable int last_find_offset_codepoints_ = 0;
    127     mutable bool last_find_offset_dirty_ = true;
    128   };
    129 
    130   class RegexPattern {
    131    public:
    132     std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;
    133 
    134    private:
    135     friend class UniLib;
    136     RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern,
    137                  bool lazy);
    138     void LockedInitializeIfNotAlready() const;
    139 
    140     const JniCache* jni_cache_;
    141 
    142     // These members need to be mutable because of the lazy initialization.
    143     // NOTE: The Matcher method first ensures (using a lock) that the
    144     // initialization was attempted (by using LockedInitializeIfNotAlready) and
    145     // then can access them without locking.
    146     mutable std::mutex mutex_;
    147     mutable ScopedGlobalRef<jobject> pattern_;
    148     mutable bool initialized_;
    149     mutable bool initialization_failure_;
    150     mutable UnicodeText pattern_text_;
    151   };
    152 
    153   class BreakIterator {
    154    public:
    155     int Next();
    156 
    157     static constexpr int kDone = -1;
    158 
    159    private:
    160     friend class UniLib;
    161     BreakIterator(const JniCache* jni_cache, const UnicodeText& text);
    162 
    163     const JniCache* jni_cache_;
    164     ScopedGlobalRef<jstring> text_;
    165     ScopedGlobalRef<jobject> iterator_;
    166     int last_break_index_;
    167     int last_unicode_index_;
    168   };
    169 
    170   std::unique_ptr<RegexPattern> CreateRegexPattern(
    171       const UnicodeText& regex) const;
    172   std::unique_ptr<RegexPattern> CreateLazyRegexPattern(
    173       const UnicodeText& regex) const;
    174   std::unique_ptr<BreakIterator> CreateBreakIterator(
    175       const UnicodeText& text) const;
    176 
    177  private:
    178   std::shared_ptr<JniCache> jni_cache_;
    179 };
    180 
    181 }  // namespace libtextclassifier3
    182 
    183 #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
    184