1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "utils/utf8/unilib-javaicu.h" 18 19 #include <algorithm> 20 #include <cassert> 21 #include <cctype> 22 #include <map> 23 24 #include "utils/java/string_utils.h" 25 26 namespace libtextclassifier3 { 27 namespace { 28 29 // ----------------------------------------------------------------------------- 30 // Native implementations. 31 // ----------------------------------------------------------------------------- 32 33 #define ARRAYSIZE(a) sizeof(a) / sizeof(*a) 34 35 // Derived from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 36 // grep -E "Ps" UnicodeData.txt | \ 37 // sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p" 38 // IMPORTANT: entries with the same offsets in kOpeningBrackets and 39 // kClosingBrackets must be counterparts. 40 constexpr char32 kOpeningBrackets[] = { 41 0x0028, 0x005B, 0x007B, 0x0F3C, 0x2045, 0x207D, 0x208D, 0x2329, 0x2768, 42 0x276A, 0x276C, 0x2770, 0x2772, 0x2774, 0x27E6, 0x27E8, 0x27EA, 0x27EC, 43 0x27EE, 0x2983, 0x2985, 0x2987, 0x2989, 0x298B, 0x298D, 0x298F, 0x2991, 44 0x2993, 0x2995, 0x2997, 0x29FC, 0x2E22, 0x2E24, 0x2E26, 0x2E28, 0x3008, 45 0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x3018, 0x301A, 0xFD3F, 46 0xFE17, 0xFE35, 0xFE37, 0xFE39, 0xFE3B, 0xFE3D, 0xFE3F, 0xFE41, 0xFE43, 47 0xFE47, 0xFE59, 0xFE5B, 0xFE5D, 0xFF08, 0xFF3B, 0xFF5B, 0xFF5F, 0xFF62}; 48 constexpr int kNumOpeningBrackets = ARRAYSIZE(kOpeningBrackets); 49 50 // grep -E "Pe" UnicodeData.txt | \ 51 // sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p" 52 constexpr char32 kClosingBrackets[] = { 53 0x0029, 0x005D, 0x007D, 0x0F3D, 0x2046, 0x207E, 0x208E, 0x232A, 0x2769, 54 0x276B, 0x276D, 0x2771, 0x2773, 0x2775, 0x27E7, 0x27E9, 0x27EB, 0x27ED, 55 0x27EF, 0x2984, 0x2986, 0x2988, 0x298A, 0x298C, 0x298E, 0x2990, 0x2992, 56 0x2994, 0x2996, 0x2998, 0x29FD, 0x2E23, 0x2E25, 0x2E27, 0x2E29, 0x3009, 57 0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x3019, 0x301B, 0xFD3E, 58 0xFE18, 0xFE36, 0xFE38, 0xFE3A, 0xFE3C, 0xFE3E, 0xFE40, 0xFE42, 0xFE44, 59 0xFE48, 0xFE5A, 0xFE5C, 0xFE5E, 0xFF09, 0xFF3D, 0xFF5D, 0xFF60, 0xFF63}; 60 constexpr int kNumClosingBrackets = ARRAYSIZE(kClosingBrackets); 61 62 // grep -E "WS" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /" 63 constexpr char32 kWhitespaces[] = { 64 0x000C, 0x0020, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 65 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x205F, 66 0x21C7, 0x21C8, 0x21C9, 0x21CA, 0x21F6, 0x2B31, 0x2B84, 0x2B85, 67 0x2B86, 0x2B87, 0x2B94, 0x3000, 0x4DCC, 0x10344, 0x10347, 0x1DA0A, 68 0x1DA0B, 0x1DA0C, 0x1DA0D, 0x1DA0E, 0x1DA0F, 0x1DA10, 0x1F4F0, 0x1F500, 69 0x1F501, 0x1F502, 0x1F503, 0x1F504, 0x1F5D8, 0x1F5DE}; 70 constexpr int kNumWhitespaces = ARRAYSIZE(kWhitespaces); 71 72 // grep -E "Nd" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /" 73 // As the name suggests, these ranges are always 10 codepoints long, so we just 74 // store the end of the range. 75 constexpr char32 kDecimalDigitRangesEnd[] = { 76 0x0039, 0x0669, 0x06f9, 0x07c9, 0x096f, 0x09ef, 0x0a6f, 0x0aef, 77 0x0b6f, 0x0bef, 0x0c6f, 0x0cef, 0x0d6f, 0x0def, 0x0e59, 0x0ed9, 78 0x0f29, 0x1049, 0x1099, 0x17e9, 0x1819, 0x194f, 0x19d9, 0x1a89, 79 0x1a99, 0x1b59, 0x1bb9, 0x1c49, 0x1c59, 0xa629, 0xa8d9, 0xa909, 80 0xa9d9, 0xa9f9, 0xaa59, 0xabf9, 0xff19, 0x104a9, 0x1106f, 0x110f9, 81 0x1113f, 0x111d9, 0x112f9, 0x11459, 0x114d9, 0x11659, 0x116c9, 0x11739, 82 0x118e9, 0x11c59, 0x11d59, 0x16a69, 0x16b59, 0x1d7ff}; 83 constexpr int kNumDecimalDigitRangesEnd = ARRAYSIZE(kDecimalDigitRangesEnd); 84 85 // grep -E "Lu" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /" 86 // There are three common ways in which upper/lower case codepoint ranges 87 // were introduced: one offs, dense ranges, and ranges that alternate between 88 // lower and upper case. For the sake of keeping out binary size down, we 89 // treat each independently. 90 constexpr char32 kUpperSingles[] = { 91 0x01b8, 0x01bc, 0x01c4, 0x01c7, 0x01ca, 0x01f1, 0x0376, 0x037f, 92 0x03cf, 0x03f4, 0x03fa, 0x10c7, 0x10cd, 0x2102, 0x2107, 0x2115, 93 0x2145, 0x2183, 0x2c72, 0x2c75, 0x2cf2, 0xa7b6}; 94 constexpr int kNumUpperSingles = ARRAYSIZE(kUpperSingles); 95 constexpr char32 kUpperRanges1Start[] = { 96 0x0041, 0x00c0, 0x00d8, 0x0181, 0x018a, 0x018e, 0x0193, 0x0196, 97 0x019c, 0x019f, 0x01b2, 0x01f7, 0x023a, 0x023d, 0x0244, 0x0389, 98 0x0392, 0x03a3, 0x03d2, 0x03fd, 0x0531, 0x10a0, 0x13a0, 0x1f08, 99 0x1f18, 0x1f28, 0x1f38, 0x1f48, 0x1f68, 0x1fb8, 0x1fc8, 0x1fd8, 100 0x1fe8, 0x1ff8, 0x210b, 0x2110, 0x2119, 0x212b, 0x2130, 0x213e, 101 0x2c00, 0x2c63, 0x2c6e, 0x2c7e, 0xa7ab, 0xa7b0}; 102 constexpr int kNumUpperRanges1Start = ARRAYSIZE(kUpperRanges1Start); 103 constexpr char32 kUpperRanges1End[] = { 104 0x005a, 0x00d6, 0x00de, 0x0182, 0x018b, 0x0191, 0x0194, 0x0198, 105 0x019d, 0x01a0, 0x01b3, 0x01f8, 0x023b, 0x023e, 0x0246, 0x038a, 106 0x03a1, 0x03ab, 0x03d4, 0x042f, 0x0556, 0x10c5, 0x13f5, 0x1f0f, 107 0x1f1d, 0x1f2f, 0x1f3f, 0x1f4d, 0x1f6f, 0x1fbb, 0x1fcb, 0x1fdb, 108 0x1fec, 0x1ffb, 0x210d, 0x2112, 0x211d, 0x212d, 0x2133, 0x213f, 109 0x2c2e, 0x2c64, 0x2c70, 0x2c80, 0xa7ae, 0xa7b4}; 110 constexpr int kNumUpperRanges1End = ARRAYSIZE(kUpperRanges1End); 111 constexpr char32 kUpperRanges2Start[] = { 112 0x0100, 0x0139, 0x014a, 0x0179, 0x0184, 0x0187, 0x01a2, 0x01a7, 0x01ac, 113 0x01af, 0x01b5, 0x01cd, 0x01de, 0x01f4, 0x01fa, 0x0241, 0x0248, 0x0370, 114 0x0386, 0x038c, 0x038f, 0x03d8, 0x03f7, 0x0460, 0x048a, 0x04c1, 0x04d0, 115 0x1e00, 0x1e9e, 0x1f59, 0x2124, 0x2c60, 0x2c67, 0x2c82, 0x2ceb, 0xa640, 116 0xa680, 0xa722, 0xa732, 0xa779, 0xa77e, 0xa78b, 0xa790, 0xa796}; 117 constexpr int kNumUpperRanges2Start = ARRAYSIZE(kUpperRanges2Start); 118 constexpr char32 kUpperRanges2End[] = { 119 0x0136, 0x0147, 0x0178, 0x017d, 0x0186, 0x0189, 0x01a6, 0x01a9, 0x01ae, 120 0x01b1, 0x01b7, 0x01db, 0x01ee, 0x01f6, 0x0232, 0x0243, 0x024e, 0x0372, 121 0x0388, 0x038e, 0x0391, 0x03ee, 0x03f9, 0x0480, 0x04c0, 0x04cd, 0x052e, 122 0x1e94, 0x1efe, 0x1f5f, 0x212a, 0x2c62, 0x2c6d, 0x2ce2, 0x2ced, 0xa66c, 123 0xa69a, 0xa72e, 0xa76e, 0xa77d, 0xa786, 0xa78d, 0xa792, 0xa7aa}; 124 constexpr int kNumUpperRanges2End = ARRAYSIZE(kUpperRanges2End); 125 126 // grep -E "Lu" UnicodeData.txt | \ 127 // sed -rne "s/^([0-9A-Z]+);.*;([0-9A-Z]+);$/(0x\1, 0x\2), /p" 128 // We have two strategies for mapping from upper to lower case. We have single 129 // character lookups that do not follow a pattern, and ranges for which there 130 // is a constant codepoint shift. 131 // Note that these ranges ignore anything that's not an upper case character, 132 // so when applied to a non-uppercase character the result is incorrect. 133 constexpr int kToLowerSingles[] = { 134 0x0130, 0x0178, 0x0181, 0x0186, 0x018b, 0x018e, 0x018f, 0x0190, 0x0191, 135 0x0194, 0x0196, 0x0197, 0x0198, 0x019c, 0x019d, 0x019f, 0x01a6, 0x01a9, 136 0x01ae, 0x01b7, 0x01f6, 0x01f7, 0x0220, 0x023a, 0x023d, 0x023e, 0x0243, 137 0x0244, 0x0245, 0x037f, 0x0386, 0x038c, 0x03cf, 0x03f4, 0x03f9, 0x04c0, 138 0x1e9e, 0x1fec, 0x2126, 0x212a, 0x212b, 0x2132, 0x2183, 0x2c60, 0x2c62, 139 0x2c63, 0x2c64, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, 0xa77d, 0xa78d, 0xa7aa, 140 0xa7ab, 0xa7ac, 0xa7ad, 0xa7ae, 0xa7b0, 0xa7b1, 0xa7b2, 0xa7b3}; 141 constexpr int kNumToLowerSingles = ARRAYSIZE(kToLowerSingles); 142 constexpr int kToLowerSinglesOffsets[] = { 143 -199, -121, 210, 206, 1, 79, 202, 203, 1, 144 207, 211, 209, 1, 211, 213, 214, 218, 218, 145 218, 219, -97, -56, -130, 10795, -163, 10792, -195, 146 69, 71, 116, 38, 64, 8, -60, -7, 15, 147 -7615, -7, -7517, -8383, -8262, 28, 1, 1, -10743, 148 -3814, -10727, -10780, -10749, -10783, -10782, -35332, -42280, -42308, 149 -42319, -42315, -42305, -42308, -42258, -42282, -42261, 928}; 150 constexpr int kNumToLowerSinglesOffsets = ARRAYSIZE(kToLowerSinglesOffsets); 151 constexpr int kToLowerRangesStart[] = { 152 0x0041, 0x0100, 0x0189, 0x01a0, 0x01b1, 0x01b3, 0x0388, 0x038e, 0x0391, 153 0x03d8, 0x03fd, 0x0400, 0x0410, 0x0460, 0x0531, 0x10a0, 0x13a0, 0x13f0, 154 0x1e00, 0x1f08, 0x1fba, 0x1fc8, 0x1fd8, 0x1fda, 0x1fe8, 0x1fea, 0x1ff8, 155 0x1ffa, 0x2c00, 0x2c67, 0x2c7e, 0x2c80, 0xff21, 0x10400, 0x10c80, 0x118a0}; 156 constexpr int kNumToLowerRangesStart = ARRAYSIZE(kToLowerRangesStart); 157 constexpr int kToLowerRangesEnd[] = { 158 0x00de, 0x0187, 0x019f, 0x01af, 0x01b2, 0x0386, 0x038c, 0x038f, 0x03cf, 159 0x03fa, 0x03ff, 0x040f, 0x042f, 0x052e, 0x0556, 0x10cd, 0x13ef, 0x13f5, 160 0x1efe, 0x1fb9, 0x1fbb, 0x1fcb, 0x1fd9, 0x1fdb, 0x1fe9, 0x1fec, 0x1ff9, 161 0x2183, 0x2c64, 0x2c75, 0x2c7f, 0xa7b6, 0xff3a, 0x104d3, 0x10cb2, 0x118bf}; 162 constexpr int kNumToLowerRangesEnd = ARRAYSIZE(kToLowerRangesEnd); 163 constexpr int kToLowerRangesOffsets[] = { 164 32, 1, 205, 1, 217, 1, 37, 63, 32, 1, -130, 80, 165 32, 1, 48, 7264, 38864, 8, 1, -8, -74, -86, -8, -100, 166 -8, -112, -128, -126, 48, 1, -10815, 1, 32, 40, 64, 32}; 167 constexpr int kNumToLowerRangesOffsets = ARRAYSIZE(kToLowerRangesOffsets); 168 169 #undef ARRAYSIZE 170 171 static_assert(kNumOpeningBrackets == kNumClosingBrackets, 172 "mismatching number of opening and closing brackets"); 173 static_assert(kNumUpperRanges1Start == kNumUpperRanges1End, 174 "number of uppercase stride 1 range starts/ends doesn't match"); 175 static_assert(kNumUpperRanges2Start == kNumUpperRanges2End, 176 "number of uppercase stride 2 range starts/ends doesn't match"); 177 static_assert(kNumToLowerSingles == kNumToLowerSinglesOffsets, 178 "number of to lower singles and offsets doesn't match"); 179 static_assert(kNumToLowerRangesStart == kNumToLowerRangesEnd, 180 "mismatching number of range starts/ends for to lower ranges"); 181 static_assert(kNumToLowerRangesStart == kNumToLowerRangesOffsets, 182 "number of to lower ranges and offsets doesn't match"); 183 184 constexpr int kNoMatch = -1; 185 186 // Returns the index of the element in the array that matched the given 187 // codepoint, or kNoMatch if the element didn't exist. 188 // The input array must be in sorted order. 189 int GetMatchIndex(const char32* array, int array_length, char32 c) { 190 const char32* end = array + array_length; 191 const auto find_it = std::lower_bound(array, end, c); 192 if (find_it != end && *find_it == c) { 193 return find_it - array; 194 } else { 195 return kNoMatch; 196 } 197 } 198 199 // Returns the index of the range in the array that overlapped the given 200 // codepoint, or kNoMatch if no such range existed. 201 // The input array must be in sorted order. 202 int GetOverlappingRangeIndex(const char32* arr, int arr_length, 203 int range_length, char32 c) { 204 const char32* end = arr + arr_length; 205 const auto find_it = std::lower_bound(arr, end, c); 206 if (find_it == end) { 207 return kNoMatch; 208 } 209 // The end is inclusive, we so subtract one less than the range length. 210 const char32 range_end = *find_it; 211 const char32 range_start = range_end - (range_length - 1); 212 if (c < range_start || range_end < c) { 213 return kNoMatch; 214 } else { 215 return find_it - arr; 216 } 217 } 218 219 // As above, but with explicit codepoint start and end indices for the range. 220 // The input array must be in sorted order. 221 int GetOverlappingRangeIndex(const char32* start_arr, const char32* end_arr, 222 int arr_length, int stride, char32 c) { 223 const char32* end_arr_end = end_arr + arr_length; 224 const auto find_it = std::lower_bound(end_arr, end_arr_end, c); 225 if (find_it == end_arr_end) { 226 return kNoMatch; 227 } 228 // Find the corresponding start. 229 const int range_index = find_it - end_arr; 230 const char32 range_start = start_arr[range_index]; 231 const char32 range_end = *find_it; 232 if (c < range_start || range_end < c) { 233 return kNoMatch; 234 } 235 if ((c - range_start) % stride == 0) { 236 return range_index; 237 } else { 238 return kNoMatch; 239 } 240 } 241 242 } // anonymous namespace 243 244 UniLib::UniLib() { 245 TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache."; 246 } 247 248 UniLib::UniLib(const std::shared_ptr<JniCache>& jni_cache) 249 : jni_cache_(jni_cache) {} 250 251 bool UniLib::IsOpeningBracket(char32 codepoint) const { 252 return GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint) >= 0; 253 } 254 255 bool UniLib::IsClosingBracket(char32 codepoint) const { 256 return GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint) >= 0; 257 } 258 259 bool UniLib::IsWhitespace(char32 codepoint) const { 260 return GetMatchIndex(kWhitespaces, kNumWhitespaces, codepoint) >= 0; 261 } 262 263 bool UniLib::IsDigit(char32 codepoint) const { 264 return GetOverlappingRangeIndex(kDecimalDigitRangesEnd, 265 kNumDecimalDigitRangesEnd, 266 /*range_length=*/10, codepoint) >= 0; 267 } 268 269 bool UniLib::IsUpper(char32 codepoint) const { 270 if (GetMatchIndex(kUpperSingles, kNumUpperSingles, codepoint) >= 0) { 271 return true; 272 } else if (GetOverlappingRangeIndex(kUpperRanges1Start, kUpperRanges1End, 273 kNumUpperRanges1Start, /*stride=*/1, 274 codepoint) >= 0) { 275 return true; 276 } else if (GetOverlappingRangeIndex(kUpperRanges2Start, kUpperRanges2End, 277 kNumUpperRanges2Start, /*stride=*/2, 278 codepoint) >= 0) { 279 return true; 280 } else { 281 return false; 282 } 283 } 284 285 char32 UniLib::ToLower(char32 codepoint) const { 286 // Make sure we still produce output even if the method is called for a 287 // codepoint that's not an uppercase character. 288 if (!IsUpper(codepoint)) { 289 return codepoint; 290 } 291 const int singles_idx = 292 GetMatchIndex(kToLowerSingles, kNumToLowerSingles, codepoint); 293 if (singles_idx >= 0) { 294 return codepoint + kToLowerSinglesOffsets[singles_idx]; 295 } 296 const int ranges_idx = 297 GetOverlappingRangeIndex(kToLowerRangesStart, kToLowerRangesEnd, 298 kNumToLowerRangesStart, /*stride=*/1, codepoint); 299 if (ranges_idx >= 0) { 300 return codepoint + kToLowerRangesOffsets[ranges_idx]; 301 } 302 return codepoint; 303 } 304 305 char32 UniLib::GetPairedBracket(char32 codepoint) const { 306 const int open_offset = 307 GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint); 308 if (open_offset >= 0) { 309 return kClosingBrackets[open_offset]; 310 } 311 const int close_offset = 312 GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint); 313 if (close_offset >= 0) { 314 return kOpeningBrackets[close_offset]; 315 } 316 return codepoint; 317 } 318 319 // ----------------------------------------------------------------------------- 320 // Implementations that call out to JVM. Behold the beauty. 321 // ----------------------------------------------------------------------------- 322 323 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const { 324 if (jni_cache_) { 325 JNIEnv* env = jni_cache_->GetEnv(); 326 const ScopedLocalRef<jstring> text_java = 327 jni_cache_->ConvertToJavaString(text); 328 jint res = env->CallStaticIntMethod(jni_cache_->integer_class.get(), 329 jni_cache_->integer_parse_int, 330 text_java.get()); 331 if (jni_cache_->ExceptionCheckAndClear()) { 332 return false; 333 } 334 *result = res; 335 return true; 336 } 337 return false; 338 } 339 340 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern( 341 const UnicodeText& regex) const { 342 return std::unique_ptr<UniLib::RegexPattern>( 343 new UniLib::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false)); 344 } 345 346 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateLazyRegexPattern( 347 const UnicodeText& regex) const { 348 return std::unique_ptr<UniLib::RegexPattern>( 349 new UniLib::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true)); 350 } 351 352 UniLib::RegexPattern::RegexPattern(const JniCache* jni_cache, 353 const UnicodeText& pattern, bool lazy) 354 : jni_cache_(jni_cache), 355 pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr), 356 initialized_(false), 357 initialization_failure_(false), 358 pattern_text_(pattern) { 359 if (!lazy) { 360 LockedInitializeIfNotAlready(); 361 } 362 } 363 364 void UniLib::RegexPattern::LockedInitializeIfNotAlready() const { 365 std::lock_guard<std::mutex> guard(mutex_); 366 if (initialized_ || initialization_failure_) { 367 return; 368 } 369 370 if (jni_cache_) { 371 JNIEnv* jenv = jni_cache_->GetEnv(); 372 const ScopedLocalRef<jstring> regex_java = 373 jni_cache_->ConvertToJavaString(pattern_text_); 374 pattern_ = MakeGlobalRef(jenv->CallStaticObjectMethod( 375 jni_cache_->pattern_class.get(), 376 jni_cache_->pattern_compile, regex_java.get()), 377 jenv, jni_cache_->jvm); 378 379 if (jni_cache_->ExceptionCheckAndClear() || pattern_ == nullptr) { 380 initialization_failure_ = true; 381 pattern_.reset(); 382 return; 383 } 384 385 initialized_ = true; 386 pattern_text_.clear(); // We don't need this anymore. 387 } 388 } 389 390 constexpr int UniLib::RegexMatcher::kError; 391 constexpr int UniLib::RegexMatcher::kNoError; 392 393 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher( 394 const UnicodeText& context) const { 395 LockedInitializeIfNotAlready(); // Possibly lazy initialization. 396 if (initialization_failure_) { 397 return nullptr; 398 } 399 400 if (jni_cache_) { 401 JNIEnv* env = jni_cache_->GetEnv(); 402 const jstring context_java = 403 jni_cache_->ConvertToJavaString(context).release(); 404 if (!context_java) { 405 return nullptr; 406 } 407 const jobject matcher = env->CallObjectMethod( 408 pattern_.get(), jni_cache_->pattern_matcher, context_java); 409 if (jni_cache_->ExceptionCheckAndClear() || !matcher) { 410 return nullptr; 411 } 412 return std::unique_ptr<UniLib::RegexMatcher>(new RegexMatcher( 413 jni_cache_, MakeGlobalRef(matcher, env, jni_cache_->jvm), 414 MakeGlobalRef(context_java, env, jni_cache_->jvm))); 415 } else { 416 // NOTE: A valid object needs to be created here to pass the interface 417 // tests. 418 return std::unique_ptr<UniLib::RegexMatcher>( 419 new RegexMatcher(jni_cache_, nullptr, nullptr)); 420 } 421 } 422 423 UniLib::RegexMatcher::RegexMatcher(const JniCache* jni_cache, 424 ScopedGlobalRef<jobject> matcher, 425 ScopedGlobalRef<jstring> text) 426 : jni_cache_(jni_cache), 427 matcher_(std::move(matcher)), 428 text_(std::move(text)) {} 429 430 bool UniLib::RegexMatcher::Matches(int* status) const { 431 if (jni_cache_) { 432 *status = kNoError; 433 const bool result = jni_cache_->GetEnv()->CallBooleanMethod( 434 matcher_.get(), jni_cache_->matcher_matches); 435 if (jni_cache_->ExceptionCheckAndClear()) { 436 *status = kError; 437 return false; 438 } 439 return result; 440 } else { 441 *status = kError; 442 return false; 443 } 444 } 445 446 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) { 447 *status = kNoError; 448 449 jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(), 450 jni_cache_->matcher_reset); 451 if (jni_cache_->ExceptionCheckAndClear()) { 452 *status = kError; 453 return kError; 454 } 455 456 if (!Find(status) || *status != kNoError) { 457 return false; 458 } 459 460 const int found_start = jni_cache_->GetEnv()->CallIntMethod( 461 matcher_.get(), jni_cache_->matcher_start_idx, 0); 462 if (jni_cache_->ExceptionCheckAndClear()) { 463 *status = kError; 464 return kError; 465 } 466 467 const int found_end = jni_cache_->GetEnv()->CallIntMethod( 468 matcher_.get(), jni_cache_->matcher_end_idx, 0); 469 if (jni_cache_->ExceptionCheckAndClear()) { 470 *status = kError; 471 return kError; 472 } 473 474 int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod( 475 text_.get(), jni_cache_->string_length); 476 if (jni_cache_->ExceptionCheckAndClear()) { 477 *status = kError; 478 return false; 479 } 480 481 if (found_start != 0 || found_end != context_length_bmp) { 482 return false; 483 } 484 485 return true; 486 } 487 488 bool UniLib::RegexMatcher::UpdateLastFindOffset() const { 489 if (!last_find_offset_dirty_) { 490 return true; 491 } 492 493 const int find_offset = jni_cache_->GetEnv()->CallIntMethod( 494 matcher_.get(), jni_cache_->matcher_start_idx, 0); 495 if (jni_cache_->ExceptionCheckAndClear()) { 496 return false; 497 } 498 499 const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod( 500 text_.get(), jni_cache_->string_code_point_count, last_find_offset_, 501 find_offset); 502 if (jni_cache_->ExceptionCheckAndClear()) { 503 return false; 504 } 505 506 last_find_offset_codepoints_ += codepoint_count; 507 last_find_offset_ = find_offset; 508 last_find_offset_dirty_ = false; 509 510 return true; 511 } 512 513 bool UniLib::RegexMatcher::Find(int* status) { 514 if (jni_cache_) { 515 const bool result = jni_cache_->GetEnv()->CallBooleanMethod( 516 matcher_.get(), jni_cache_->matcher_find); 517 if (jni_cache_->ExceptionCheckAndClear()) { 518 *status = kError; 519 return false; 520 } 521 522 last_find_offset_dirty_ = true; 523 *status = kNoError; 524 return result; 525 } else { 526 *status = kError; 527 return false; 528 } 529 } 530 531 int UniLib::RegexMatcher::Start(int* status) const { 532 return Start(/*group_idx=*/0, status); 533 } 534 535 int UniLib::RegexMatcher::Start(int group_idx, int* status) const { 536 if (jni_cache_) { 537 *status = kNoError; 538 539 if (!UpdateLastFindOffset()) { 540 *status = kError; 541 return kError; 542 } 543 544 const int java_index = jni_cache_->GetEnv()->CallIntMethod( 545 matcher_.get(), jni_cache_->matcher_start_idx, group_idx); 546 if (jni_cache_->ExceptionCheckAndClear()) { 547 *status = kError; 548 return kError; 549 } 550 551 // If the group didn't participate in the match the index is -1. 552 if (java_index == -1) { 553 return -1; 554 } 555 556 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod( 557 text_.get(), jni_cache_->string_code_point_count, last_find_offset_, 558 java_index); 559 if (jni_cache_->ExceptionCheckAndClear()) { 560 *status = kError; 561 return kError; 562 } 563 564 return unicode_index + last_find_offset_codepoints_; 565 } else { 566 *status = kError; 567 return kError; 568 } 569 } 570 571 int UniLib::RegexMatcher::End(int* status) const { 572 return End(/*group_idx=*/0, status); 573 } 574 575 int UniLib::RegexMatcher::End(int group_idx, int* status) const { 576 if (jni_cache_) { 577 *status = kNoError; 578 579 if (!UpdateLastFindOffset()) { 580 *status = kError; 581 return kError; 582 } 583 584 const int java_index = jni_cache_->GetEnv()->CallIntMethod( 585 matcher_.get(), jni_cache_->matcher_end_idx, group_idx); 586 if (jni_cache_->ExceptionCheckAndClear()) { 587 *status = kError; 588 return kError; 589 } 590 591 // If the group didn't participate in the match the index is -1. 592 if (java_index == -1) { 593 return -1; 594 } 595 596 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod( 597 text_.get(), jni_cache_->string_code_point_count, last_find_offset_, 598 java_index); 599 if (jni_cache_->ExceptionCheckAndClear()) { 600 *status = kError; 601 return kError; 602 } 603 604 return unicode_index + last_find_offset_codepoints_; 605 } else { 606 *status = kError; 607 return kError; 608 } 609 } 610 611 UnicodeText UniLib::RegexMatcher::Group(int* status) const { 612 if (jni_cache_) { 613 JNIEnv* jenv = jni_cache_->GetEnv(); 614 const ScopedLocalRef<jstring> java_result( 615 reinterpret_cast<jstring>( 616 jenv->CallObjectMethod(matcher_.get(), jni_cache_->matcher_group)), 617 jenv); 618 if (jni_cache_->ExceptionCheckAndClear() || !java_result) { 619 *status = kError; 620 return UTF8ToUnicodeText("", /*do_copy=*/false); 621 } 622 623 std::string result; 624 if (!JStringToUtf8String(jenv, java_result.get(), &result)) { 625 *status = kError; 626 return UTF8ToUnicodeText("", /*do_copy=*/false); 627 } 628 *status = kNoError; 629 return UTF8ToUnicodeText(result, /*do_copy=*/true); 630 } else { 631 *status = kError; 632 return UTF8ToUnicodeText("", /*do_copy=*/false); 633 } 634 } 635 636 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const { 637 if (jni_cache_) { 638 JNIEnv* jenv = jni_cache_->GetEnv(); 639 const ScopedLocalRef<jstring> java_result( 640 reinterpret_cast<jstring>(jenv->CallObjectMethod( 641 matcher_.get(), jni_cache_->matcher_group_idx, group_idx)), 642 jenv); 643 if (jni_cache_->ExceptionCheckAndClear()) { 644 *status = kError; 645 TC3_LOG(ERROR) << "Exception occurred"; 646 return UTF8ToUnicodeText("", /*do_copy=*/false); 647 } 648 649 // java_result is nullptr when the group did not participate in the match. 650 // For these cases other UniLib implementations return empty string, and 651 // the participation can be checked by checking if Start() == -1. 652 if (!java_result) { 653 *status = kNoError; 654 return UTF8ToUnicodeText("", /*do_copy=*/false); 655 } 656 657 std::string result; 658 if (!JStringToUtf8String(jenv, java_result.get(), &result)) { 659 *status = kError; 660 return UTF8ToUnicodeText("", /*do_copy=*/false); 661 } 662 *status = kNoError; 663 return UTF8ToUnicodeText(result, /*do_copy=*/true); 664 } else { 665 *status = kError; 666 return UTF8ToUnicodeText("", /*do_copy=*/false); 667 } 668 } 669 670 constexpr int UniLib::BreakIterator::kDone; 671 672 UniLib::BreakIterator::BreakIterator(const JniCache* jni_cache, 673 const UnicodeText& text) 674 : jni_cache_(jni_cache), 675 text_(nullptr, jni_cache ? jni_cache->jvm : nullptr), 676 iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr), 677 last_break_index_(0), 678 last_unicode_index_(0) { 679 if (jni_cache_) { 680 JNIEnv* jenv = jni_cache_->GetEnv(); 681 text_ = MakeGlobalRef(jni_cache_->ConvertToJavaString(text).release(), jenv, 682 jni_cache->jvm); 683 if (!text_) { 684 return; 685 } 686 687 iterator_ = MakeGlobalRef( 688 jenv->CallStaticObjectMethod(jni_cache->breakiterator_class.get(), 689 jni_cache->breakiterator_getwordinstance, 690 jni_cache->locale_us.get()), 691 jenv, jni_cache->jvm); 692 if (!iterator_) { 693 return; 694 } 695 jenv->CallVoidMethod(iterator_.get(), jni_cache->breakiterator_settext, 696 text_.get()); 697 } 698 } 699 700 int UniLib::BreakIterator::Next() { 701 if (jni_cache_) { 702 const int break_index = jni_cache_->GetEnv()->CallIntMethod( 703 iterator_.get(), jni_cache_->breakiterator_next); 704 if (jni_cache_->ExceptionCheckAndClear() || 705 break_index == BreakIterator::kDone) { 706 return BreakIterator::kDone; 707 } 708 709 const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod( 710 text_.get(), jni_cache_->string_code_point_count, last_break_index_, 711 break_index); 712 if (jni_cache_->ExceptionCheckAndClear()) { 713 return BreakIterator::kDone; 714 } 715 716 last_break_index_ = break_index; 717 return last_unicode_index_ += token_unicode_length; 718 } 719 return BreakIterator::kDone; 720 } 721 722 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator( 723 const UnicodeText& text) const { 724 return std::unique_ptr<UniLib::BreakIterator>( 725 new UniLib::BreakIterator(jni_cache_.get(), text)); 726 } 727 728 } // namespace libtextclassifier3 729