1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "lang_id/script/approx-script.h" 18 19 #include "lang_id/common/lite_base/integral-types.h" 20 #include "lang_id/common/lite_base/logging.h" 21 #include "lang_id/common/utf8.h" 22 #include "lang_id/script/approx-script-data.h" 23 24 namespace libtextclassifier3 { 25 namespace mobile { 26 27 // int value of USCRIPT_UNKNOWN from enum UScriptCode (from 28 // unicode/uscript.h). Note: we do have a test that 29 // USCRIPT_UNKNOWN evaluates to 103. 30 const int kUnknownUscript = 103; 31 32 namespace { 33 using approx_script_internal::kNumRanges; 34 using approx_script_internal::kRangeFirst; 35 using approx_script_internal::kRangeScript; 36 using approx_script_internal::kRangeSizeMinusOne; 37 38 uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) { 39 switch (num_bytes) { 40 case 1: 41 return s[0]; 42 case 2: 43 return ((s[0] & 0x1F) << 6) | (s[1] & 0x3F); 44 case 3: 45 return (((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F)); 46 case 4: 47 return (((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) | 48 ((s[2] & 0x3F) << 6) | (s[3] & 0x3F)); 49 default: 50 SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes; 51 return 0; 52 } 53 } 54 55 inline int BinarySearch(uint32 codepoint, int start, int end) { 56 while (end > start + 1) { 57 // Due to the while loop condition, middle > start and middle < end. Hence, 58 // on both branches of the if below, we strictly reduce the end - start 59 // value, so we eventually get that difference below 1 and complete the 60 // while loop. 61 int middle = (start + end) / 2; 62 if (codepoint < kRangeFirst[middle]) { 63 end = middle; 64 } else { 65 start = middle; 66 } 67 } 68 69 if (end == start + 1) { 70 const uint32 range_start = kRangeFirst[start]; 71 if ((codepoint >= range_start) && 72 (codepoint <= range_start + kRangeSizeMinusOne[start])) { 73 return kRangeScript[start]; 74 } 75 } 76 77 return kUnknownUscript; 78 } 79 } // namespace 80 81 int GetApproxScript(const unsigned char *s, int num_bytes) { 82 SAFTM_DCHECK_NE(s, nullptr); 83 SAFTM_DCHECK_EQ(num_bytes, 84 utils::OneCharLen(reinterpret_cast<const char *>(s))); 85 uint32 codepoint = Utf8ToCodepoint(s, num_bytes); 86 return BinarySearch(codepoint, 0, kNumRanges); 87 } 88 89 int GetMaxApproxScriptResult() { return approx_script_internal::kMaxScript; } 90 91 SAFTM_STATIC_REGISTRATION(ApproxScriptDetector); 92 93 } // namespace mobile 94 } // namespace nlp_saft 95