Home | History | Annotate | Download | only in script
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "lang_id/script/approx-script.h"
     18 
     19 #include "lang_id/common/lite_base/integral-types.h"
     20 #include "lang_id/common/lite_base/logging.h"
     21 #include "lang_id/common/utf8.h"
     22 #include "lang_id/script/approx-script-data.h"
     23 
     24 namespace libtextclassifier3 {
     25 namespace mobile {
     26 
     27 // int value of USCRIPT_UNKNOWN from enum UScriptCode (from
     28 // unicode/uscript.h).  Note: we do have a test that
     29 // USCRIPT_UNKNOWN evaluates to 103.
     30 const int kUnknownUscript = 103;
     31 
     32 namespace {
     33 using approx_script_internal::kNumRanges;
     34 using approx_script_internal::kRangeFirst;
     35 using approx_script_internal::kRangeScript;
     36 using approx_script_internal::kRangeSizeMinusOne;
     37 
     38 uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) {
     39   switch (num_bytes) {
     40     case 1:
     41       return s[0];
     42     case 2:
     43       return ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
     44     case 3:
     45       return (((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F));
     46     case 4:
     47       return (((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) |
     48               ((s[2] & 0x3F) << 6) | (s[3] & 0x3F));
     49     default:
     50       SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes;
     51       return 0;
     52   }
     53 }
     54 
     55 inline int BinarySearch(uint32 codepoint, int start, int end) {
     56   while (end > start + 1) {
     57     // Due to the while loop condition, middle > start and middle < end.  Hence,
     58     // on both branches of the if below, we strictly reduce the end - start
     59     // value, so we eventually get that difference below 1 and complete the
     60     // while loop.
     61     int middle = (start + end) / 2;
     62     if (codepoint < kRangeFirst[middle]) {
     63       end = middle;
     64     } else {
     65       start = middle;
     66     }
     67   }
     68 
     69   if (end == start + 1) {
     70     const uint32 range_start = kRangeFirst[start];
     71     if ((codepoint >= range_start) &&
     72         (codepoint <= range_start + kRangeSizeMinusOne[start])) {
     73       return kRangeScript[start];
     74     }
     75   }
     76 
     77   return kUnknownUscript;
     78 }
     79 }  // namespace
     80 
     81 int GetApproxScript(const unsigned char *s, int num_bytes) {
     82   SAFTM_DCHECK_NE(s, nullptr);
     83   SAFTM_DCHECK_EQ(num_bytes,
     84                   utils::OneCharLen(reinterpret_cast<const char *>(s)));
     85   uint32 codepoint = Utf8ToCodepoint(s, num_bytes);
     86   return BinarySearch(codepoint, 0, kNumRanges);
     87 }
     88 
     89 int GetMaxApproxScriptResult() { return approx_script_internal::kMaxScript; }
     90 
// Static registration of ApproxScriptDetector through the SAFTM registration
// macro.  NOTE(review): the class is not defined in this file (presumably it
// comes from approx-script.h) -- confirm.
SAFTM_STATIC_REGISTRATION(ApproxScriptDetector);
     92 
     93 }  // namespace mobile
}  // namespace libtextclassifier3
     95