Home | History | Annotate | Download | only in script
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
     18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
     19 
     20 #include "lang_id/script/script-detector.h"
     21 
     22 namespace libtextclassifier3 {
     23 namespace mobile {
     24 namespace lang_id {
     25 
     26 // Unicode scripts we care about.  To get compact and fast code, we detect only
     27 // a few Unicode scripts that offer a strong indication about the language of
     28 // the text (e.g., Hiragana -> Japanese).
     29 enum Script {
     30   // Special value to indicate internal errors in the script detection code.
     31   kScriptError,
     32 
     33   // Special values for all Unicode scripts that we do not detect.  One special
     34   // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
     35   // already have that information, we use it).  kScriptOtherUtf8OneByte means
     36   // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
     37   kScriptOtherUtf8OneByte,
     38   kScriptOtherUtf8TwoBytes,
     39   kScriptOtherUtf8ThreeBytes,
     40   kScriptOtherUtf8FourBytes,
     41 
     42   kScriptGreek,
     43   kScriptCyrillic,
     44   kScriptHebrew,
     45   kScriptArabic,
     46   kScriptHangulJamo,  // Used primarily for Korean.
     47   kScriptHiragana,    // Used primarily for Japanese.
     48   kScriptKatakana,    // Used primarily for Japanese.
     49 
     50   // Add new scripts here.
     51 
     52   // Do not add any script after kNumRelevantScripts.  This value indicates the
     53   // number of elements in this enum Script (except this value) such that we can
     54   // easily iterate over the scripts.
     55   kNumRelevantScripts,
     56 };
     57 
     58 template<typename IntType>
     59 inline bool InRange(IntType value, IntType low, IntType hi) {
     60   return (value >= low) && (value <= hi);
     61 }
     62 
     63 // Returns Script for the UTF8 character that starts at address p.
     64 // Precondition: p points to a valid UTF8 character of num_bytes bytes.
     65 inline Script GetScript(const unsigned char *p, int num_bytes) {
     66   switch (num_bytes) {
     67     case 1:
     68       return kScriptOtherUtf8OneByte;
     69 
     70     case 2: {
     71       // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
     72       // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
     73       // it's enough.  It's also usually the fastest int type on the current
     74       // CPU, so it's better to use than int32.
     75       static const unsigned int kGreekStart = 0x370;
     76 
     77       // Commented out (unsued in the code): kGreekEnd = 0x3FF;
     78       static const unsigned int kCyrillicStart = 0x400;
     79       static const unsigned int kCyrillicEnd = 0x4FF;
     80       static const unsigned int kHebrewStart = 0x590;
     81 
     82       // Commented out (unsued in the code): kHebrewEnd = 0x5FF;
     83       static const unsigned int kArabicStart = 0x600;
     84       static const unsigned int kArabicEnd = 0x6FF;
     85       const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
     86       if (codepoint > kCyrillicEnd) {
     87         if (codepoint >= kArabicStart) {
     88           if (codepoint <= kArabicEnd) {
     89             return kScriptArabic;
     90           }
     91         } else {
     92           // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
     93           // codepoint <= kHebrewEnd.
     94           if (codepoint >= kHebrewStart) {
     95             return kScriptHebrew;
     96           }
     97         }
     98       } else {
     99         if (codepoint >= kCyrillicStart) {
    100           return kScriptCyrillic;
    101         } else {
    102           // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
    103           // codepoint <= kGreekEnd.
    104           if (codepoint >= kGreekStart) {
    105             return kScriptGreek;
    106           }
    107         }
    108       }
    109       return kScriptOtherUtf8TwoBytes;
    110     }
    111 
    112     case 3: {
    113       // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
    114       // at least 16 bits.
    115       static const unsigned int kHangulJamoStart = 0x1100;
    116       static const unsigned int kHangulJamoEnd = 0x11FF;
    117       static const unsigned int kHiraganaStart = 0x3041;
    118       static const unsigned int kHiraganaEnd = 0x309F;
    119 
    120       // Commented out (unsued in the code): kKatakanaStart = 0x30A0;
    121       static const unsigned int kKatakanaEnd = 0x30FF;
    122       const unsigned int codepoint =
    123           ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
    124       if (codepoint > kHiraganaEnd) {
    125         // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
    126         // codepoint >= kKatakanaStart.
    127         if (codepoint <= kKatakanaEnd) {
    128           return kScriptKatakana;
    129         }
    130       } else {
    131         if (codepoint >= kHiraganaStart) {
    132           return kScriptHiragana;
    133         } else {
    134           if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
    135             return kScriptHangulJamo;
    136           }
    137         }
    138       }
    139       return kScriptOtherUtf8ThreeBytes;
    140     }
    141 
    142     case 4:
    143       return kScriptOtherUtf8FourBytes;
    144 
    145     default:
    146       return kScriptError;
    147   }
    148 }
    149 
    150 // Returns Script for the UTF8 character that starts at address p.  Similar to
    151 // the previous version of GetScript, except for "char" vs "unsigned char".
    152 // Most code works with "char *" pointers, ignoring the fact that char is
    153 // unsigned (by default) on most platforms, but signed on iOS.  This code takes
    154 // care of making sure we always treat chars as unsigned.
    155 inline Script GetScript(const char *p, int num_bytes) {
    156   return GetScript(reinterpret_cast<const unsigned char *>(p),
    157                    num_bytes);
    158 }
    159 
    160 class TinyScriptDetector : public ScriptDetector {
    161  public:
    162   ~TinyScriptDetector() override = default;
    163 
    164   int GetScript(const char *s, int num_bytes) const override {
    165     // Add the namespace in indicate that we want to call the method outside
    166     // this class, instead of performing an infinite recursive call.
    167     return libtextclassifier3::mobile::lang_id::GetScript(s, num_bytes);
    168   }
    169 
    170   int GetMaxScript() const override {
    171     return kNumRelevantScripts - 1;
    172   }
    173 
    174   SAFTM_DEFINE_REGISTRATION_METHOD("tiny-script-detector", TinyScriptDetector);
    175 };
    176 
    177 }  // namespace lang_id
    178 }  // namespace mobile
}  // namespace libtextclassifier3
    180 
    181 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
    182