Home | History | Annotate | Download | only in lang_id
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
     18 #define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
     19 
     20 namespace libtextclassifier {
     21 namespace nlp_core {
     22 namespace lang_id {
     23 
     24 // Unicode scripts we care about.  To get compact and fast code, we detect only
     25 // a few Unicode scripts that offer a strong indication about the language of
     26 // the text (e.g., Hiragana -> Japanese).
     27 enum Script {
     28   // Special value to indicate internal errors in the script detection code.
     29   kScriptError,
     30 
     31   // Special values for all Unicode scripts that we do not detect.  One special
     32   // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
     33   // already have that information, we use it).  kScriptOtherUtf8OneByte means
     34   // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
     35   kScriptOtherUtf8OneByte,
     36   kScriptOtherUtf8TwoBytes,
     37   kScriptOtherUtf8ThreeBytes,
     38   kScriptOtherUtf8FourBytes,
     39 
     40   kScriptGreek,
     41   kScriptCyrillic,
     42   kScriptHebrew,
     43   kScriptArabic,
     44   kScriptHangulJamo,  // Used primarily for Korean.
     45   kScriptHiragana,    // Used primarily for Japanese.
     46   kScriptKatakana,    // Used primarily for Japanese.
     47 
     48   // Add new scripts here.
     49 
     50   // Do not add any script after kNumRelevantScripts.  This value indicates the
     51   // number of elements in this enum Script (except this value) such that we can
     52   // easily iterate over the scripts.
     53   kNumRelevantScripts,
     54 };
     55 
     56 template<typename IntType>
     57 inline bool InRange(IntType value, IntType low, IntType hi) {
     58   return (value >= low) && (value <= hi);
     59 }
     60 
     61 // Returns Script for the UTF8 character that starts at address p.
     62 // Precondition: p points to a valid UTF8 character of num_bytes bytes.
     63 inline Script GetScript(const unsigned char *p, int num_bytes) {
     64   switch (num_bytes) {
     65     case 1:
     66       return kScriptOtherUtf8OneByte;
     67 
     68     case 2: {
     69       // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
     70       // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
     71       // it's enough.  It's also usually the fastest int type on the current
     72       // CPU, so it's better to use than int32.
     73       static const unsigned int kGreekStart = 0x370;
     74 
     75       // Commented out (unsued in the code): kGreekEnd = 0x3FF;
     76       static const unsigned int kCyrillicStart = 0x400;
     77       static const unsigned int kCyrillicEnd = 0x4FF;
     78       static const unsigned int kHebrewStart = 0x590;
     79 
     80       // Commented out (unsued in the code): kHebrewEnd = 0x5FF;
     81       static const unsigned int kArabicStart = 0x600;
     82       static const unsigned int kArabicEnd = 0x6FF;
     83       const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
     84       if (codepoint > kCyrillicEnd) {
     85         if (codepoint >= kArabicStart) {
     86           if (codepoint <= kArabicEnd) {
     87             return kScriptArabic;
     88           }
     89         } else {
     90           // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
     91           // codepoint <= kHebrewEnd.
     92           if (codepoint >= kHebrewStart) {
     93             return kScriptHebrew;
     94           }
     95         }
     96       } else {
     97         if (codepoint >= kCyrillicStart) {
     98           return kScriptCyrillic;
     99         } else {
    100           // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
    101           // codepoint <= kGreekEnd.
    102           if (codepoint >= kGreekStart) {
    103             return kScriptGreek;
    104           }
    105         }
    106       }
    107       return kScriptOtherUtf8TwoBytes;
    108     }
    109 
    110     case 3: {
    111       // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
    112       // at least 16 bits.
    113       static const unsigned int kHangulJamoStart = 0x1100;
    114       static const unsigned int kHangulJamoEnd = 0x11FF;
    115       static const unsigned int kHiraganaStart = 0x3041;
    116       static const unsigned int kHiraganaEnd = 0x309F;
    117 
    118       // Commented out (unsued in the code): kKatakanaStart = 0x30A0;
    119       static const unsigned int kKatakanaEnd = 0x30FF;
    120       const unsigned int codepoint =
    121           ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
    122       if (codepoint > kHiraganaEnd) {
    123         // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
    124         // codepoint >= kKatakanaStart.
    125         if (codepoint <= kKatakanaEnd) {
    126           return kScriptKatakana;
    127         }
    128       } else {
    129         if (codepoint >= kHiraganaStart) {
    130           return kScriptHiragana;
    131         } else {
    132           if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
    133             return kScriptHangulJamo;
    134           }
    135         }
    136       }
    137       return kScriptOtherUtf8ThreeBytes;
    138     }
    139 
    140     case 4:
    141       return kScriptOtherUtf8FourBytes;
    142 
    143     default:
    144       return kScriptError;
    145   }
    146 }
    147 
    148 // Returns Script for the UTF8 character that starts at address p.  Similar to
    149 // the previous version of GetScript, except for "char" vs "unsigned char".
    150 // Most code works with "char *" pointers, ignoring the fact that char is
    151 // unsigned (by default) on most platforms, but signed on iOS.  This code takes
    152 // care of making sure we always treat chars as unsigned.
    153 inline Script GetScript(const char *p, int num_bytes) {
    154   return GetScript(reinterpret_cast<const unsigned char *>(p),
    155                    num_bytes);
    156 }
    157 
    158 }  // namespace lang_id
    159 }  // namespace nlp_core
    160 }  // namespace libtextclassifier
    161 
    162 #endif  // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
    163