libtextclassifier/lang_id/script-detector.h

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_
#define LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

// Unicode scripts we care about.  To get compact and fast code, we detect only
// a few Unicode scripts that offer a strong indication about the language of
// the text (e.g., Hiragana -> Japanese).
enum Script {
  // Special value to indicate internal errors in the script detection code.
  kScriptError,

  // Special values for all Unicode scripts that we do not detect.  One special
  // value for Unicode characters of 1, 2, 3, respectively 4 bytes (as we
  // already have that information, we use it).  kScriptOtherUtf8OneByte means
  // ~Latin and kScriptOtherUtf8FourBytes means ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Do not add any script after kNumRelevantScripts.  This value indicates the
  // number of elements in this enum Script (except this value) such that we can
  // easily iterate over the scripts.
  kNumRelevantScripts,
};

template<typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  return (value >= low) && (value <= hi);
}

// Returns Script for the UTF8 character that starts at address p.
// Precondition: p points to a valid UTF8 character of num_bytes bytes.
inline Script GetScript(const unsigned char *p, int num_bytes) {
  switch (num_bytes) {
    case 1:
      return kScriptOtherUtf8OneByte;

    case 2: {
      // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
      // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
      // it's enough.  It's also usually the fastest int type on the current
      // CPU, so it's better to use than int32.
      static const unsigned int kGreekStart = 0x370;

      // Commented out (unsued in the code): kGreekEnd = 0x3FF;
      static const unsigned int kCyrillicStart = 0x400;
      static const unsigned int kCyrillicEnd = 0x4FF;
      static const unsigned int kHebrewStart = 0x590;

      // Commented out (unsued in the code): kHebrewEnd = 0x5FF;
      static const unsigned int kArabicStart = 0x600;
      static const unsigned int kArabicEnd = 0x6FF;
      const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
      if (codepoint > kCyrillicEnd) {
        if (codepoint >= kArabicStart) {
          if (codepoint <= kArabicEnd) {
            return kScriptArabic;
          }
        } else {
          // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
          // codepoint <= kHebrewEnd.
          if (codepoint >= kHebrewStart) {
            return kScriptHebrew;
          }
        }
      } else {
        if (codepoint >= kCyrillicStart) {
          return kScriptCyrillic;
        } else {
          // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
          // codepoint <= kGreekEnd.
          if (codepoint >= kGreekStart) {
            return kScriptGreek;
          }
        }
      }
      return kScriptOtherUtf8TwoBytes;
    }

    case 3: {
      // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
      // at least 16 bits.
      static const unsigned int kHangulJamoStart = 0x1100;
      static const unsigned int kHangulJamoEnd = 0x11FF;
      static const unsigned int kHiraganaStart = 0x3041;
      static const unsigned int kHiraganaEnd = 0x309F;

      // Commented out (unsued in the code): kKatakanaStart = 0x30A0;
      static const unsigned int kKatakanaEnd = 0x30FF;
      const unsigned int codepoint =
          ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
      if (codepoint > kHiraganaEnd) {
        // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
        // codepoint >= kKatakanaStart.
        if (codepoint <= kKatakanaEnd) {
          return kScriptKatakana;
        }
      } else {
        if (codepoint >= kHiraganaStart) {
          return kScriptHiragana;
        } else {
          if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
            return kScriptHangulJamo;
          }
        }
      }
      return kScriptOtherUtf8ThreeBytes;
    }

    case 4:
      return kScriptOtherUtf8FourBytes;

    default:
      return kScriptError;
  }
}

// Returns Script for the UTF8 character that starts at address p.  Similar to
// the previous version of GetScript, except for "char" vs "unsigned char".
// Most code works with "char *" pointers, ignoring the fact that char is
// unsigned (by default) on most platforms, but signed on iOS.  This code takes
// care of making sure we always treat chars as unsigned.
inline Script GetScript(const char *p, int num_bytes) {
  return GetScript(reinterpret_cast<const unsigned char *>(p),
                   num_bytes);
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier

#endif  // LIBTEXTCLASSIFIER_LANG_ID_SCRIPT_DETECTOR_H_