Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_CHAR_UTILS_H
     18 #define LATINIME_CHAR_UTILS_H
     19 
     20 #include <cctype>
     21 #include <cstring>
     22 #include <vector>
     23 
     24 #include "defines.h"
     25 
     26 namespace latinime {
     27 
     28 class CharUtils {
     29  public:
     30     static const std::vector<int> EMPTY_STRING;
     31 
     32     static AK_FORCE_INLINE bool isAsciiUpper(int c) {
     33         // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
     34         // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
     35         return (c >= 'A' && c <= 'Z');
     36     }
     37 
     38     static AK_FORCE_INLINE int toLowerCase(const int c) {
     39         if (isAsciiUpper(c)) {
     40             return toAsciiLower(c);
     41         }
     42         if (isAscii(c)) {
     43             return c;
     44         }
     45         return latin_tolower(c);
     46     }
     47 
     48     static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
     49         return toLowerCase(toBaseCodePoint(c));
     50     }
     51 
     52     static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) {
     53         // TODO: Do not hardcode here
     54         return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
     55     }
     56     static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
     57         int size = 0;
     58         for (; size < arraySize; ++size) {
     59             if (codePoints[size] == '\0') {
     60                 break;
     61             }
     62         }
     63         return size;
     64     }
     65 
     66     static AK_FORCE_INLINE int toBaseCodePoint(int c) {
     67         if (c < BASE_CHARS_SIZE) {
     68             return static_cast<int>(BASE_CHARS[c]);
     69         }
     70         return c;
     71     }
     72 
     73     static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) {
     74         int spaceCount = 0;
     75         for (int i = 0; i < length; ++i) {
     76             if (codePointBuffer[i] == KEYCODE_SPACE) {
     77                 ++spaceCount;
     78             }
     79         }
     80         return spaceCount;
     81     }
     82 
     83     static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) {
     84         return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
     85     }
     86 
     87     // Returns updated code point count. Returns 0 when the code points cannot be marked as a
     88     // Beginning-of-Sentence.
     89     static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
     90             const int codePointCount, const int maxCodePoint) {
     91         if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
     92             // Marker has already been attached.
     93             return codePointCount;
     94         }
     95         if (codePointCount >= maxCodePoint) {
     96             // the code points cannot be marked as a Beginning-of-Sentence.
     97             return 0;
     98         }
     99         memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
    100         codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
    101         return codePointCount + 1;
    102     }
    103 
    104     // Returns updated code point count.
    105     static AK_FORCE_INLINE int removeBeginningOfSentenceMarker(int *const codePoints,
    106             const int codePointCount) {
    107         if (codePointCount <= 0 || codePoints[0] != CODE_POINT_BEGINNING_OF_SENTENCE) {
    108             return codePointCount;
    109         }
    110         const int newCodePointCount = codePointCount - 1;
    111         memmove(codePoints, codePoints + 1, sizeof(int) * newCodePointCount);
    112         return newCodePointCount;
    113     }
    114 
    115  private:
    116     DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
    117 
    118     static const int MIN_UNICODE_CODE_POINT;
    119     static const int MAX_UNICODE_CODE_POINT;
    120 
    121     /**
    122      * Table mapping most combined Latin, Greek, and Cyrillic characters
    123      * to their base characters.  If c is in range, BASE_CHARS[c] == c
    124      * if c is not a combined character, or the base character if it
    125      * is combined.
    126      */
    127     static const int BASE_CHARS_SIZE = 0x0500;
    128     static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
    129 
    130     static AK_FORCE_INLINE bool isAscii(int c) {
    131         return isascii(c) != 0;
    132     }
    133 
    134     static AK_FORCE_INLINE int toAsciiLower(int c) {
    135         return c - 'A' + 'a';
    136     }
    137 
    138     static int latin_tolower(const int c);
    139 };
    140 } // namespace latinime
    141 #endif // LATINIME_CHAR_UTILS_H
    142