Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_CHAR_UTILS_H
     18 #define LATINIME_CHAR_UTILS_H
     19 
     20 #include <cctype>
     21 #include <cstring>
     22 #include <vector>
     23 
     24 #include "defines.h"
     25 
     26 namespace latinime {
     27 
     28 class CharUtils {
     29  public:
     30     static AK_FORCE_INLINE bool isAsciiUpper(int c) {
     31         // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
     32         // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
     33         return (c >= 'A' && c <= 'Z');
     34     }
     35 
     36     static AK_FORCE_INLINE int toAsciiLower(int c) {
     37         return c - 'A' + 'a';
     38     }
     39 
     40     static AK_FORCE_INLINE bool isAscii(int c) {
     41         return isascii(c) != 0;
     42     }
     43 
     44     static AK_FORCE_INLINE int toLowerCase(const int c) {
     45         if (isAsciiUpper(c)) {
     46             return toAsciiLower(c);
     47         }
     48         if (isAscii(c)) {
     49             return c;
     50         }
     51         return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
     52     }
     53 
     54     static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
     55         return toLowerCase(toBaseCodePoint(c));
     56     }
     57 
     58     static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) {
     59         // TODO: Do not hardcode here
     60         return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
     61     }
     62 
     63     static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
     64         int size = 0;
     65         for (; size < arraySize; ++size) {
     66             if (codePoints[size] == '\0') {
     67                 break;
     68             }
     69         }
     70         return size;
     71     }
     72 
     73     static AK_FORCE_INLINE int toBaseCodePoint(int c) {
     74         if (c < BASE_CHARS_SIZE) {
     75             return static_cast<int>(BASE_CHARS[c]);
     76         }
     77         return c;
     78     }
     79 
     80     static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) {
     81         int spaceCount = 0;
     82         for (int i = 0; i < length; ++i) {
     83             if (codePointBuffer[i] == KEYCODE_SPACE) {
     84                 ++spaceCount;
     85             }
     86         }
     87         return spaceCount;
     88     }
     89 
     90     static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) {
     91         return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
     92     }
     93 
     94     static unsigned short latin_tolower(const unsigned short c);
     95     static const std::vector<int> EMPTY_STRING;
     96 
     97     // Returns updated code point count. Returns 0 when the code points cannot be marked as a
     98     // Beginning-of-Sentence.
     99     static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
    100             const int codePointCount, const int maxCodePoint) {
    101         if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
    102             // Marker has already been attached.
    103             return codePointCount;
    104         }
    105         if (codePointCount >= maxCodePoint) {
    106             // the code points cannot be marked as a Beginning-of-Sentence.
    107             return 0;
    108         }
    109         memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
    110         codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
    111         return codePointCount + 1;
    112     }
    113 
    114  private:
    115     DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
    116 
    117     static const int MIN_UNICODE_CODE_POINT;
    118     static const int MAX_UNICODE_CODE_POINT;
    119 
    120     /**
    121      * Table mapping most combined Latin, Greek, and Cyrillic characters
    122      * to their base characters.  If c is in range, BASE_CHARS[c] == c
    123      * if c is not a combined character, or the base character if it
    124      * is combined.
    125      */
    126     static const int BASE_CHARS_SIZE = 0x0500;
    127     static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
    128 };
    129 } // namespace latinime
    130 #endif // LATINIME_CHAR_UTILS_H
    131