1 /* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_CHAR_UTILS_H 18 #define LATINIME_CHAR_UTILS_H 19 20 #include <cctype> 21 #include <cstring> 22 #include <vector> 23 24 #include "defines.h" 25 26 namespace latinime { 27 28 class CharUtils { 29 public: 30 static AK_FORCE_INLINE bool isAsciiUpper(int c) { 31 // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to 32 // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). 33 return (c >= 'A' && c <= 'Z'); 34 } 35 36 static AK_FORCE_INLINE int toAsciiLower(int c) { 37 return c - 'A' + 'a'; 38 } 39 40 static AK_FORCE_INLINE bool isAscii(int c) { 41 return isascii(c) != 0; 42 } 43 44 static AK_FORCE_INLINE int toLowerCase(const int c) { 45 if (isAsciiUpper(c)) { 46 return toAsciiLower(c); 47 } 48 if (isAscii(c)) { 49 return c; 50 } 51 return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); 52 } 53 54 static AK_FORCE_INLINE int toBaseLowerCase(const int c) { 55 return toLowerCase(toBaseCodePoint(c)); 56 } 57 58 static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) { 59 // TODO: Do not hardcode here 60 return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; 61 } 62 63 static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { 64 int size = 0; 65 for (; size < arraySize; ++size) { 66 if (codePoints[size] == '\0') { 67 break; 68 } 69 } 70 return size; 71 } 72 73 static AK_FORCE_INLINE int toBaseCodePoint(int c) { 74 if (c < BASE_CHARS_SIZE) { 75 return static_cast<int>(BASE_CHARS[c]); 76 } 77 return c; 78 } 79 80 static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) { 81 int spaceCount = 0; 82 for (int i = 0; i < length; ++i) { 83 if (codePointBuffer[i] == KEYCODE_SPACE) { 84 ++spaceCount; 85 } 86 } 87 return spaceCount; 88 } 89 90 static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { 91 return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; 92 } 93 94 static unsigned short latin_tolower(const unsigned short c); 95 static const std::vector<int> EMPTY_STRING; 96 97 // Returns updated code point count. Returns 0 when the code points cannot be marked as a 98 // Beginning-of-Sentence. 99 static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, 100 const int codePointCount, const int maxCodePoint) { 101 if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { 102 // Marker has already been attached. 103 return codePointCount; 104 } 105 if (codePointCount >= maxCodePoint) { 106 // the code points cannot be marked as a Beginning-of-Sentence. 107 return 0; 108 } 109 memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); 110 codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; 111 return codePointCount + 1; 112 } 113 114 private: 115 DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); 116 117 static const int MIN_UNICODE_CODE_POINT; 118 static const int MAX_UNICODE_CODE_POINT; 119 120 /** 121 * Table mapping most combined Latin, Greek, and Cyrillic characters 122 * to their base characters. If c is in range, BASE_CHARS[c] == c 123 * if c is not a combined character, or the base character if it 124 * is combined. 125 */ 126 static const int BASE_CHARS_SIZE = 0x0500; 127 static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; 128 }; 129 } // namespace latinime 130 #endif // LATINIME_CHAR_UTILS_H 131