1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "utils/utf8_utils.h" 18 19 #include "utils/char_utils.h" 20 21 namespace latinime { 22 namespace dicttoolkit { 23 24 const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4; 25 const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8}; 26 const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0}; 27 const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03}; 28 const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; 29 30 const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F; 31 const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80; 32 const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6; 33 34 /* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) { 35 std::vector<int> codePoints; 36 int remainingByteCountForCurrentCodePoint = 0; 37 int currentCodePointSequenceSize = 0; 38 int codePoint = 0; 39 for (const char c : utf8Str) { 40 if (remainingByteCountForCurrentCodePoint == 0) { 41 currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c); 42 if (currentCodePointSequenceSize <= 0) { 43 AKLOGE("%x is an invalid utf8 first byte value.", c); 44 return std::vector<int>(); 45 } 46 remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize; 47 codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint); 48 } else { 49 codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; 50 codePoint += maskTrailingByte(c); 51 } 52 remainingByteCountForCurrentCodePoint--; 53 if (remainingByteCountForCurrentCodePoint == 0) { 54 if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) { 55 AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.", 56 currentCodePointSequenceSize, codePoint); 57 return std::vector<int>(); 58 } 59 codePoints.push_back(codePoint); 60 } 61 } 62 return codePoints; 63 } 64 65 /* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) { 66 for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { 67 if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) { 68 return i; 69 } 70 } 71 // Not a valid utf8 char first byte. 72 return -1; 73 } 74 75 /* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte, 76 const int sequenceSize) { 77 return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize]; 78 } 79 80 /* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) { 81 return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK; 82 } 83 84 /* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) { 85 std::string utf8String; 86 for (const int codePoint : codePoints) { 87 const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint); 88 if (sequenceSize <= 0) { 89 AKLOGE("Cannot encode code point (%d).", codePoint); 90 return std::string(); 91 } 92 const int trailingByteCount = sequenceSize - 1; 93 // Output first byte. 94 const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE); 95 utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize])); 96 // Output second and later bytes. 97 for (int i = 1; i < sequenceSize; ++i) { 98 const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; 99 const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK; 100 utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER)); 101 } 102 } 103 return utf8String; 104 } 105 106 /* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) { 107 if (codePoint < 0) { 108 return -1; 109 } 110 for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { 111 if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) { 112 return i; 113 } 114 } 115 return -1; 116 } 117 118 } // namespace dicttoolkit 119 } // namespace latinime 120