Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "utils/utf8_utils.h"
     18 
     19 #include "utils/char_utils.h"
     20 
     21 namespace latinime {
     22 namespace dicttoolkit {
     23 
     24 const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;
     25 const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
     26 const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
     27 const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03};
     28 const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
     29 
     30 const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
     31 const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
     32 const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
     33 
     34 /* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
     35     std::vector<int> codePoints;
     36     int remainingByteCountForCurrentCodePoint = 0;
     37     int currentCodePointSequenceSize = 0;
     38     int codePoint = 0;
     39     for (const char c : utf8Str) {
     40         if (remainingByteCountForCurrentCodePoint == 0) {
     41             currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
     42             if (currentCodePointSequenceSize <= 0) {
     43                 AKLOGE("%x is an invalid utf8 first byte value.", c);
     44                 return std::vector<int>();
     45             }
     46             remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
     47             codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
     48         } else {
     49             codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
     50             codePoint += maskTrailingByte(c);
     51         }
     52         remainingByteCountForCurrentCodePoint--;
     53         if (remainingByteCountForCurrentCodePoint == 0) {
     54             if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
     55                 AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
     56                         currentCodePointSequenceSize,  codePoint);
     57                 return std::vector<int>();
     58             }
     59             codePoints.push_back(codePoint);
     60         }
     61     }
     62     return codePoints;
     63 }
     64 
     65 /* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
     66     for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
     67         if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) {
     68             return i;
     69         }
     70     }
     71     // Not a valid utf8 char first byte.
     72     return -1;
     73 }
     74 
     75 /* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
     76         const int sequenceSize) {
     77     return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
     78 }
     79 
     80 /* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
     81     return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
     82 }
     83 
     84 /* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
     85     std::string utf8String;
     86     for (const int codePoint : codePoints) {
     87         const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
     88         if (sequenceSize <= 0) {
     89             AKLOGE("Cannot encode code point (%d).", codePoint);
     90             return std::string();
     91         }
     92         const int trailingByteCount = sequenceSize - 1;
     93         // Output first byte.
     94         const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
     95         utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize]));
     96         // Output second and later bytes.
     97         for (int i = 1; i < sequenceSize; ++i) {
     98             const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
     99             const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
    100             utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER));
    101         }
    102     }
    103     return utf8String;
    104 }
    105 
    106 /* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
    107     if (codePoint < 0) {
    108         return -1;
    109     }
    110     for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
    111         if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) {
    112             return i;
    113         }
    114     }
    115     return -1;
    116 }
    117 
    118 } // namespace dicttoolkit
    119 } // namespace latinime
    120