Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <cstdlib>
     18 #include <string>
     19 #include <vector>
     20 
     21 #include <cutils/log.h>
     22 #include <unicode/utf.h>
     23 #include <unicode/utf8.h>
     24 
     25 #include "minikin/U16StringPiece.h"
     26 
     27 namespace minikin {
     28 
     29 // src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
     30 // Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
     31 void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
     32                   size_t* offset) {
     33     size_t input_ix = 0;
     34     size_t output_ix = 0;
     35     bool seen_offset = false;
     36 
     37     while (src[input_ix] != 0) {
     38         switch (src[input_ix]) {
     39             case '\'':
     40                 // single ASCII char
     41                 LOG_ALWAYS_FATAL_IF(static_cast<uint8_t>(src[input_ix]) >= 0x80);
     42                 input_ix++;
     43                 LOG_ALWAYS_FATAL_IF(src[input_ix] == 0);
     44                 LOG_ALWAYS_FATAL_IF(output_ix >= buf_size);
     45                 buf[output_ix++] = (uint16_t)src[input_ix++];
     46                 LOG_ALWAYS_FATAL_IF(src[input_ix] != '\'');
     47                 input_ix++;
     48                 break;
     49             case 'u':
     50             case 'U': {
     51                 // Unicode codepoint in hex syntax
     52                 input_ix++;
     53                 LOG_ALWAYS_FATAL_IF(src[input_ix] != '+');
     54                 input_ix++;
     55                 char* endptr = (char*)src + input_ix;
     56                 unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
     57                 size_t num_hex_digits = endptr - (src + input_ix);
     58 
     59                 // also triggers on invalid number syntax, digits = 0
     60                 LOG_ALWAYS_FATAL_IF(num_hex_digits < 4u);
     61                 LOG_ALWAYS_FATAL_IF(num_hex_digits > 6u);
     62                 LOG_ALWAYS_FATAL_IF(codepoint > 0x10FFFFu);
     63                 input_ix += num_hex_digits;
     64                 if (U16_LENGTH(codepoint) == 1) {
     65                     LOG_ALWAYS_FATAL_IF(output_ix + 1 > buf_size);
     66                     buf[output_ix++] = codepoint;
     67                 } else {
     68                     // UTF-16 encoding
     69                     LOG_ALWAYS_FATAL_IF(output_ix + 2 > buf_size);
     70                     buf[output_ix++] = U16_LEAD(codepoint);
     71                     buf[output_ix++] = U16_TRAIL(codepoint);
     72                 }
     73                 break;
     74             }
     75             case ' ':
     76                 input_ix++;
     77                 break;
     78             case '|':
     79                 LOG_ALWAYS_FATAL_IF(seen_offset);
     80                 LOG_ALWAYS_FATAL_IF(offset == nullptr);
     81                 *offset = output_ix;
     82                 seen_offset = true;
     83                 input_ix++;
     84                 break;
     85             default:
     86                 LOG_ALWAYS_FATAL("Unexpected Character");
     87         }
     88     }
     89     LOG_ALWAYS_FATAL_IF(result_size == nullptr);
     90     *result_size = output_ix;
     91     LOG_ALWAYS_FATAL_IF(!seen_offset && offset != nullptr);
     92 }
     93 
     94 std::vector<uint16_t> parseUnicodeStringWithOffset(const std::string& in, size_t* offset) {
     95     std::unique_ptr<uint16_t[]> buffer(new uint16_t[in.size()]);
     96     size_t result_size = 0;
     97     ParseUnicode(buffer.get(), in.size(), in.c_str(), &result_size, offset);
     98     return std::vector<uint16_t>(buffer.get(), buffer.get() + result_size);
     99 }
    100 
    101 std::vector<uint16_t> parseUnicodeString(const std::string& in) {
    102     return parseUnicodeStringWithOffset(in, nullptr);
    103 }
    104 
    105 std::vector<uint16_t> utf8ToUtf16(const std::string& text) {
    106     std::vector<uint16_t> result;
    107     int32_t i = 0;
    108     const int32_t textLength = static_cast<int32_t>(text.size());
    109     uint32_t c = 0;
    110     while (i < textLength) {
    111         U8_NEXT(text.c_str(), i, textLength, c);
    112         if (U16_LENGTH(c) == 1) {
    113             result.push_back(c);
    114         } else {
    115             result.push_back(U16_LEAD(c));
    116             result.push_back(U16_TRAIL(c));
    117         }
    118     }
    119     return result;
    120 }
    121 
    122 std::string utf16ToUtf8(const U16StringPiece& u16String) {
    123     const uint32_t textLength = u16String.size();
    124     uint32_t i = 0;
    125     uint32_t c = 0;
    126 
    127     std::string out;
    128     out.reserve(textLength * 4);
    129 
    130     while (i < textLength) {
    131         U16_NEXT(u16String.data(), i, textLength, c);
    132 
    133         char buf[U8_MAX_LENGTH] = {};
    134         uint32_t outIndex = 0;
    135         U8_APPEND_UNSAFE(buf, outIndex, c);
    136         out.append(buf, outIndex);
    137     }
    138     return out;
    139 }
    140 
    141 }  // namespace minikin
    142