1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <gtest/gtest.h> 18 #include <unicode/utf.h> 19 #include <cstdlib> 20 21 // src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null. 22 // Size is returned in an out parameter because gtest needs a void return for ASSERT to work. 23 void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size, 24 size_t* offset) { 25 size_t input_ix = 0; 26 size_t output_ix = 0; 27 bool seen_offset = false; 28 29 while (src[input_ix] != 0) { 30 switch (src[input_ix]) { 31 case '\'': 32 // single ASCII char 33 ASSERT_LT(src[input_ix], 0x80); 34 input_ix++; 35 ASSERT_NE(src[input_ix], 0); 36 ASSERT_LT(output_ix, buf_size); 37 buf[output_ix++] = (uint16_t)src[input_ix++]; 38 ASSERT_EQ(src[input_ix], '\''); 39 input_ix++; 40 break; 41 case 'u': 42 case 'U': { 43 // Unicode codepoint in hex syntax 44 input_ix++; 45 ASSERT_EQ(src[input_ix], '+'); 46 input_ix++; 47 char* endptr = (char*)src + input_ix; 48 unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16); 49 size_t num_hex_digits = endptr - (src + input_ix); 50 ASSERT_GE(num_hex_digits, 4u); // also triggers on invalid number syntax, digits = 0 51 ASSERT_LE(num_hex_digits, 6u); 52 ASSERT_LE(codepoint, 0x10FFFFu); 53 input_ix += num_hex_digits; 54 if (U16_LENGTH(codepoint) == 1) { 55 ASSERT_LE(output_ix + 1, buf_size); 56 buf[output_ix++] = codepoint; 57 } else { 58 // UTF-16 encoding 59 ASSERT_LE(output_ix + 2, buf_size); 60 buf[output_ix++] = U16_LEAD(codepoint); 61 buf[output_ix++] = U16_TRAIL(codepoint); 62 } 63 break; 64 } 65 case ' ': 66 input_ix++; 67 break; 68 case '|': 69 ASSERT_FALSE(seen_offset); 70 ASSERT_NE(offset, nullptr); 71 *offset = output_ix; 72 seen_offset = true; 73 input_ix++; 74 break; 75 default: 76 FAIL(); // unexpected character 77 } 78 } 79 ASSERT_NE(result_size, nullptr); 80 *result_size = output_ix; 81 ASSERT_TRUE(seen_offset || offset == nullptr); 82 } 83 84 TEST(UnicodeUtils, parse) { 85 const size_t BUF_SIZE = 256; 86 uint16_t buf[BUF_SIZE]; 87 size_t offset; 88 size_t size; 89 ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset); 90 EXPECT_EQ(size, 4u); 91 EXPECT_EQ(offset, 3u); 92 EXPECT_EQ(buf[0], 0x000D); 93 EXPECT_EQ(buf[1], 0xD83D); 94 EXPECT_EQ(buf[2], 0xDC31); 95 EXPECT_EQ(buf[3], 'a'); 96 } 97