1 /* libs/cutils/strdup8to16.c 2 ** 3 ** Copyright 2006, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18 #include <cutils/jstring.h> 19 20 #include <assert.h> 21 #include <limits.h> 22 #include <stdlib.h> 23 24 /* See http://www.unicode.org/reports/tr22/ for discussion 25 * on invalid sequences 26 */ 27 28 #define UTF16_REPLACEMENT_CHAR 0xfffd 29 30 /* Clever trick from Dianne that returns 1-4 depending on leading bit sequence*/ 31 #define UTF8_SEQ_LENGTH(ch) (((0xe5000000 >> (((ch) >> 3) & 0x1e)) & 3) + 1) 32 33 /* note: macro expands to multiple lines */ 34 #define UTF8_SHIFT_AND_MASK(unicode, byte) \ 35 (unicode)<<=6; (unicode) |= (0x3f & (byte)); 36 37 #define UNICODE_UPPER_LIMIT 0x10fffd 38 39 /** 40 * out_len is an out parameter (which may not be null) containing the 41 * length of the UTF-16 string (which may contain embedded \0's) 42 */ 43 44 extern char16_t * strdup8to16 (const char* s, size_t *out_len) 45 { 46 char16_t *ret; 47 size_t len; 48 49 if (s == NULL) return NULL; 50 51 len = strlen8to16(s); 52 53 // fail on overflow 54 if (len && SIZE_MAX/len < sizeof(char16_t)) 55 return NULL; 56 57 // no plus-one here. UTF-16 strings are not null terminated 58 ret = (char16_t *) malloc (sizeof(char16_t) * len); 59 60 return strcpy8to16 (ret, s, out_len); 61 } 62 63 /** 64 * Like "strlen", but for strings encoded with Java's modified UTF-8. 65 * 66 * The value returned is the number of UTF-16 characters required 67 * to represent this string. 68 */ 69 extern size_t strlen8to16 (const char* utf8Str) 70 { 71 size_t len = 0; 72 int ic; 73 int expected = 0; 74 75 while ((ic = *utf8Str++) != '\0') { 76 /* bytes that start 0? or 11 are lead bytes and count as characters.*/ 77 /* bytes that start 10 are extention bytes and are not counted */ 78 79 if ((ic & 0xc0) == 0x80) { 80 /* count the 0x80 extention bytes. if we have more than 81 * expected, then start counting them because strcpy8to16 82 * will insert UTF16_REPLACEMENT_CHAR's 83 */ 84 expected--; 85 if (expected < 0) { 86 len++; 87 } 88 } else { 89 len++; 90 expected = UTF8_SEQ_LENGTH(ic) - 1; 91 92 /* this will result in a surrogate pair */ 93 if (expected == 3) { 94 len++; 95 } 96 } 97 } 98 99 return len; 100 } 101 102 103 104 /* 105 * Retrieve the next UTF-32 character from a UTF-8 string. 106 * 107 * Stops at inner \0's 108 * 109 * Returns UTF16_REPLACEMENT_CHAR if an invalid sequence is encountered 110 * 111 * Advances "*pUtf8Ptr" to the start of the next character. 112 */ 113 static inline uint32_t getUtf32FromUtf8(const char** pUtf8Ptr) 114 { 115 uint32_t ret; 116 int seq_len; 117 int i; 118 119 /* Mask for leader byte for lengths 1, 2, 3, and 4 respectively*/ 120 static const unsigned char leaderMask[4] = {0xff, 0x1f, 0x0f, 0x07}; 121 122 /* Bytes that start with bits "10" are not leading characters. */ 123 if (((**pUtf8Ptr) & 0xc0) == 0x80) { 124 (*pUtf8Ptr)++; 125 return UTF16_REPLACEMENT_CHAR; 126 } 127 128 /* note we tolerate invalid leader 11111xxx here */ 129 seq_len = UTF8_SEQ_LENGTH(**pUtf8Ptr); 130 131 ret = (**pUtf8Ptr) & leaderMask [seq_len - 1]; 132 133 if (**pUtf8Ptr == '\0') return ret; 134 135 (*pUtf8Ptr)++; 136 for (i = 1; i < seq_len ; i++, (*pUtf8Ptr)++) { 137 if ((**pUtf8Ptr) == '\0') return UTF16_REPLACEMENT_CHAR; 138 if (((**pUtf8Ptr) & 0xc0) != 0x80) return UTF16_REPLACEMENT_CHAR; 139 140 UTF8_SHIFT_AND_MASK(ret, **pUtf8Ptr); 141 } 142 143 return ret; 144 } 145 146 147 /** 148 * out_len is an out parameter (which may not be null) containing the 149 * length of the UTF-16 string (which may contain embedded \0's) 150 */ 151 152 extern char16_t * strcpy8to16 (char16_t *utf16Str, const char*utf8Str, 153 size_t *out_len) 154 { 155 char16_t *dest = utf16Str; 156 157 while (*utf8Str != '\0') { 158 uint32_t ret; 159 160 ret = getUtf32FromUtf8(&utf8Str); 161 162 if (ret <= 0xffff) { 163 *dest++ = (char16_t) ret; 164 } else if (ret <= UNICODE_UPPER_LIMIT) { 165 /* Create surrogate pairs */ 166 /* See http://en.wikipedia.org/wiki/UTF-16/UCS-2#Method_for_code_points_in_Plane_1.2C_Plane_2 */ 167 168 *dest++ = 0xd800 | ((ret - 0x10000) >> 10); 169 *dest++ = 0xdc00 | ((ret - 0x10000) & 0x3ff); 170 } else { 171 *dest++ = UTF16_REPLACEMENT_CHAR; 172 } 173 } 174 175 *out_len = dest - utf16Str; 176 177 return utf16Str; 178 } 179 180 /** 181 * length is the number of characters in the UTF-8 string. 182 * out_len is an out parameter (which may not be null) containing the 183 * length of the UTF-16 string (which may contain embedded \0's) 184 */ 185 186 extern char16_t * strcpylen8to16 (char16_t *utf16Str, const char*utf8Str, 187 int length, size_t *out_len) 188 { 189 /* TODO: Share more of this code with the method above. Only 2 lines changed. */ 190 191 char16_t *dest = utf16Str; 192 193 const char *end = utf8Str + length; /* This line */ 194 while (utf8Str < end) { /* and this line changed. */ 195 uint32_t ret; 196 197 ret = getUtf32FromUtf8(&utf8Str); 198 199 if (ret <= 0xffff) { 200 *dest++ = (char16_t) ret; 201 } else if (ret <= UNICODE_UPPER_LIMIT) { 202 /* Create surrogate pairs */ 203 /* See http://en.wikipedia.org/wiki/UTF-16/UCS-2#Method_for_code_points_in_Plane_1.2C_Plane_2 */ 204 205 *dest++ = 0xd800 | ((ret - 0x10000) >> 10); 206 *dest++ = 0xdc00 | ((ret - 0x10000) & 0x3ff); 207 } else { 208 *dest++ = UTF16_REPLACEMENT_CHAR; 209 } 210 } 211 212 *out_len = dest - utf16Str; 213 214 return utf16Str; 215 } 216