1 /* libs/cutils/strdup8to16.c 2 ** 3 ** Copyright 2006, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18 #include <cutils/jstring.h> 19 #include <assert.h> 20 #include <stdlib.h> 21 #include <limits.h> 22 23 /* See http://www.unicode.org/reports/tr22/ for discussion 24 * on invalid sequences 25 */ 26 27 #define UTF16_REPLACEMENT_CHAR 0xfffd 28 29 /* Clever trick from Dianne that returns 1-4 depending on leading bit sequence*/ 30 #define UTF8_SEQ_LENGTH(ch) (((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1) 31 32 /* note: macro expands to multiple lines */ 33 #define UTF8_SHIFT_AND_MASK(unicode, byte) \ 34 (unicode)<<=6; (unicode) |= (0x3f & (byte)); 35 36 #define UNICODE_UPPER_LIMIT 0x10fffd 37 38 /** 39 * out_len is an out parameter (which may not be null) containing the 40 * length of the UTF-16 string (which may contain embedded \0's) 41 */ 42 43 extern char16_t * strdup8to16 (const char* s, size_t *out_len) 44 { 45 char16_t *ret; 46 size_t len; 47 48 if (s == NULL) return NULL; 49 50 len = strlen8to16(s); 51 52 // fail on overflow 53 if (len && SIZE_MAX/len < sizeof(char16_t)) 54 return NULL; 55 56 // no plus-one here. UTF-16 strings are not null terminated 57 ret = (char16_t *) malloc (sizeof(char16_t) * len); 58 59 return strcpy8to16 (ret, s, out_len); 60 } 61 62 /** 63 * Like "strlen", but for strings encoded with Java's modified UTF-8. 64 * 65 * The value returned is the number of UTF-16 characters required 66 * to represent this string. 67 */ 68 extern size_t strlen8to16 (const char* utf8Str) 69 { 70 size_t len = 0; 71 int ic; 72 int expected = 0; 73 74 while ((ic = *utf8Str++) != '\0') { 75 /* bytes that start 0? or 11 are lead bytes and count as characters.*/ 76 /* bytes that start 10 are extention bytes and are not counted */ 77 78 if ((ic & 0xc0) == 0x80) { 79 /* count the 0x80 extention bytes. if we have more than 80 * expected, then start counting them because strcpy8to16 81 * will insert UTF16_REPLACEMENT_CHAR's 82 */ 83 expected--; 84 if (expected < 0) { 85 len++; 86 } 87 } else { 88 len++; 89 expected = UTF8_SEQ_LENGTH(ic) - 1; 90 91 /* this will result in a surrogate pair */ 92 if (expected == 3) { 93 len++; 94 } 95 } 96 } 97 98 return len; 99 } 100 101 102 103 /* 104 * Retrieve the next UTF-32 character from a UTF-8 string. 105 * 106 * Stops at inner \0's 107 * 108 * Returns UTF16_REPLACEMENT_CHAR if an invalid sequence is encountered 109 * 110 * Advances "*pUtf8Ptr" to the start of the next character. 111 */ 112 static inline uint32_t getUtf32FromUtf8(const char** pUtf8Ptr) 113 { 114 uint32_t ret; 115 int seq_len; 116 int i; 117 118 /* Mask for leader byte for lengths 1, 2, 3, and 4 respectively*/ 119 static const char leaderMask[4] = {0xff, 0x1f, 0x0f, 0x07}; 120 121 /* Bytes that start with bits "10" are not leading characters. */ 122 if (((**pUtf8Ptr) & 0xc0) == 0x80) { 123 (*pUtf8Ptr)++; 124 return UTF16_REPLACEMENT_CHAR; 125 } 126 127 /* note we tolerate invalid leader 11111xxx here */ 128 seq_len = UTF8_SEQ_LENGTH(**pUtf8Ptr); 129 130 ret = (**pUtf8Ptr) & leaderMask [seq_len - 1]; 131 132 if (**pUtf8Ptr == '\0') return ret; 133 134 (*pUtf8Ptr)++; 135 for (i = 1; i < seq_len ; i++, (*pUtf8Ptr)++) { 136 if ((**pUtf8Ptr) == '\0') return UTF16_REPLACEMENT_CHAR; 137 if (((**pUtf8Ptr) & 0xc0) != 0x80) return UTF16_REPLACEMENT_CHAR; 138 139 UTF8_SHIFT_AND_MASK(ret, **pUtf8Ptr); 140 } 141 142 return ret; 143 } 144 145 146 /** 147 * out_len is an out parameter (which may not be null) containing the 148 * length of the UTF-16 string (which may contain embedded \0's) 149 */ 150 151 extern char16_t * strcpy8to16 (char16_t *utf16Str, const char*utf8Str, 152 size_t *out_len) 153 { 154 char16_t *dest = utf16Str; 155 156 while (*utf8Str != '\0') { 157 uint32_t ret; 158 159 ret = getUtf32FromUtf8(&utf8Str); 160 161 if (ret <= 0xffff) { 162 *dest++ = (char16_t) ret; 163 } else if (ret <= UNICODE_UPPER_LIMIT) { 164 /* Create surrogate pairs */ 165 /* See http://en.wikipedia.org/wiki/UTF-16/UCS-2#Method_for_code_points_in_Plane_1.2C_Plane_2 */ 166 167 *dest++ = 0xd800 | ((ret - 0x10000) >> 10); 168 *dest++ = 0xdc00 | ((ret - 0x10000) & 0x3ff); 169 } else { 170 *dest++ = UTF16_REPLACEMENT_CHAR; 171 } 172 } 173 174 *out_len = dest - utf16Str; 175 176 return utf16Str; 177 } 178 179 /** 180 * length is the number of characters in the UTF-8 string. 181 * out_len is an out parameter (which may not be null) containing the 182 * length of the UTF-16 string (which may contain embedded \0's) 183 */ 184 185 extern char16_t * strcpylen8to16 (char16_t *utf16Str, const char*utf8Str, 186 int length, size_t *out_len) 187 { 188 /* TODO: Share more of this code with the method above. Only 2 lines changed. */ 189 190 char16_t *dest = utf16Str; 191 192 const char *end = utf8Str + length; /* This line */ 193 while (utf8Str < end) { /* and this line changed. */ 194 uint32_t ret; 195 196 ret = getUtf32FromUtf8(&utf8Str); 197 198 if (ret <= 0xffff) { 199 *dest++ = (char16_t) ret; 200 } else if (ret <= UNICODE_UPPER_LIMIT) { 201 /* Create surrogate pairs */ 202 /* See http://en.wikipedia.org/wiki/UTF-16/UCS-2#Method_for_code_points_in_Plane_1.2C_Plane_2 */ 203 204 *dest++ = 0xd800 | ((ret - 0x10000) >> 10); 205 *dest++ = 0xdc00 | ((ret - 0x10000) & 0x3ff); 206 } else { 207 *dest++ = UTF16_REPLACEMENT_CHAR; 208 } 209 } 210 211 *out_len = dest - utf16Str; 212 213 return utf16Str; 214 } 215