1 /* 2 * Copyright (C) 2005 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <utils/Unicode.h> 18 19 #include <stddef.h> 20 21 #ifdef HAVE_WINSOCK 22 # undef nhtol 23 # undef htonl 24 # undef nhtos 25 # undef htons 26 27 # define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) 28 # define htonl(x) ntohl(x) 29 # define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) 30 # define htons(x) ntohs(x) 31 #else 32 # include <netinet/in.h> 33 #endif 34 35 extern "C" { 36 37 static const char32_t kByteMask = 0x000000BF; 38 static const char32_t kByteMark = 0x00000080; 39 40 // Surrogates aren't valid for UTF-32 characters, so define some 41 // constants that will let us screen them out. 42 static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; 43 // Unused, here for completeness: 44 // static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; 45 // static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; 46 static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; 47 static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; 48 static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; 49 static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; 50 51 // Mask used to set appropriate bits in first byte of UTF-8 sequence, 52 // indexed by number of bytes in the sequence. 53 // 0xxxxxxx 54 // -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 55 // 110yyyyx 10xxxxxx 56 // -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 57 // 1110yyyy 10yxxxxx 10xxxxxx 58 // -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 59 // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 60 // -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 61 static const char32_t kFirstByteMark[] = { 62 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 63 }; 64 65 // -------------------------------------------------------------------------- 66 // UTF-32 67 // -------------------------------------------------------------------------- 68 69 /** 70 * Return number of UTF-8 bytes required for the character. If the character 71 * is invalid, return size of 0. 72 */ 73 static inline size_t utf32_codepoint_utf8_length(char32_t srcChar) 74 { 75 // Figure out how many bytes the result will require. 76 if (srcChar < 0x00000080) { 77 return 1; 78 } else if (srcChar < 0x00000800) { 79 return 2; 80 } else if (srcChar < 0x00010000) { 81 if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) { 82 return 3; 83 } else { 84 // Surrogates are invalid UTF-32 characters. 85 return 0; 86 } 87 } 88 // Max code point for Unicode is 0x0010FFFF. 89 else if (srcChar <= kUnicodeMaxCodepoint) { 90 return 4; 91 } else { 92 // Invalid UTF-32 character. 93 return 0; 94 } 95 } 96 97 // Write out the source character to <dstP>. 98 99 static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) 100 { 101 dstP += bytes; 102 switch (bytes) 103 { /* note: everything falls through. */ 104 case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 105 case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 106 case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 107 case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); 108 } 109 } 110 111 size_t strlen32(const char32_t *s) 112 { 113 const char32_t *ss = s; 114 while ( *ss ) 115 ss++; 116 return ss-s; 117 } 118 119 size_t strnlen32(const char32_t *s, size_t maxlen) 120 { 121 const char32_t *ss = s; 122 while ((maxlen > 0) && *ss) { 123 ss++; 124 maxlen--; 125 } 126 return ss-s; 127 } 128 129 static inline int32_t utf32_at_internal(const char* cur, size_t *num_read) 130 { 131 const char first_char = *cur; 132 if ((first_char & 0x80) == 0) { // ASCII 133 *num_read = 1; 134 return *cur; 135 } 136 cur++; 137 char32_t mask, to_ignore_mask; 138 size_t num_to_read = 0; 139 char32_t utf32 = first_char; 140 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; 141 (first_char & mask); 142 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 143 // 0x3F == 00111111 144 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 145 } 146 to_ignore_mask |= mask; 147 utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); 148 149 *num_read = num_to_read; 150 return static_cast<int32_t>(utf32); 151 } 152 153 int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) 154 { 155 if (index >= src_len) { 156 return -1; 157 } 158 size_t dummy_index; 159 if (next_index == NULL) { 160 next_index = &dummy_index; 161 } 162 size_t num_read; 163 int32_t ret = utf32_at_internal(src + index, &num_read); 164 if (ret >= 0) { 165 *next_index = index + num_read; 166 } 167 168 return ret; 169 } 170 171 ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) 172 { 173 if (src == NULL || src_len == 0) { 174 return -1; 175 } 176 177 size_t ret = 0; 178 const char32_t *end = src + src_len; 179 while (src < end) { 180 ret += utf32_codepoint_utf8_length(*src++); 181 } 182 return ret; 183 } 184 185 void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst) 186 { 187 if (src == NULL || src_len == 0 || dst == NULL) { 188 return; 189 } 190 191 const char32_t *cur_utf32 = src; 192 const char32_t *end_utf32 = src + src_len; 193 char *cur = dst; 194 while (cur_utf32 < end_utf32) { 195 size_t len = utf32_codepoint_utf8_length(*cur_utf32); 196 utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); 197 cur += len; 198 } 199 *cur = '\0'; 200 } 201 202 // -------------------------------------------------------------------------- 203 // UTF-16 204 // -------------------------------------------------------------------------- 205 206 int strcmp16(const char16_t *s1, const char16_t *s2) 207 { 208 char16_t ch; 209 int d = 0; 210 211 while ( 1 ) { 212 d = (int)(ch = *s1++) - (int)*s2++; 213 if ( d || !ch ) 214 break; 215 } 216 217 return d; 218 } 219 220 int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) 221 { 222 char16_t ch; 223 int d = 0; 224 225 while ( n-- ) { 226 d = (int)(ch = *s1++) - (int)*s2++; 227 if ( d || !ch ) 228 break; 229 } 230 231 return d; 232 } 233 234 char16_t *strcpy16(char16_t *dst, const char16_t *src) 235 { 236 char16_t *q = dst; 237 const char16_t *p = src; 238 char16_t ch; 239 240 do { 241 *q++ = ch = *p++; 242 } while ( ch ); 243 244 return dst; 245 } 246 247 size_t strlen16(const char16_t *s) 248 { 249 const char16_t *ss = s; 250 while ( *ss ) 251 ss++; 252 return ss-s; 253 } 254 255 256 char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) 257 { 258 char16_t *q = dst; 259 const char16_t *p = src; 260 char ch; 261 262 while (n) { 263 n--; 264 *q++ = ch = *p++; 265 if ( !ch ) 266 break; 267 } 268 269 *q = 0; 270 271 return dst; 272 } 273 274 size_t strnlen16(const char16_t *s, size_t maxlen) 275 { 276 const char16_t *ss = s; 277 278 /* Important: the maxlen test must precede the reference through ss; 279 since the byte beyond the maximum may segfault */ 280 while ((maxlen > 0) && *ss) { 281 ss++; 282 maxlen--; 283 } 284 return ss-s; 285 } 286 287 int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) 288 { 289 const char16_t* e1 = s1+n1; 290 const char16_t* e2 = s2+n2; 291 292 while (s1 < e1 && s2 < e2) { 293 const int d = (int)*s1++ - (int)*s2++; 294 if (d) { 295 return d; 296 } 297 } 298 299 return n1 < n2 300 ? (0 - (int)*s2) 301 : (n1 > n2 302 ? ((int)*s1 - 0) 303 : 0); 304 } 305 306 int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) 307 { 308 const char16_t* e1 = s1H+n1; 309 const char16_t* e2 = s2N+n2; 310 311 while (s1H < e1 && s2N < e2) { 312 const char16_t c2 = ntohs(*s2N); 313 const int d = (int)*s1H++ - (int)c2; 314 s2N++; 315 if (d) { 316 return d; 317 } 318 } 319 320 return n1 < n2 321 ? (0 - (int)ntohs(*s2N)) 322 : (n1 > n2 323 ? ((int)*s1H - 0) 324 : 0); 325 } 326 327 void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst) 328 { 329 if (src == NULL || src_len == 0 || dst == NULL) { 330 return; 331 } 332 333 const char16_t* cur_utf16 = src; 334 const char16_t* const end_utf16 = src + src_len; 335 char *cur = dst; 336 while (cur_utf16 < end_utf16) { 337 char32_t utf32; 338 // surrogate pairs 339 if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16 340 && (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) { 341 utf32 = (*cur_utf16++ - 0xD800) << 10; 342 utf32 |= *cur_utf16++ - 0xDC00; 343 utf32 += 0x10000; 344 } else { 345 utf32 = (char32_t) *cur_utf16++; 346 } 347 const size_t len = utf32_codepoint_utf8_length(utf32); 348 utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); 349 cur += len; 350 } 351 *cur = '\0'; 352 } 353 354 // -------------------------------------------------------------------------- 355 // UTF-8 356 // -------------------------------------------------------------------------- 357 358 ssize_t utf8_length(const char *src) 359 { 360 const char *cur = src; 361 size_t ret = 0; 362 while (*cur != '\0') { 363 const char first_char = *cur++; 364 if ((first_char & 0x80) == 0) { // ASCII 365 ret += 1; 366 continue; 367 } 368 // (UTF-8's character must not be like 10xxxxxx, 369 // but 110xxxxx, 1110xxxx, ... or 1111110x) 370 if ((first_char & 0x40) == 0) { 371 return -1; 372 } 373 374 int32_t mask, to_ignore_mask; 375 size_t num_to_read = 0; 376 char32_t utf32 = 0; 377 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; 378 num_to_read < 5 && (first_char & mask); 379 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 380 if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx 381 return -1; 382 } 383 // 0x3F == 00111111 384 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 385 } 386 // "first_char" must be (110xxxxx - 11110xxx) 387 if (num_to_read == 5) { 388 return -1; 389 } 390 to_ignore_mask |= mask; 391 utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); 392 if (utf32 > kUnicodeMaxCodepoint) { 393 return -1; 394 } 395 396 ret += num_to_read; 397 } 398 return ret; 399 } 400 401 ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) 402 { 403 if (src == NULL || src_len == 0) { 404 return -1; 405 } 406 407 size_t ret = 0; 408 const char16_t* const end = src + src_len; 409 while (src < end) { 410 if ((*src & 0xFC00) == 0xD800 && (src + 1) < end 411 && (*++src & 0xFC00) == 0xDC00) { 412 // surrogate pairs are always 4 bytes. 413 ret += 4; 414 src++; 415 } else { 416 ret += utf32_codepoint_utf8_length((char32_t) *src++); 417 } 418 } 419 return ret; 420 } 421 422 /** 423 * Returns 1-4 based on the number of leading bits. 424 * 425 * 1111 -> 4 426 * 1110 -> 3 427 * 110x -> 2 428 * 10xx -> 1 429 * 0xxx -> 1 430 */ 431 static inline size_t utf8_codepoint_len(uint8_t ch) 432 { 433 return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; 434 } 435 436 static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) 437 { 438 *codePoint <<= 6; 439 *codePoint |= 0x3F & byte; 440 } 441 442 size_t utf8_to_utf32_length(const char *src, size_t src_len) 443 { 444 if (src == NULL || src_len == 0) { 445 return 0; 446 } 447 size_t ret = 0; 448 const char* cur; 449 const char* end; 450 size_t num_to_skip; 451 for (cur = src, end = src + src_len, num_to_skip = 1; 452 cur < end; 453 cur += num_to_skip, ret++) { 454 const char first_char = *cur; 455 num_to_skip = 1; 456 if ((first_char & 0x80) == 0) { // ASCII 457 continue; 458 } 459 int32_t mask; 460 461 for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { 462 } 463 } 464 return ret; 465 } 466 467 void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) 468 { 469 if (src == NULL || src_len == 0 || dst == NULL) { 470 return; 471 } 472 473 const char* cur = src; 474 const char* const end = src + src_len; 475 char32_t* cur_utf32 = dst; 476 while (cur < end) { 477 size_t num_read; 478 *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); 479 cur += num_read; 480 } 481 *cur_utf32 = 0; 482 } 483 484 static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) 485 { 486 uint32_t unicode; 487 488 switch (length) 489 { 490 case 1: 491 return src[0]; 492 case 2: 493 unicode = src[0] & 0x1f; 494 utf8_shift_and_mask(&unicode, src[1]); 495 return unicode; 496 case 3: 497 unicode = src[0] & 0x0f; 498 utf8_shift_and_mask(&unicode, src[1]); 499 utf8_shift_and_mask(&unicode, src[2]); 500 return unicode; 501 case 4: 502 unicode = src[0] & 0x07; 503 utf8_shift_and_mask(&unicode, src[1]); 504 utf8_shift_and_mask(&unicode, src[2]); 505 utf8_shift_and_mask(&unicode, src[3]); 506 return unicode; 507 default: 508 return 0xffff; 509 } 510 511 //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); 512 } 513 514 ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len) 515 { 516 const uint8_t* const u8end = u8str + u8len; 517 const uint8_t* u8cur = u8str; 518 519 /* Validate that the UTF-8 is the correct len */ 520 size_t u16measuredLen = 0; 521 while (u8cur < u8end) { 522 u16measuredLen++; 523 int u8charLen = utf8_codepoint_len(*u8cur); 524 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); 525 if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 526 u8cur += u8charLen; 527 } 528 529 /** 530 * Make sure that we ended where we thought we would and the output UTF-16 531 * will be exactly how long we were told it would be. 532 */ 533 if (u8cur != u8end) { 534 return -1; 535 } 536 537 return u16measuredLen; 538 } 539 540 char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str) 541 { 542 const uint8_t* const u8end = u8str + u8len; 543 const uint8_t* u8cur = u8str; 544 char16_t* u16cur = u16str; 545 546 while (u8cur < u8end) { 547 size_t u8len = utf8_codepoint_len(*u8cur); 548 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 549 550 // Convert the UTF32 codepoint to one or more UTF16 codepoints 551 if (codepoint <= 0xFFFF) { 552 // Single UTF16 character 553 *u16cur++ = (char16_t) codepoint; 554 } else { 555 // Multiple UTF16 characters with surrogates 556 codepoint = codepoint - 0x10000; 557 *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 558 *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 559 } 560 561 u8cur += u8len; 562 } 563 return u16cur; 564 } 565 566 void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) { 567 char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str); 568 *end = 0; 569 } 570 571 char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) { 572 const uint8_t* const u8end = src + srcLen; 573 const uint8_t* u8cur = src; 574 const char16_t* const u16end = dst + dstLen; 575 char16_t* u16cur = dst; 576 577 while (u8cur < u8end && u16cur < u16end) { 578 size_t u8len = utf8_codepoint_len(*u8cur); 579 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 580 581 // Convert the UTF32 codepoint to one or more UTF16 codepoints 582 if (codepoint <= 0xFFFF) { 583 // Single UTF16 character 584 *u16cur++ = (char16_t) codepoint; 585 } else { 586 // Multiple UTF16 characters with surrogates 587 codepoint = codepoint - 0x10000; 588 *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 589 if (u16cur >= u16end) { 590 // Ooops... not enough room for this surrogate pair. 591 return u16cur-1; 592 } 593 *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 594 } 595 596 u8cur += u8len; 597 } 598 return u16cur; 599 } 600 601 } 602