1 /* 2 * Copyright (C) 2005 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <utils/Unicode.h> 18 19 #include <stddef.h> 20 21 #if defined(_WIN32) 22 # undef nhtol 23 # undef htonl 24 # undef nhtos 25 # undef htons 26 27 # define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) 28 # define htonl(x) ntohl(x) 29 # define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) 30 # define htons(x) ntohs(x) 31 #else 32 # include <netinet/in.h> 33 #endif 34 35 extern "C" { 36 37 static const char32_t kByteMask = 0x000000BF; 38 static const char32_t kByteMark = 0x00000080; 39 40 // Surrogates aren't valid for UTF-32 characters, so define some 41 // constants that will let us screen them out. 42 static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; 43 // Unused, here for completeness: 44 // static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; 45 // static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; 46 static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; 47 static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; 48 static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; 49 static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; 50 51 // Mask used to set appropriate bits in first byte of UTF-8 sequence, 52 // indexed by number of bytes in the sequence. 53 // 0xxxxxxx 54 // -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 55 // 110yyyyx 10xxxxxx 56 // -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 57 // 1110yyyy 10yxxxxx 10xxxxxx 58 // -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 59 // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 60 // -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 61 static const char32_t kFirstByteMark[] = { 62 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 63 }; 64 65 // -------------------------------------------------------------------------- 66 // UTF-32 67 // -------------------------------------------------------------------------- 68 69 /** 70 * Return number of UTF-8 bytes required for the character. If the character 71 * is invalid, return size of 0. 72 */ 73 static inline size_t utf32_codepoint_utf8_length(char32_t srcChar) 74 { 75 // Figure out how many bytes the result will require. 76 if (srcChar < 0x00000080) { 77 return 1; 78 } else if (srcChar < 0x00000800) { 79 return 2; 80 } else if (srcChar < 0x00010000) { 81 if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) { 82 return 3; 83 } else { 84 // Surrogates are invalid UTF-32 characters. 85 return 0; 86 } 87 } 88 // Max code point for Unicode is 0x0010FFFF. 89 else if (srcChar <= kUnicodeMaxCodepoint) { 90 return 4; 91 } else { 92 // Invalid UTF-32 character. 93 return 0; 94 } 95 } 96 97 // Write out the source character to <dstP>. 98 99 static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) 100 { 101 dstP += bytes; 102 switch (bytes) 103 { /* note: everything falls through. */ 104 case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 105 case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 106 case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 107 case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); 108 } 109 } 110 111 size_t strlen32(const char32_t *s) 112 { 113 const char32_t *ss = s; 114 while ( *ss ) 115 ss++; 116 return ss-s; 117 } 118 119 size_t strnlen32(const char32_t *s, size_t maxlen) 120 { 121 const char32_t *ss = s; 122 while ((maxlen > 0) && *ss) { 123 ss++; 124 maxlen--; 125 } 126 return ss-s; 127 } 128 129 static inline int32_t utf32_at_internal(const char* cur, size_t *num_read) 130 { 131 const char first_char = *cur; 132 if ((first_char & 0x80) == 0) { // ASCII 133 *num_read = 1; 134 return *cur; 135 } 136 cur++; 137 char32_t mask, to_ignore_mask; 138 size_t num_to_read = 0; 139 char32_t utf32 = first_char; 140 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; 141 (first_char & mask); 142 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 143 // 0x3F == 00111111 144 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 145 } 146 to_ignore_mask |= mask; 147 utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); 148 149 *num_read = num_to_read; 150 return static_cast<int32_t>(utf32); 151 } 152 153 int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) 154 { 155 if (index >= src_len) { 156 return -1; 157 } 158 size_t dummy_index; 159 if (next_index == NULL) { 160 next_index = &dummy_index; 161 } 162 size_t num_read; 163 int32_t ret = utf32_at_internal(src + index, &num_read); 164 if (ret >= 0) { 165 *next_index = index + num_read; 166 } 167 168 return ret; 169 } 170 171 ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) 172 { 173 if (src == NULL || src_len == 0) { 174 return -1; 175 } 176 177 size_t ret = 0; 178 const char32_t *end = src + src_len; 179 while (src < end) { 180 ret += utf32_codepoint_utf8_length(*src++); 181 } 182 return ret; 183 } 184 185 void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst) 186 { 187 if (src == NULL || src_len == 0 || dst == NULL) { 188 return; 189 } 190 191 const char32_t *cur_utf32 = src; 192 const char32_t *end_utf32 = src + src_len; 193 char *cur = dst; 194 while (cur_utf32 < end_utf32) { 195 size_t len = utf32_codepoint_utf8_length(*cur_utf32); 196 utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); 197 cur += len; 198 } 199 *cur = '\0'; 200 } 201 202 // -------------------------------------------------------------------------- 203 // UTF-16 204 // -------------------------------------------------------------------------- 205 206 int strcmp16(const char16_t *s1, const char16_t *s2) 207 { 208 char16_t ch; 209 int d = 0; 210 211 while ( 1 ) { 212 d = (int)(ch = *s1++) - (int)*s2++; 213 if ( d || !ch ) 214 break; 215 } 216 217 return d; 218 } 219 220 int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) 221 { 222 char16_t ch; 223 int d = 0; 224 225 if (n == 0) { 226 return 0; 227 } 228 229 do { 230 d = (int)(ch = *s1++) - (int)*s2++; 231 if ( d || !ch ) { 232 break; 233 } 234 } while (--n); 235 236 return d; 237 } 238 239 char16_t *strcpy16(char16_t *dst, const char16_t *src) 240 { 241 char16_t *q = dst; 242 const char16_t *p = src; 243 char16_t ch; 244 245 do { 246 *q++ = ch = *p++; 247 } while ( ch ); 248 249 return dst; 250 } 251 252 size_t strlen16(const char16_t *s) 253 { 254 const char16_t *ss = s; 255 while ( *ss ) 256 ss++; 257 return ss-s; 258 } 259 260 261 char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) 262 { 263 char16_t *q = dst; 264 const char16_t *p = src; 265 char ch; 266 267 while (n) { 268 n--; 269 *q++ = ch = *p++; 270 if ( !ch ) 271 break; 272 } 273 274 *q = 0; 275 276 return dst; 277 } 278 279 size_t strnlen16(const char16_t *s, size_t maxlen) 280 { 281 const char16_t *ss = s; 282 283 /* Important: the maxlen test must precede the reference through ss; 284 since the byte beyond the maximum may segfault */ 285 while ((maxlen > 0) && *ss) { 286 ss++; 287 maxlen--; 288 } 289 return ss-s; 290 } 291 292 char16_t* strstr16(const char16_t* src, const char16_t* target) 293 { 294 const char16_t needle = *target++; 295 const size_t target_len = strlen16(target); 296 if (needle != '\0') { 297 do { 298 do { 299 if (*src == '\0') { 300 return nullptr; 301 } 302 } while (*src++ != needle); 303 } while (strncmp16(src, target, target_len) != 0); 304 src--; 305 } 306 307 return (char16_t*)src; 308 } 309 310 311 int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) 312 { 313 const char16_t* e1 = s1+n1; 314 const char16_t* e2 = s2+n2; 315 316 while (s1 < e1 && s2 < e2) { 317 const int d = (int)*s1++ - (int)*s2++; 318 if (d) { 319 return d; 320 } 321 } 322 323 return n1 < n2 324 ? (0 - (int)*s2) 325 : (n1 > n2 326 ? ((int)*s1 - 0) 327 : 0); 328 } 329 330 int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) 331 { 332 const char16_t* e1 = s1H+n1; 333 const char16_t* e2 = s2N+n2; 334 335 while (s1H < e1 && s2N < e2) { 336 const char16_t c2 = ntohs(*s2N); 337 const int d = (int)*s1H++ - (int)c2; 338 s2N++; 339 if (d) { 340 return d; 341 } 342 } 343 344 return n1 < n2 345 ? (0 - (int)ntohs(*s2N)) 346 : (n1 > n2 347 ? ((int)*s1H - 0) 348 : 0); 349 } 350 351 void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst) 352 { 353 if (src == NULL || src_len == 0 || dst == NULL) { 354 return; 355 } 356 357 const char16_t* cur_utf16 = src; 358 const char16_t* const end_utf16 = src + src_len; 359 char *cur = dst; 360 while (cur_utf16 < end_utf16) { 361 char32_t utf32; 362 // surrogate pairs 363 if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16 364 && (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) { 365 utf32 = (*cur_utf16++ - 0xD800) << 10; 366 utf32 |= *cur_utf16++ - 0xDC00; 367 utf32 += 0x10000; 368 } else { 369 utf32 = (char32_t) *cur_utf16++; 370 } 371 const size_t len = utf32_codepoint_utf8_length(utf32); 372 utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); 373 cur += len; 374 } 375 *cur = '\0'; 376 } 377 378 // -------------------------------------------------------------------------- 379 // UTF-8 380 // -------------------------------------------------------------------------- 381 382 ssize_t utf8_length(const char *src) 383 { 384 const char *cur = src; 385 size_t ret = 0; 386 while (*cur != '\0') { 387 const char first_char = *cur++; 388 if ((first_char & 0x80) == 0) { // ASCII 389 ret += 1; 390 continue; 391 } 392 // (UTF-8's character must not be like 10xxxxxx, 393 // but 110xxxxx, 1110xxxx, ... or 1111110x) 394 if ((first_char & 0x40) == 0) { 395 return -1; 396 } 397 398 int32_t mask, to_ignore_mask; 399 size_t num_to_read = 0; 400 char32_t utf32 = 0; 401 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; 402 num_to_read < 5 && (first_char & mask); 403 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 404 if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx 405 return -1; 406 } 407 // 0x3F == 00111111 408 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 409 } 410 // "first_char" must be (110xxxxx - 11110xxx) 411 if (num_to_read == 5) { 412 return -1; 413 } 414 to_ignore_mask |= mask; 415 utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); 416 if (utf32 > kUnicodeMaxCodepoint) { 417 return -1; 418 } 419 420 ret += num_to_read; 421 } 422 return ret; 423 } 424 425 ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) 426 { 427 if (src == NULL || src_len == 0) { 428 return -1; 429 } 430 431 size_t ret = 0; 432 const char16_t* const end = src + src_len; 433 while (src < end) { 434 if ((*src & 0xFC00) == 0xD800 && (src + 1) < end 435 && (*++src & 0xFC00) == 0xDC00) { 436 // surrogate pairs are always 4 bytes. 437 ret += 4; 438 src++; 439 } else { 440 ret += utf32_codepoint_utf8_length((char32_t) *src++); 441 } 442 } 443 return ret; 444 } 445 446 /** 447 * Returns 1-4 based on the number of leading bits. 448 * 449 * 1111 -> 4 450 * 1110 -> 3 451 * 110x -> 2 452 * 10xx -> 1 453 * 0xxx -> 1 454 */ 455 static inline size_t utf8_codepoint_len(uint8_t ch) 456 { 457 return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; 458 } 459 460 static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) 461 { 462 *codePoint <<= 6; 463 *codePoint |= 0x3F & byte; 464 } 465 466 size_t utf8_to_utf32_length(const char *src, size_t src_len) 467 { 468 if (src == NULL || src_len == 0) { 469 return 0; 470 } 471 size_t ret = 0; 472 const char* cur; 473 const char* end; 474 size_t num_to_skip; 475 for (cur = src, end = src + src_len, num_to_skip = 1; 476 cur < end; 477 cur += num_to_skip, ret++) { 478 const char first_char = *cur; 479 num_to_skip = 1; 480 if ((first_char & 0x80) == 0) { // ASCII 481 continue; 482 } 483 int32_t mask; 484 485 for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { 486 } 487 } 488 return ret; 489 } 490 491 void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) 492 { 493 if (src == NULL || src_len == 0 || dst == NULL) { 494 return; 495 } 496 497 const char* cur = src; 498 const char* const end = src + src_len; 499 char32_t* cur_utf32 = dst; 500 while (cur < end) { 501 size_t num_read; 502 *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); 503 cur += num_read; 504 } 505 *cur_utf32 = 0; 506 } 507 508 static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) 509 { 510 uint32_t unicode; 511 512 switch (length) 513 { 514 case 1: 515 return src[0]; 516 case 2: 517 unicode = src[0] & 0x1f; 518 utf8_shift_and_mask(&unicode, src[1]); 519 return unicode; 520 case 3: 521 unicode = src[0] & 0x0f; 522 utf8_shift_and_mask(&unicode, src[1]); 523 utf8_shift_and_mask(&unicode, src[2]); 524 return unicode; 525 case 4: 526 unicode = src[0] & 0x07; 527 utf8_shift_and_mask(&unicode, src[1]); 528 utf8_shift_and_mask(&unicode, src[2]); 529 utf8_shift_and_mask(&unicode, src[3]); 530 return unicode; 531 default: 532 return 0xffff; 533 } 534 535 //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); 536 } 537 538 ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len) 539 { 540 const uint8_t* const u8end = u8str + u8len; 541 const uint8_t* u8cur = u8str; 542 543 /* Validate that the UTF-8 is the correct len */ 544 size_t u16measuredLen = 0; 545 while (u8cur < u8end) { 546 u16measuredLen++; 547 int u8charLen = utf8_codepoint_len(*u8cur); 548 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); 549 if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 550 u8cur += u8charLen; 551 } 552 553 /** 554 * Make sure that we ended where we thought we would and the output UTF-16 555 * will be exactly how long we were told it would be. 556 */ 557 if (u8cur != u8end) { 558 return -1; 559 } 560 561 return u16measuredLen; 562 } 563 564 char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str) 565 { 566 const uint8_t* const u8end = u8str + u8len; 567 const uint8_t* u8cur = u8str; 568 char16_t* u16cur = u16str; 569 570 while (u8cur < u8end) { 571 size_t u8len = utf8_codepoint_len(*u8cur); 572 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 573 574 // Convert the UTF32 codepoint to one or more UTF16 codepoints 575 if (codepoint <= 0xFFFF) { 576 // Single UTF16 character 577 *u16cur++ = (char16_t) codepoint; 578 } else { 579 // Multiple UTF16 characters with surrogates 580 codepoint = codepoint - 0x10000; 581 *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 582 *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 583 } 584 585 u8cur += u8len; 586 } 587 return u16cur; 588 } 589 590 void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) { 591 char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str); 592 *end = 0; 593 } 594 595 char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) { 596 const uint8_t* const u8end = src + srcLen; 597 const uint8_t* u8cur = src; 598 const char16_t* const u16end = dst + dstLen; 599 char16_t* u16cur = dst; 600 601 while (u8cur < u8end && u16cur < u16end) { 602 size_t u8len = utf8_codepoint_len(*u8cur); 603 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 604 605 // Convert the UTF32 codepoint to one or more UTF16 codepoints 606 if (codepoint <= 0xFFFF) { 607 // Single UTF16 character 608 *u16cur++ = (char16_t) codepoint; 609 } else { 610 // Multiple UTF16 characters with surrogates 611 codepoint = codepoint - 0x10000; 612 *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 613 if (u16cur >= u16end) { 614 // Ooops... not enough room for this surrogate pair. 615 return u16cur-1; 616 } 617 *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 618 } 619 620 u8cur += u8len; 621 } 622 return u16cur; 623 } 624 625 } 626