1 /* 2 * Copyright (C) 2005 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <log/log.h> 18 #include <utils/Unicode.h> 19 20 #include <stddef.h> 21 22 #if defined(_WIN32) 23 # undef nhtol 24 # undef htonl 25 # undef nhtos 26 # undef htons 27 28 # define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) ) 29 # define htonl(x) ntohl(x) 30 # define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) ) 31 # define htons(x) ntohs(x) 32 #else 33 # include <netinet/in.h> 34 #endif 35 36 extern "C" { 37 38 static const char32_t kByteMask = 0x000000BF; 39 static const char32_t kByteMark = 0x00000080; 40 41 // Surrogates aren't valid for UTF-32 characters, so define some 42 // constants that will let us screen them out. 43 static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; 44 // Unused, here for completeness: 45 // static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; 46 // static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; 47 static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; 48 static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; 49 static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; 50 static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; 51 52 // Mask used to set appropriate bits in first byte of UTF-8 sequence, 53 // indexed by number of bytes in the sequence. 54 // 0xxxxxxx 55 // -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 56 // 110yyyyx 10xxxxxx 57 // -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 58 // 1110yyyy 10yxxxxx 10xxxxxx 59 // -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 60 // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 61 // -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 62 static const char32_t kFirstByteMark[] = { 63 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 64 }; 65 66 // -------------------------------------------------------------------------- 67 // UTF-32 68 // -------------------------------------------------------------------------- 69 70 /** 71 * Return number of UTF-8 bytes required for the character. If the character 72 * is invalid, return size of 0. 73 */ 74 static inline size_t utf32_codepoint_utf8_length(char32_t srcChar) 75 { 76 // Figure out how many bytes the result will require. 77 if (srcChar < 0x00000080) { 78 return 1; 79 } else if (srcChar < 0x00000800) { 80 return 2; 81 } else if (srcChar < 0x00010000) { 82 if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) { 83 return 3; 84 } else { 85 // Surrogates are invalid UTF-32 characters. 86 return 0; 87 } 88 } 89 // Max code point for Unicode is 0x0010FFFF. 90 else if (srcChar <= kUnicodeMaxCodepoint) { 91 return 4; 92 } else { 93 // Invalid UTF-32 character. 94 return 0; 95 } 96 } 97 98 // Write out the source character to <dstP>. 99 100 static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) 101 { 102 dstP += bytes; 103 switch (bytes) 104 { /* note: everything falls through. */ 105 case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 106 case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 107 case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 108 case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); 109 } 110 } 111 112 size_t strlen32(const char32_t *s) 113 { 114 const char32_t *ss = s; 115 while ( *ss ) 116 ss++; 117 return ss-s; 118 } 119 120 size_t strnlen32(const char32_t *s, size_t maxlen) 121 { 122 const char32_t *ss = s; 123 while ((maxlen > 0) && *ss) { 124 ss++; 125 maxlen--; 126 } 127 return ss-s; 128 } 129 130 static inline int32_t utf32_at_internal(const char* cur, size_t *num_read) 131 { 132 const char first_char = *cur; 133 if ((first_char & 0x80) == 0) { // ASCII 134 *num_read = 1; 135 return *cur; 136 } 137 cur++; 138 char32_t mask, to_ignore_mask; 139 size_t num_to_read = 0; 140 char32_t utf32 = first_char; 141 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; 142 (first_char & mask); 143 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 144 // 0x3F == 00111111 145 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 146 } 147 to_ignore_mask |= mask; 148 utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); 149 150 *num_read = num_to_read; 151 return static_cast<int32_t>(utf32); 152 } 153 154 int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index) 155 { 156 if (index >= src_len) { 157 return -1; 158 } 159 size_t dummy_index; 160 if (next_index == NULL) { 161 next_index = &dummy_index; 162 } 163 size_t num_read; 164 int32_t ret = utf32_at_internal(src + index, &num_read); 165 if (ret >= 0) { 166 *next_index = index + num_read; 167 } 168 169 return ret; 170 } 171 172 ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) 173 { 174 if (src == NULL || src_len == 0) { 175 return -1; 176 } 177 178 size_t ret = 0; 179 const char32_t *end = src + src_len; 180 while (src < end) { 181 ret += utf32_codepoint_utf8_length(*src++); 182 } 183 return ret; 184 } 185 186 void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len) 187 { 188 if (src == NULL || src_len == 0 || dst == NULL) { 189 return; 190 } 191 192 const char32_t *cur_utf32 = src; 193 const char32_t *end_utf32 = src + src_len; 194 char *cur = dst; 195 while (cur_utf32 < end_utf32) { 196 size_t len = utf32_codepoint_utf8_length(*cur_utf32); 197 LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len); 198 utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); 199 cur += len; 200 dst_len -= len; 201 } 202 LOG_ALWAYS_FATAL_IF(dst_len < 1, "dst_len < 1: %zu < 1", dst_len); 203 *cur = '\0'; 204 } 205 206 // -------------------------------------------------------------------------- 207 // UTF-16 208 // -------------------------------------------------------------------------- 209 210 int strcmp16(const char16_t *s1, const char16_t *s2) 211 { 212 char16_t ch; 213 int d = 0; 214 215 while ( 1 ) { 216 d = (int)(ch = *s1++) - (int)*s2++; 217 if ( d || !ch ) 218 break; 219 } 220 221 return d; 222 } 223 224 int strncmp16(const char16_t *s1, const char16_t *s2, size_t n) 225 { 226 char16_t ch; 227 int d = 0; 228 229 if (n == 0) { 230 return 0; 231 } 232 233 do { 234 d = (int)(ch = *s1++) - (int)*s2++; 235 if ( d || !ch ) { 236 break; 237 } 238 } while (--n); 239 240 return d; 241 } 242 243 char16_t *strcpy16(char16_t *dst, const char16_t *src) 244 { 245 char16_t *q = dst; 246 const char16_t *p = src; 247 char16_t ch; 248 249 do { 250 *q++ = ch = *p++; 251 } while ( ch ); 252 253 return dst; 254 } 255 256 size_t strlen16(const char16_t *s) 257 { 258 const char16_t *ss = s; 259 while ( *ss ) 260 ss++; 261 return ss-s; 262 } 263 264 265 char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n) 266 { 267 char16_t *q = dst; 268 const char16_t *p = src; 269 char ch; 270 271 while (n) { 272 n--; 273 *q++ = ch = *p++; 274 if ( !ch ) 275 break; 276 } 277 278 *q = 0; 279 280 return dst; 281 } 282 283 size_t strnlen16(const char16_t *s, size_t maxlen) 284 { 285 const char16_t *ss = s; 286 287 /* Important: the maxlen test must precede the reference through ss; 288 since the byte beyond the maximum may segfault */ 289 while ((maxlen > 0) && *ss) { 290 ss++; 291 maxlen--; 292 } 293 return ss-s; 294 } 295 296 char16_t* strstr16(const char16_t* src, const char16_t* target) 297 { 298 const char16_t needle = *target++; 299 const size_t target_len = strlen16(target); 300 if (needle != '\0') { 301 do { 302 do { 303 if (*src == '\0') { 304 return nullptr; 305 } 306 } while (*src++ != needle); 307 } while (strncmp16(src, target, target_len) != 0); 308 src--; 309 } 310 311 return (char16_t*)src; 312 } 313 314 315 int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2) 316 { 317 const char16_t* e1 = s1+n1; 318 const char16_t* e2 = s2+n2; 319 320 while (s1 < e1 && s2 < e2) { 321 const int d = (int)*s1++ - (int)*s2++; 322 if (d) { 323 return d; 324 } 325 } 326 327 return n1 < n2 328 ? (0 - (int)*s2) 329 : (n1 > n2 330 ? ((int)*s1 - 0) 331 : 0); 332 } 333 334 int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2) 335 { 336 const char16_t* e1 = s1H+n1; 337 const char16_t* e2 = s2N+n2; 338 339 while (s1H < e1 && s2N < e2) { 340 const char16_t c2 = ntohs(*s2N); 341 const int d = (int)*s1H++ - (int)c2; 342 s2N++; 343 if (d) { 344 return d; 345 } 346 } 347 348 return n1 < n2 349 ? (0 - (int)ntohs(*s2N)) 350 : (n1 > n2 351 ? ((int)*s1H - 0) 352 : 0); 353 } 354 355 void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len) 356 { 357 if (src == NULL || src_len == 0 || dst == NULL) { 358 return; 359 } 360 361 const char16_t* cur_utf16 = src; 362 const char16_t* const end_utf16 = src + src_len; 363 char *cur = dst; 364 while (cur_utf16 < end_utf16) { 365 char32_t utf32; 366 // surrogate pairs 367 if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16 368 && (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) { 369 utf32 = (*cur_utf16++ - 0xD800) << 10; 370 utf32 |= *cur_utf16++ - 0xDC00; 371 utf32 += 0x10000; 372 } else { 373 utf32 = (char32_t) *cur_utf16++; 374 } 375 const size_t len = utf32_codepoint_utf8_length(utf32); 376 LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len); 377 utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len); 378 cur += len; 379 dst_len -= len; 380 } 381 LOG_ALWAYS_FATAL_IF(dst_len < 1, "%zu < 1", dst_len); 382 *cur = '\0'; 383 } 384 385 // -------------------------------------------------------------------------- 386 // UTF-8 387 // -------------------------------------------------------------------------- 388 389 ssize_t utf8_length(const char *src) 390 { 391 const char *cur = src; 392 size_t ret = 0; 393 while (*cur != '\0') { 394 const char first_char = *cur++; 395 if ((first_char & 0x80) == 0) { // ASCII 396 ret += 1; 397 continue; 398 } 399 // (UTF-8's character must not be like 10xxxxxx, 400 // but 110xxxxx, 1110xxxx, ... or 1111110x) 401 if ((first_char & 0x40) == 0) { 402 return -1; 403 } 404 405 int32_t mask, to_ignore_mask; 406 size_t num_to_read = 0; 407 char32_t utf32 = 0; 408 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; 409 num_to_read < 5 && (first_char & mask); 410 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 411 if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx 412 return -1; 413 } 414 // 0x3F == 00111111 415 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 416 } 417 // "first_char" must be (110xxxxx - 11110xxx) 418 if (num_to_read == 5) { 419 return -1; 420 } 421 to_ignore_mask |= mask; 422 utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); 423 if (utf32 > kUnicodeMaxCodepoint) { 424 return -1; 425 } 426 427 ret += num_to_read; 428 } 429 return ret; 430 } 431 432 ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) 433 { 434 if (src == NULL || src_len == 0) { 435 return -1; 436 } 437 438 size_t ret = 0; 439 const char16_t* const end = src + src_len; 440 while (src < end) { 441 if ((*src & 0xFC00) == 0xD800 && (src + 1) < end 442 && (*(src + 1) & 0xFC00) == 0xDC00) { 443 // surrogate pairs are always 4 bytes. 444 ret += 4; 445 src += 2; 446 } else { 447 ret += utf32_codepoint_utf8_length((char32_t) *src++); 448 } 449 } 450 return ret; 451 } 452 453 /** 454 * Returns 1-4 based on the number of leading bits. 455 * 456 * 1111 -> 4 457 * 1110 -> 3 458 * 110x -> 2 459 * 10xx -> 1 460 * 0xxx -> 1 461 */ 462 static inline size_t utf8_codepoint_len(uint8_t ch) 463 { 464 return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; 465 } 466 467 static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte) 468 { 469 *codePoint <<= 6; 470 *codePoint |= 0x3F & byte; 471 } 472 473 size_t utf8_to_utf32_length(const char *src, size_t src_len) 474 { 475 if (src == NULL || src_len == 0) { 476 return 0; 477 } 478 size_t ret = 0; 479 const char* cur; 480 const char* end; 481 size_t num_to_skip; 482 for (cur = src, end = src + src_len, num_to_skip = 1; 483 cur < end; 484 cur += num_to_skip, ret++) { 485 const char first_char = *cur; 486 num_to_skip = 1; 487 if ((first_char & 0x80) == 0) { // ASCII 488 continue; 489 } 490 int32_t mask; 491 492 for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { 493 } 494 } 495 return ret; 496 } 497 498 void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst) 499 { 500 if (src == NULL || src_len == 0 || dst == NULL) { 501 return; 502 } 503 504 const char* cur = src; 505 const char* const end = src + src_len; 506 char32_t* cur_utf32 = dst; 507 while (cur < end) { 508 size_t num_read; 509 *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read)); 510 cur += num_read; 511 } 512 *cur_utf32 = 0; 513 } 514 515 static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length) 516 { 517 uint32_t unicode; 518 519 switch (length) 520 { 521 case 1: 522 return src[0]; 523 case 2: 524 unicode = src[0] & 0x1f; 525 utf8_shift_and_mask(&unicode, src[1]); 526 return unicode; 527 case 3: 528 unicode = src[0] & 0x0f; 529 utf8_shift_and_mask(&unicode, src[1]); 530 utf8_shift_and_mask(&unicode, src[2]); 531 return unicode; 532 case 4: 533 unicode = src[0] & 0x07; 534 utf8_shift_and_mask(&unicode, src[1]); 535 utf8_shift_and_mask(&unicode, src[2]); 536 utf8_shift_and_mask(&unicode, src[3]); 537 return unicode; 538 default: 539 return 0xffff; 540 } 541 542 //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result); 543 } 544 545 ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len) 546 { 547 const uint8_t* const u8end = u8str + u8len; 548 const uint8_t* u8cur = u8str; 549 550 /* Validate that the UTF-8 is the correct len */ 551 size_t u16measuredLen = 0; 552 while (u8cur < u8end) { 553 u16measuredLen++; 554 int u8charLen = utf8_codepoint_len(*u8cur); 555 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen); 556 if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16 557 u8cur += u8charLen; 558 } 559 560 /** 561 * Make sure that we ended where we thought we would and the output UTF-16 562 * will be exactly how long we were told it would be. 563 */ 564 if (u8cur != u8end) { 565 return -1; 566 } 567 568 return u16measuredLen; 569 } 570 571 char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str) 572 { 573 const uint8_t* const u8end = u8str + u8len; 574 const uint8_t* u8cur = u8str; 575 char16_t* u16cur = u16str; 576 577 while (u8cur < u8end) { 578 size_t u8len = utf8_codepoint_len(*u8cur); 579 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 580 581 // Convert the UTF32 codepoint to one or more UTF16 codepoints 582 if (codepoint <= 0xFFFF) { 583 // Single UTF16 character 584 *u16cur++ = (char16_t) codepoint; 585 } else { 586 // Multiple UTF16 characters with surrogates 587 codepoint = codepoint - 0x10000; 588 *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 589 *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 590 } 591 592 u8cur += u8len; 593 } 594 return u16cur; 595 } 596 597 void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) { 598 char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str); 599 *end = 0; 600 } 601 602 char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) { 603 const uint8_t* const u8end = src + srcLen; 604 const uint8_t* u8cur = src; 605 const char16_t* const u16end = dst + dstLen; 606 char16_t* u16cur = dst; 607 608 while (u8cur < u8end && u16cur < u16end) { 609 size_t u8len = utf8_codepoint_len(*u8cur); 610 uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len); 611 612 // Convert the UTF32 codepoint to one or more UTF16 codepoints 613 if (codepoint <= 0xFFFF) { 614 // Single UTF16 character 615 *u16cur++ = (char16_t) codepoint; 616 } else { 617 // Multiple UTF16 characters with surrogates 618 codepoint = codepoint - 0x10000; 619 *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800); 620 if (u16cur >= u16end) { 621 // Ooops... not enough room for this surrogate pair. 622 return u16cur-1; 623 } 624 *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00); 625 } 626 627 u8cur += u8len; 628 } 629 return u16cur; 630 } 631 632 } 633