1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2008-2011, International Business Machines 7 * Corporation, Google and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 */ 11 // Author : eldawy (at) google.com (Mohamed Eldawy) 12 // ucnvsel.cpp 13 // 14 // Purpose: To generate a list of encodings capable of handling 15 // a given Unicode text 16 // 17 // Started 09-April-2008 18 19 /** 20 * \file 21 * 22 * This is an implementation of an encoding selector. 23 * The goal is, given a unicode string, find the encodings 24 * this string can be mapped to. To make processing faster 25 * a trie is built when you call ucnvsel_open() that 26 * stores all encodings a codepoint can map to 27 */ 28 29 #include "unicode/ucnvsel.h" 30 31 #if !UCONFIG_NO_CONVERSION 32 33 #include <string.h> 34 35 #include "unicode/uchar.h" 36 #include "unicode/uniset.h" 37 #include "unicode/ucnv.h" 38 #include "unicode/ustring.h" 39 #include "unicode/uchriter.h" 40 #include "utrie2.h" 41 #include "propsvec.h" 42 #include "uassert.h" 43 #include "ucmndata.h" 44 #include "udataswp.h" 45 #include "uenumimp.h" 46 #include "cmemory.h" 47 #include "cstring.h" 48 49 U_NAMESPACE_USE 50 51 struct UConverterSelector { 52 UTrie2 *trie; // 16 bit trie containing offsets into pv 53 uint32_t* pv; // table of bits! 54 int32_t pvCount; 55 char** encodings; // which encodings did user ask to use? 56 int32_t encodingsCount; 57 int32_t encodingStrLength; 58 uint8_t* swapped; 59 UBool ownPv, ownEncodingStrings; 60 }; 61 62 static void generateSelectorData(UConverterSelector* result, 63 UPropsVectors *upvec, 64 const USet* excludedCodePoints, 65 const UConverterUnicodeSet whichSet, 66 UErrorCode* status) { 67 if (U_FAILURE(*status)) { 68 return; 69 } 70 71 int32_t columns = (result->encodingsCount+31)/32; 72 73 // set errorValue to all-ones 74 for (int32_t col = 0; col < columns; col++) { 75 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, 76 col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status); 77 } 78 79 for (int32_t i = 0; i < result->encodingsCount; ++i) { 80 uint32_t mask; 81 uint32_t column; 82 int32_t item_count; 83 int32_t j; 84 UConverter* test_converter = ucnv_open(result->encodings[i], status); 85 if (U_FAILURE(*status)) { 86 return; 87 } 88 USet* unicode_point_set; 89 unicode_point_set = uset_open(1, 0); // empty set 90 91 ucnv_getUnicodeSet(test_converter, unicode_point_set, 92 whichSet, status); 93 if (U_FAILURE(*status)) { 94 ucnv_close(test_converter); 95 return; 96 } 97 98 column = i / 32; 99 mask = 1 << (i%32); 100 // now iterate over intervals on set i! 101 item_count = uset_getItemCount(unicode_point_set); 102 103 for (j = 0; j < item_count; ++j) { 104 UChar32 start_char; 105 UChar32 end_char; 106 UErrorCode smallStatus = U_ZERO_ERROR; 107 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, 108 &smallStatus); 109 if (U_FAILURE(smallStatus)) { 110 // this will be reached for the converters that fill the set with 111 // strings. Those should be ignored by our system 112 } else { 113 upvec_setValue(upvec, start_char, end_char, column, static_cast<uint32_t>(~0), mask, 114 status); 115 } 116 } 117 ucnv_close(test_converter); 118 uset_close(unicode_point_set); 119 if (U_FAILURE(*status)) { 120 return; 121 } 122 } 123 124 // handle excluded encodings! Simply set their values to all 1's in the upvec 125 if (excludedCodePoints) { 126 int32_t item_count = uset_getItemCount(excludedCodePoints); 127 for (int32_t j = 0; j < item_count; ++j) { 128 UChar32 start_char; 129 UChar32 end_char; 130 131 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, 132 status); 133 for (int32_t col = 0; col < columns; col++) { 134 upvec_setValue(upvec, start_char, end_char, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), 135 status); 136 } 137 } 138 } 139 140 // alright. Now, let's put things in the same exact form you'd get when you 141 // unserialize things. 142 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); 143 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); 144 result->pvCount *= columns; // number of uint32_t = rows * columns 145 result->ownPv = TRUE; 146 } 147 148 /* open a selector. If converterListSize is 0, build for all converters. 149 If excludedCodePoints is NULL, don't exclude any codepoints */ 150 U_CAPI UConverterSelector* U_EXPORT2 151 ucnvsel_open(const char* const* converterList, int32_t converterListSize, 152 const USet* excludedCodePoints, 153 const UConverterUnicodeSet whichSet, UErrorCode* status) { 154 // check if already failed 155 if (U_FAILURE(*status)) { 156 return NULL; 157 } 158 // ensure args make sense! 159 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { 160 *status = U_ILLEGAL_ARGUMENT_ERROR; 161 return NULL; 162 } 163 164 // allocate a new converter 165 LocalUConverterSelectorPointer newSelector( 166 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); 167 if (newSelector.isNull()) { 168 *status = U_MEMORY_ALLOCATION_ERROR; 169 return NULL; 170 } 171 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); 172 173 if (converterListSize == 0) { 174 converterList = NULL; 175 converterListSize = ucnv_countAvailable(); 176 } 177 newSelector->encodings = 178 (char**)uprv_malloc(converterListSize * sizeof(char*)); 179 if (!newSelector->encodings) { 180 *status = U_MEMORY_ALLOCATION_ERROR; 181 return NULL; 182 } 183 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() 184 185 // make a backup copy of the list of converters 186 int32_t totalSize = 0; 187 int32_t i; 188 for (i = 0; i < converterListSize; i++) { 189 totalSize += 190 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; 191 } 192 // 4-align the totalSize to 4-align the size of the serialized form 193 int32_t encodingStrPadding = totalSize & 3; 194 if (encodingStrPadding != 0) { 195 encodingStrPadding = 4 - encodingStrPadding; 196 } 197 newSelector->encodingStrLength = totalSize += encodingStrPadding; 198 char* allStrings = (char*) uprv_malloc(totalSize); 199 if (!allStrings) { 200 *status = U_MEMORY_ALLOCATION_ERROR; 201 return NULL; 202 } 203 204 for (i = 0; i < converterListSize; i++) { 205 newSelector->encodings[i] = allStrings; 206 uprv_strcpy(newSelector->encodings[i], 207 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); 208 allStrings += uprv_strlen(newSelector->encodings[i]) + 1; 209 } 210 while (encodingStrPadding > 0) { 211 *allStrings++ = 0; 212 --encodingStrPadding; 213 } 214 215 newSelector->ownEncodingStrings = TRUE; 216 newSelector->encodingsCount = converterListSize; 217 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); 218 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); 219 upvec_close(upvec); 220 221 if (U_FAILURE(*status)) { 222 return NULL; 223 } 224 225 return newSelector.orphan(); 226 } 227 228 /* close opened selector */ 229 U_CAPI void U_EXPORT2 230 ucnvsel_close(UConverterSelector *sel) { 231 if (!sel) { 232 return; 233 } 234 if (sel->ownEncodingStrings) { 235 uprv_free(sel->encodings[0]); 236 } 237 uprv_free(sel->encodings); 238 if (sel->ownPv) { 239 uprv_free(sel->pv); 240 } 241 utrie2_close(sel->trie); 242 uprv_free(sel->swapped); 243 uprv_free(sel); 244 } 245 246 static const UDataInfo dataInfo = { 247 sizeof(UDataInfo), 248 0, 249 250 U_IS_BIG_ENDIAN, 251 U_CHARSET_FAMILY, 252 U_SIZEOF_UCHAR, 253 0, 254 255 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ 256 { 1, 0, 0, 0 }, /* formatVersion */ 257 { 0, 0, 0, 0 } /* dataVersion */ 258 }; 259 260 enum { 261 UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes 262 UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors 263 UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names 264 UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding 265 UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader 266 UCNVSEL_INDEX_COUNT = 16 267 }; 268 269 /* 270 * Serialized form of a UConverterSelector, formatVersion 1: 271 * 272 * The serialized form begins with a standard ICU DataHeader with a UDataInfo 273 * as the template above. 274 * This is followed by: 275 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above 276 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes 277 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors 278 * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding 279 */ 280 281 /* serialize a selector */ 282 U_CAPI int32_t U_EXPORT2 283 ucnvsel_serialize(const UConverterSelector* sel, 284 void* buffer, int32_t bufferCapacity, UErrorCode* status) { 285 // check if already failed 286 if (U_FAILURE(*status)) { 287 return 0; 288 } 289 // ensure args make sense! 290 uint8_t *p = (uint8_t *)buffer; 291 if (bufferCapacity < 0 || 292 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 293 ) { 294 *status = U_ILLEGAL_ARGUMENT_ERROR; 295 return 0; 296 } 297 // add up the size of the serialized form 298 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); 299 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 300 return 0; 301 } 302 *status = U_ZERO_ERROR; 303 304 DataHeader header; 305 uprv_memset(&header, 0, sizeof(header)); 306 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); 307 header.dataHeader.magic1 = 0xda; 308 header.dataHeader.magic2 = 0x27; 309 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); 310 311 int32_t indexes[UCNVSEL_INDEX_COUNT] = { 312 serializedTrieSize, 313 sel->pvCount, 314 sel->encodingsCount, 315 sel->encodingStrLength 316 }; 317 318 int32_t totalSize = 319 header.dataHeader.headerSize + 320 (int32_t)sizeof(indexes) + 321 serializedTrieSize + 322 sel->pvCount * 4 + 323 sel->encodingStrLength; 324 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; 325 if (totalSize > bufferCapacity) { 326 *status = U_BUFFER_OVERFLOW_ERROR; 327 return totalSize; 328 } 329 // ok, save! 330 int32_t length = header.dataHeader.headerSize; 331 uprv_memcpy(p, &header, sizeof(header)); 332 uprv_memset(p + sizeof(header), 0, length - sizeof(header)); 333 p += length; 334 335 length = (int32_t)sizeof(indexes); 336 uprv_memcpy(p, indexes, length); 337 p += length; 338 339 utrie2_serialize(sel->trie, p, serializedTrieSize, status); 340 p += serializedTrieSize; 341 342 length = sel->pvCount * 4; 343 uprv_memcpy(p, sel->pv, length); 344 p += length; 345 346 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); 347 p += sel->encodingStrLength; 348 349 return totalSize; 350 } 351 352 /** 353 * swap a selector into the desired Endianness and Asciiness of 354 * the system. Just as FYI, selectors are always saved in the format 355 * of the system that created them. They are only converted if used 356 * on another system. In other words, selectors created on different 357 * system can be different even if the params are identical (endianness 358 * and Asciiness differences only) 359 * 360 * @param ds pointer to data swapper containing swapping info 361 * @param inData pointer to incoming data 362 * @param length length of inData in bytes 363 * @param outData pointer to output data. Capacity should 364 * be at least equal to capacity of inData 365 * @param status an in/out ICU UErrorCode 366 * @return 0 on failure, number of bytes swapped on success 367 * number of bytes swapped can be smaller than length 368 */ 369 static int32_t 370 ucnvsel_swap(const UDataSwapper *ds, 371 const void *inData, int32_t length, 372 void *outData, UErrorCode *status) { 373 /* udata_swapDataHeader checks the arguments */ 374 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); 375 if(U_FAILURE(*status)) { 376 return 0; 377 } 378 379 /* check data format and format version */ 380 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); 381 if(!( 382 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ 383 pInfo->dataFormat[1] == 0x53 && 384 pInfo->dataFormat[2] == 0x65 && 385 pInfo->dataFormat[3] == 0x6c 386 )) { 387 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", 388 pInfo->dataFormat[0], pInfo->dataFormat[1], 389 pInfo->dataFormat[2], pInfo->dataFormat[3]); 390 *status = U_INVALID_FORMAT_ERROR; 391 return 0; 392 } 393 if(pInfo->formatVersion[0] != 1) { 394 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", 395 pInfo->formatVersion[0]); 396 *status = U_UNSUPPORTED_ERROR; 397 return 0; 398 } 399 400 if(length >= 0) { 401 length -= headerSize; 402 if(length < 16*4) { 403 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", 404 length); 405 *status = U_INDEX_OUTOFBOUNDS_ERROR; 406 return 0; 407 } 408 } 409 410 const uint8_t *inBytes = (const uint8_t *)inData + headerSize; 411 uint8_t *outBytes = (uint8_t *)outData + headerSize; 412 413 /* read the indexes */ 414 const int32_t *inIndexes = (const int32_t *)inBytes; 415 int32_t indexes[16]; 416 int32_t i; 417 for(i = 0; i < 16; ++i) { 418 indexes[i] = udata_readInt32(ds, inIndexes[i]); 419 } 420 421 /* get the total length of the data */ 422 int32_t size = indexes[UCNVSEL_INDEX_SIZE]; 423 if(length >= 0) { 424 if(length < size) { 425 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", 426 length); 427 *status = U_INDEX_OUTOFBOUNDS_ERROR; 428 return 0; 429 } 430 431 /* copy the data for inaccessible bytes */ 432 if(inBytes != outBytes) { 433 uprv_memcpy(outBytes, inBytes, size); 434 } 435 436 int32_t offset = 0, count; 437 438 /* swap the int32_t indexes[] */ 439 count = UCNVSEL_INDEX_COUNT*4; 440 ds->swapArray32(ds, inBytes, count, outBytes, status); 441 offset += count; 442 443 /* swap the UTrie2 */ 444 count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; 445 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); 446 offset += count; 447 448 /* swap the uint32_t pv[] */ 449 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; 450 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); 451 offset += count; 452 453 /* swap the encoding names */ 454 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 455 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); 456 offset += count; 457 458 U_ASSERT(offset == size); 459 } 460 461 return headerSize + size; 462 } 463 464 /* unserialize a selector */ 465 U_CAPI UConverterSelector* U_EXPORT2 466 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { 467 // check if already failed 468 if (U_FAILURE(*status)) { 469 return NULL; 470 } 471 // ensure args make sense! 472 const uint8_t *p = (const uint8_t *)buffer; 473 if (length <= 0 || 474 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 475 ) { 476 *status = U_ILLEGAL_ARGUMENT_ERROR; 477 return NULL; 478 } 479 // header 480 if (length < 32) { 481 // not even enough space for a minimal header 482 *status = U_INDEX_OUTOFBOUNDS_ERROR; 483 return NULL; 484 } 485 const DataHeader *pHeader = (const DataHeader *)p; 486 if (!( 487 pHeader->dataHeader.magic1==0xda && 488 pHeader->dataHeader.magic2==0x27 && 489 pHeader->info.dataFormat[0] == 0x43 && 490 pHeader->info.dataFormat[1] == 0x53 && 491 pHeader->info.dataFormat[2] == 0x65 && 492 pHeader->info.dataFormat[3] == 0x6c 493 )) { 494 /* header not valid or dataFormat not recognized */ 495 *status = U_INVALID_FORMAT_ERROR; 496 return NULL; 497 } 498 if (pHeader->info.formatVersion[0] != 1) { 499 *status = U_UNSUPPORTED_ERROR; 500 return NULL; 501 } 502 uint8_t* swapped = NULL; 503 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || 504 pHeader->info.charsetFamily != U_CHARSET_FAMILY 505 ) { 506 // swap the data 507 UDataSwapper *ds = 508 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); 509 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); 510 if (U_FAILURE(*status)) { 511 udata_closeSwapper(ds); 512 return NULL; 513 } 514 if (length < totalSize) { 515 udata_closeSwapper(ds); 516 *status = U_INDEX_OUTOFBOUNDS_ERROR; 517 return NULL; 518 } 519 swapped = (uint8_t*)uprv_malloc(totalSize); 520 if (swapped == NULL) { 521 udata_closeSwapper(ds); 522 *status = U_MEMORY_ALLOCATION_ERROR; 523 return NULL; 524 } 525 ucnvsel_swap(ds, p, length, swapped, status); 526 udata_closeSwapper(ds); 527 if (U_FAILURE(*status)) { 528 uprv_free(swapped); 529 return NULL; 530 } 531 p = swapped; 532 pHeader = (const DataHeader *)p; 533 } 534 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { 535 // not even enough space for the header and the indexes 536 uprv_free(swapped); 537 *status = U_INDEX_OUTOFBOUNDS_ERROR; 538 return NULL; 539 } 540 p += pHeader->dataHeader.headerSize; 541 length -= pHeader->dataHeader.headerSize; 542 // indexes 543 const int32_t *indexes = (const int32_t *)p; 544 if (length < indexes[UCNVSEL_INDEX_SIZE]) { 545 uprv_free(swapped); 546 *status = U_INDEX_OUTOFBOUNDS_ERROR; 547 return NULL; 548 } 549 p += UCNVSEL_INDEX_COUNT * 4; 550 // create and populate the selector object 551 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 552 char **encodings = 553 (char **)uprv_malloc( 554 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); 555 if (sel == NULL || encodings == NULL) { 556 uprv_free(swapped); 557 uprv_free(sel); 558 uprv_free(encodings); 559 *status = U_MEMORY_ALLOCATION_ERROR; 560 return NULL; 561 } 562 uprv_memset(sel, 0, sizeof(UConverterSelector)); 563 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; 564 sel->encodings = encodings; 565 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; 566 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 567 sel->swapped = swapped; 568 // trie 569 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 570 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, 571 status); 572 p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; 573 if (U_FAILURE(*status)) { 574 ucnvsel_close(sel); 575 return NULL; 576 } 577 // bit vectors 578 sel->pv = (uint32_t *)p; 579 p += sel->pvCount * 4; 580 // encoding names 581 char* s = (char*)p; 582 for (int32_t i = 0; i < sel->encodingsCount; ++i) { 583 sel->encodings[i] = s; 584 s += uprv_strlen(s) + 1; 585 } 586 p += sel->encodingStrLength; 587 588 return sel; 589 } 590 591 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just 592 // iterate over the selected encodings 593 struct Enumerator { 594 int16_t* index; 595 int16_t length; 596 int16_t cur; 597 const UConverterSelector* sel; 598 }; 599 600 U_CDECL_BEGIN 601 602 static void U_CALLCONV 603 ucnvsel_close_selector_iterator(UEnumeration *enumerator) { 604 uprv_free(((Enumerator*)(enumerator->context))->index); 605 uprv_free(enumerator->context); 606 uprv_free(enumerator); 607 } 608 609 610 static int32_t U_CALLCONV 611 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { 612 // check if already failed 613 if (U_FAILURE(*status)) { 614 return 0; 615 } 616 return ((Enumerator*)(enumerator->context))->length; 617 } 618 619 620 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, 621 int32_t* resultLength, 622 UErrorCode* status) { 623 // check if already failed 624 if (U_FAILURE(*status)) { 625 return NULL; 626 } 627 628 int16_t cur = ((Enumerator*)(enumerator->context))->cur; 629 const UConverterSelector* sel; 630 const char* result; 631 if (cur >= ((Enumerator*)(enumerator->context))->length) { 632 return NULL; 633 } 634 sel = ((Enumerator*)(enumerator->context))->sel; 635 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; 636 ((Enumerator*)(enumerator->context))->cur++; 637 if (resultLength) { 638 *resultLength = (int32_t)uprv_strlen(result); 639 } 640 return result; 641 } 642 643 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, 644 UErrorCode* status) { 645 // check if already failed 646 if (U_FAILURE(*status)) { 647 return ; 648 } 649 ((Enumerator*)(enumerator->context))->cur = 0; 650 } 651 652 U_CDECL_END 653 654 655 static const UEnumeration defaultEncodings = { 656 NULL, 657 NULL, 658 ucnvsel_close_selector_iterator, 659 ucnvsel_count_encodings, 660 uenum_unextDefault, 661 ucnvsel_next_encoding, 662 ucnvsel_reset_iterator 663 }; 664 665 666 // internal fn to intersect two sets of masks 667 // returns whether the mask has reduced to all zeros 668 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { 669 int32_t i; 670 uint32_t oredDest = 0; 671 for (i = 0 ; i < len ; ++i) { 672 oredDest |= (dest[i] &= source1[i]); 673 } 674 return oredDest == 0; 675 } 676 677 // internal fn to count how many 1's are there in a mask 678 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 679 static int16_t countOnes(uint32_t* mask, int32_t len) { 680 int32_t i, totalOnes = 0; 681 for (i = 0 ; i < len ; ++i) { 682 uint32_t ent = mask[i]; 683 for (; ent; totalOnes++) 684 { 685 ent &= ent - 1; // clear the least significant bit set 686 } 687 } 688 return static_cast<int16_t>(totalOnes); 689 } 690 691 692 /* internal function! */ 693 static UEnumeration *selectForMask(const UConverterSelector* sel, 694 uint32_t *mask, UErrorCode *status) { 695 // this is the context we will use. Store a table of indices to which 696 // encodings are legit. 697 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); 698 if (result == NULL) { 699 uprv_free(mask); 700 *status = U_MEMORY_ALLOCATION_ERROR; 701 return NULL; 702 } 703 result->index = NULL; // this will be allocated later! 704 result->length = result->cur = 0; 705 result->sel = sel; 706 707 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); 708 if (en == NULL) { 709 // TODO(markus): Combine Enumerator and UEnumeration into one struct. 710 uprv_free(mask); 711 uprv_free(result); 712 *status = U_MEMORY_ALLOCATION_ERROR; 713 return NULL; 714 } 715 memcpy(en, &defaultEncodings, sizeof(UEnumeration)); 716 en->context = result; 717 718 int32_t columns = (sel->encodingsCount+31)/32; 719 int16_t numOnes = countOnes(mask, columns); 720 // now, we know the exact space we need for index 721 if (numOnes > 0) { 722 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); 723 724 int32_t i, j; 725 int16_t k = 0; 726 for (j = 0 ; j < columns; j++) { 727 uint32_t v = mask[j]; 728 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { 729 if ((v & 1) != 0) { 730 result->index[result->length++] = k; 731 } 732 v >>= 1; 733 } 734 } 735 } //otherwise, index will remain NULL (and will never be touched by 736 //the enumerator code anyway) 737 uprv_free(mask); 738 return en; 739 } 740 741 /* check a string against the selector - UTF16 version */ 742 U_CAPI UEnumeration * U_EXPORT2 743 ucnvsel_selectForString(const UConverterSelector* sel, 744 const UChar *s, int32_t length, UErrorCode *status) { 745 // check if already failed 746 if (U_FAILURE(*status)) { 747 return NULL; 748 } 749 // ensure args make sense! 750 if (sel == NULL || (s == NULL && length != 0)) { 751 *status = U_ILLEGAL_ARGUMENT_ERROR; 752 return NULL; 753 } 754 755 int32_t columns = (sel->encodingsCount+31)/32; 756 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 757 if (mask == NULL) { 758 *status = U_MEMORY_ALLOCATION_ERROR; 759 return NULL; 760 } 761 uprv_memset(mask, ~0, columns *4); 762 763 if(s!=NULL) { 764 const UChar *limit; 765 if (length >= 0) { 766 limit = s + length; 767 } else { 768 limit = NULL; 769 } 770 771 while (limit == NULL ? *s != 0 : s != limit) { 772 UChar32 c; 773 uint16_t pvIndex; 774 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); 775 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 776 break; 777 } 778 } 779 } 780 return selectForMask(sel, mask, status); 781 } 782 783 /* check a string against the selector - UTF8 version */ 784 U_CAPI UEnumeration * U_EXPORT2 785 ucnvsel_selectForUTF8(const UConverterSelector* sel, 786 const char *s, int32_t length, UErrorCode *status) { 787 // check if already failed 788 if (U_FAILURE(*status)) { 789 return NULL; 790 } 791 // ensure args make sense! 792 if (sel == NULL || (s == NULL && length != 0)) { 793 *status = U_ILLEGAL_ARGUMENT_ERROR; 794 return NULL; 795 } 796 797 int32_t columns = (sel->encodingsCount+31)/32; 798 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 799 if (mask == NULL) { 800 *status = U_MEMORY_ALLOCATION_ERROR; 801 return NULL; 802 } 803 uprv_memset(mask, ~0, columns *4); 804 805 if (length < 0) { 806 length = (int32_t)uprv_strlen(s); 807 } 808 809 if(s!=NULL) { 810 const char *limit = s + length; 811 812 while (s != limit) { 813 uint16_t pvIndex; 814 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); 815 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 816 break; 817 } 818 } 819 } 820 return selectForMask(sel, mask, status); 821 } 822 823 #endif // !UCONFIG_NO_CONVERSION 824