1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2008-2009, International Business Machines 5 * Corporation, Google and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 */ 9 // Author : eldawy (at) google.com (Mohamed Eldawy) 10 // ucnvsel.cpp 11 // 12 // Purpose: To generate a list of encodings capable of handling 13 // a given Unicode text 14 // 15 // Started 09-April-2008 16 17 /** 18 * \file 19 * 20 * This is an implementation of an encoding selector. 21 * The goal is, given a unicode string, find the encodings 22 * this string can be mapped to. To make processing faster 23 * a trie is built when you call ucnvsel_open() that 24 * stores all encodings a codepoint can map to 25 */ 26 27 #include "unicode/ucnvsel.h" 28 29 #include <string.h> 30 31 #include "unicode/uchar.h" 32 #include "unicode/uniset.h" 33 #include "unicode/ucnv.h" 34 #include "unicode/ustring.h" 35 #include "unicode/uchriter.h" 36 #include "utrie2.h" 37 #include "propsvec.h" 38 #include "uassert.h" 39 #include "ucmndata.h" 40 #include "uenumimp.h" 41 #include "cmemory.h" 42 #include "cstring.h" 43 44 U_NAMESPACE_USE 45 46 struct UConverterSelector { 47 UTrie2 *trie; // 16 bit trie containing offsets into pv 48 uint32_t* pv; // table of bits! 49 int32_t pvCount; 50 char** encodings; // which encodings did user ask to use? 51 int32_t encodingsCount; 52 int32_t encodingStrLength; 53 uint8_t* swapped; 54 UBool ownPv, ownEncodingStrings; 55 }; 56 57 static void generateSelectorData(UConverterSelector* result, 58 UPropsVectors *upvec, 59 const USet* excludedCodePoints, 60 const UConverterUnicodeSet whichSet, 61 UErrorCode* status) { 62 if (U_FAILURE(*status)) { 63 return; 64 } 65 66 int32_t columns = (result->encodingsCount+31)/32; 67 68 // set errorValue to all-ones 69 for (int32_t col = 0; col < columns; col++) { 70 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, 71 col, ~0, ~0, status); 72 } 73 74 for (int32_t i = 0; i < result->encodingsCount; ++i) { 75 uint32_t mask; 76 uint32_t column; 77 int32_t item_count; 78 int32_t j; 79 UConverter* test_converter = ucnv_open(result->encodings[i], status); 80 if (U_FAILURE(*status)) { 81 return; 82 } 83 USet* unicode_point_set; 84 unicode_point_set = uset_open(1, 0); // empty set 85 86 ucnv_getUnicodeSet(test_converter, unicode_point_set, 87 whichSet, status); 88 if (U_FAILURE(*status)) { 89 ucnv_close(test_converter); 90 return; 91 } 92 93 column = i / 32; 94 mask = 1 << (i%32); 95 // now iterate over intervals on set i! 96 item_count = uset_getItemCount(unicode_point_set); 97 98 for (j = 0; j < item_count; ++j) { 99 UChar32 start_char; 100 UChar32 end_char; 101 UErrorCode smallStatus = U_ZERO_ERROR; 102 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, 103 &smallStatus); 104 if (U_FAILURE(smallStatus)) { 105 // this will be reached for the converters that fill the set with 106 // strings. Those should be ignored by our system 107 } else { 108 upvec_setValue(upvec, start_char, end_char, column, ~0, mask, 109 status); 110 } 111 } 112 ucnv_close(test_converter); 113 uset_close(unicode_point_set); 114 if (U_FAILURE(*status)) { 115 return; 116 } 117 } 118 119 // handle excluded encodings! Simply set their values to all 1's in the upvec 120 if (excludedCodePoints) { 121 int32_t item_count = uset_getItemCount(excludedCodePoints); 122 for (int32_t j = 0; j < item_count; ++j) { 123 UChar32 start_char; 124 UChar32 end_char; 125 126 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, 127 status); 128 for (int32_t col = 0; col < columns; col++) { 129 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, 130 status); 131 } 132 } 133 } 134 135 // alright. Now, let's put things in the same exact form you'd get when you 136 // unserialize things. 137 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); 138 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); 139 result->pvCount *= columns; // number of uint32_t = rows * columns 140 result->ownPv = TRUE; 141 } 142 143 /* open a selector. If converterListSize is 0, build for all converters. 144 If excludedCodePoints is NULL, don't exclude any codepoints */ 145 U_CAPI UConverterSelector* U_EXPORT2 146 ucnvsel_open(const char* const* converterList, int32_t converterListSize, 147 const USet* excludedCodePoints, 148 const UConverterUnicodeSet whichSet, UErrorCode* status) { 149 // check if already failed 150 if (U_FAILURE(*status)) { 151 return NULL; 152 } 153 // ensure args make sense! 154 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { 155 *status = U_ILLEGAL_ARGUMENT_ERROR; 156 return NULL; 157 } 158 159 // allocate a new converter 160 LocalUConverterSelectorPointer newSelector( 161 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); 162 if (newSelector.isNull()) { 163 *status = U_MEMORY_ALLOCATION_ERROR; 164 return NULL; 165 } 166 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); 167 168 if (converterListSize == 0) { 169 converterList = NULL; 170 converterListSize = ucnv_countAvailable(); 171 } 172 newSelector->encodings = 173 (char**)uprv_malloc(converterListSize * sizeof(char*)); 174 if (!newSelector->encodings) { 175 *status = U_MEMORY_ALLOCATION_ERROR; 176 return NULL; 177 } 178 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() 179 180 // make a backup copy of the list of converters 181 int32_t totalSize = 0; 182 int32_t i; 183 for (i = 0; i < converterListSize; i++) { 184 totalSize += 185 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; 186 } 187 // 4-align the totalSize to 4-align the size of the serialized form 188 int32_t encodingStrPadding = totalSize & 3; 189 if (encodingStrPadding != 0) { 190 encodingStrPadding = 4 - encodingStrPadding; 191 } 192 newSelector->encodingStrLength = totalSize += encodingStrPadding; 193 char* allStrings = (char*) uprv_malloc(totalSize); 194 if (!allStrings) { 195 *status = U_MEMORY_ALLOCATION_ERROR; 196 return NULL; 197 } 198 199 for (i = 0; i < converterListSize; i++) { 200 newSelector->encodings[i] = allStrings; 201 uprv_strcpy(newSelector->encodings[i], 202 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); 203 allStrings += uprv_strlen(newSelector->encodings[i]) + 1; 204 } 205 while (encodingStrPadding > 0) { 206 *allStrings++ = 0; 207 --encodingStrPadding; 208 } 209 210 newSelector->ownEncodingStrings = TRUE; 211 newSelector->encodingsCount = converterListSize; 212 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); 213 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); 214 upvec_close(upvec); 215 216 if (U_FAILURE(*status)) { 217 return NULL; 218 } 219 220 return newSelector.orphan(); 221 } 222 223 /* close opened selector */ 224 U_CAPI void U_EXPORT2 225 ucnvsel_close(UConverterSelector *sel) { 226 if (!sel) { 227 return; 228 } 229 if (sel->ownEncodingStrings) { 230 uprv_free(sel->encodings[0]); 231 } 232 uprv_free(sel->encodings); 233 if (sel->ownPv) { 234 uprv_free(sel->pv); 235 } 236 utrie2_close(sel->trie); 237 uprv_free(sel->swapped); 238 uprv_free(sel); 239 } 240 241 static const UDataInfo dataInfo = { 242 sizeof(UDataInfo), 243 0, 244 245 U_IS_BIG_ENDIAN, 246 U_CHARSET_FAMILY, 247 U_SIZEOF_UCHAR, 248 0, 249 250 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ 251 { 1, 0, 0, 0 }, /* formatVersion */ 252 { 0, 0, 0, 0 } /* dataVersion */ 253 }; 254 255 enum { 256 UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes 257 UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors 258 UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names 259 UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding 260 UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader 261 UCNVSEL_INDEX_COUNT = 16 262 }; 263 264 /* 265 * Serialized form of a UConverterSelector, formatVersion 1: 266 * 267 * The serialized form begins with a standard ICU DataHeader with a UDataInfo 268 * as the template above. 269 * This is followed by: 270 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above 271 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes 272 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors 273 * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding 274 */ 275 276 /* serialize a selector */ 277 U_CAPI int32_t U_EXPORT2 278 ucnvsel_serialize(const UConverterSelector* sel, 279 void* buffer, int32_t bufferCapacity, UErrorCode* status) { 280 // check if already failed 281 if (U_FAILURE(*status)) { 282 return 0; 283 } 284 // ensure args make sense! 285 uint8_t *p = (uint8_t *)buffer; 286 if (bufferCapacity < 0 || 287 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 288 ) { 289 *status = U_ILLEGAL_ARGUMENT_ERROR; 290 return 0; 291 } 292 // add up the size of the serialized form 293 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); 294 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 295 return 0; 296 } 297 *status = U_ZERO_ERROR; 298 299 DataHeader header; 300 uprv_memset(&header, 0, sizeof(header)); 301 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); 302 header.dataHeader.magic1 = 0xda; 303 header.dataHeader.magic2 = 0x27; 304 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); 305 306 int32_t indexes[UCNVSEL_INDEX_COUNT] = { 307 serializedTrieSize, 308 sel->pvCount, 309 sel->encodingsCount, 310 sel->encodingStrLength 311 }; 312 313 int32_t totalSize = 314 header.dataHeader.headerSize + 315 (int32_t)sizeof(indexes) + 316 serializedTrieSize + 317 sel->pvCount * 4 + 318 sel->encodingStrLength; 319 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; 320 if (totalSize > bufferCapacity) { 321 *status = U_BUFFER_OVERFLOW_ERROR; 322 return totalSize; 323 } 324 // ok, save! 325 int32_t length = header.dataHeader.headerSize; 326 uprv_memcpy(p, &header, sizeof(header)); 327 uprv_memset(p + sizeof(header), 0, length - sizeof(header)); 328 p += length; 329 330 length = (int32_t)sizeof(indexes); 331 uprv_memcpy(p, indexes, length); 332 p += length; 333 334 utrie2_serialize(sel->trie, p, serializedTrieSize, status); 335 p += serializedTrieSize; 336 337 length = sel->pvCount * 4; 338 uprv_memcpy(p, sel->pv, length); 339 p += length; 340 341 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); 342 p += sel->encodingStrLength; 343 344 return totalSize; 345 } 346 347 /** 348 * swap a selector into the desired Endianness and Asciiness of 349 * the system. Just as FYI, selectors are always saved in the format 350 * of the system that created them. They are only converted if used 351 * on another system. In other words, selectors created on different 352 * system can be different even if the params are identical (endianness 353 * and Asciiness differences only) 354 * 355 * @param ds pointer to data swapper containing swapping info 356 * @param inData pointer to incoming data 357 * @param length length of inData in bytes 358 * @param outData pointer to output data. Capacity should 359 * be at least equal to capacity of inData 360 * @param status an in/out ICU UErrorCode 361 * @return 0 on failure, number of bytes swapped on success 362 * number of bytes swapped can be smaller than length 363 */ 364 static int32_t 365 ucnvsel_swap(const UDataSwapper *ds, 366 const void *inData, int32_t length, 367 void *outData, UErrorCode *status) { 368 /* udata_swapDataHeader checks the arguments */ 369 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); 370 if(U_FAILURE(*status)) { 371 return 0; 372 } 373 374 /* check data format and format version */ 375 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); 376 if(!( 377 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ 378 pInfo->dataFormat[1] == 0x53 && 379 pInfo->dataFormat[2] == 0x65 && 380 pInfo->dataFormat[3] == 0x6c 381 )) { 382 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", 383 pInfo->dataFormat[0], pInfo->dataFormat[1], 384 pInfo->dataFormat[2], pInfo->dataFormat[3]); 385 *status = U_INVALID_FORMAT_ERROR; 386 return 0; 387 } 388 if(pInfo->formatVersion[0] != 1) { 389 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", 390 pInfo->formatVersion[0]); 391 *status = U_UNSUPPORTED_ERROR; 392 return 0; 393 } 394 395 if(length >= 0) { 396 length -= headerSize; 397 if(length < 16*4) { 398 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", 399 length); 400 *status = U_INDEX_OUTOFBOUNDS_ERROR; 401 return 0; 402 } 403 } 404 405 const uint8_t *inBytes = (const uint8_t *)inData + headerSize; 406 uint8_t *outBytes = (uint8_t *)outData + headerSize; 407 408 /* read the indexes */ 409 const int32_t *inIndexes = (const int32_t *)inBytes; 410 int32_t indexes[16]; 411 int32_t i; 412 for(i = 0; i < 16; ++i) { 413 indexes[i] = udata_readInt32(ds, inIndexes[i]); 414 } 415 416 /* get the total length of the data */ 417 int32_t size = indexes[UCNVSEL_INDEX_SIZE]; 418 if(length >= 0) { 419 if(length < size) { 420 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", 421 length); 422 *status = U_INDEX_OUTOFBOUNDS_ERROR; 423 return 0; 424 } 425 426 /* copy the data for inaccessible bytes */ 427 if(inBytes != outBytes) { 428 uprv_memcpy(outBytes, inBytes, size); 429 } 430 431 int32_t offset = 0, count; 432 433 /* swap the int32_t indexes[] */ 434 count = UCNVSEL_INDEX_COUNT*4; 435 ds->swapArray32(ds, inBytes, count, outBytes, status); 436 offset += count; 437 438 /* swap the UTrie2 */ 439 count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; 440 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); 441 offset += count; 442 443 /* swap the uint32_t pv[] */ 444 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; 445 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); 446 offset += count; 447 448 /* swap the encoding names */ 449 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 450 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); 451 offset += count; 452 453 U_ASSERT(offset == size); 454 } 455 456 return headerSize + size; 457 } 458 459 /* unserialize a selector */ 460 U_CAPI UConverterSelector* U_EXPORT2 461 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { 462 // check if already failed 463 if (U_FAILURE(*status)) { 464 return NULL; 465 } 466 // ensure args make sense! 467 const uint8_t *p = (const uint8_t *)buffer; 468 if (length <= 0 || 469 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 470 ) { 471 *status = U_ILLEGAL_ARGUMENT_ERROR; 472 return NULL; 473 } 474 // header 475 if (length < 32) { 476 // not even enough space for a minimal header 477 *status = U_INDEX_OUTOFBOUNDS_ERROR; 478 return NULL; 479 } 480 const DataHeader *pHeader = (const DataHeader *)p; 481 if (!( 482 pHeader->dataHeader.magic1==0xda && 483 pHeader->dataHeader.magic2==0x27 && 484 pHeader->info.dataFormat[0] == 0x43 && 485 pHeader->info.dataFormat[1] == 0x53 && 486 pHeader->info.dataFormat[2] == 0x65 && 487 pHeader->info.dataFormat[3] == 0x6c 488 )) { 489 /* header not valid or dataFormat not recognized */ 490 *status = U_INVALID_FORMAT_ERROR; 491 return NULL; 492 } 493 if (pHeader->info.formatVersion[0] != 1) { 494 *status = U_UNSUPPORTED_ERROR; 495 return NULL; 496 } 497 uint8_t* swapped = NULL; 498 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || 499 pHeader->info.charsetFamily != U_CHARSET_FAMILY 500 ) { 501 // swap the data 502 UDataSwapper *ds = 503 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); 504 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); 505 if (U_FAILURE(*status)) { 506 udata_closeSwapper(ds); 507 return NULL; 508 } 509 if (length < totalSize) { 510 udata_closeSwapper(ds); 511 *status = U_INDEX_OUTOFBOUNDS_ERROR; 512 return NULL; 513 } 514 swapped = (uint8_t*)uprv_malloc(totalSize); 515 if (swapped == NULL) { 516 udata_closeSwapper(ds); 517 *status = U_MEMORY_ALLOCATION_ERROR; 518 return NULL; 519 } 520 ucnvsel_swap(ds, p, length, swapped, status); 521 udata_closeSwapper(ds); 522 if (U_FAILURE(*status)) { 523 uprv_free(swapped); 524 return NULL; 525 } 526 p = swapped; 527 pHeader = (const DataHeader *)p; 528 } 529 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { 530 // not even enough space for the header and the indexes 531 uprv_free(swapped); 532 *status = U_INDEX_OUTOFBOUNDS_ERROR; 533 return NULL; 534 } 535 p += pHeader->dataHeader.headerSize; 536 length -= pHeader->dataHeader.headerSize; 537 // indexes 538 const int32_t *indexes = (const int32_t *)p; 539 if (length < indexes[UCNVSEL_INDEX_SIZE]) { 540 uprv_free(swapped); 541 *status = U_INDEX_OUTOFBOUNDS_ERROR; 542 return NULL; 543 } 544 p += UCNVSEL_INDEX_COUNT * 4; 545 // create and populate the selector object 546 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 547 char **encodings = 548 (char **)uprv_malloc( 549 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); 550 if (sel == NULL || encodings == NULL) { 551 uprv_free(swapped); 552 uprv_free(sel); 553 uprv_free(encodings); 554 *status = U_MEMORY_ALLOCATION_ERROR; 555 return NULL; 556 } 557 uprv_memset(sel, 0, sizeof(UConverterSelector)); 558 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; 559 sel->encodings = encodings; 560 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; 561 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 562 sel->swapped = swapped; 563 // trie 564 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 565 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, 566 status); 567 p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; 568 if (U_FAILURE(*status)) { 569 ucnvsel_close(sel); 570 return NULL; 571 } 572 // bit vectors 573 sel->pv = (uint32_t *)p; 574 p += sel->pvCount * 4; 575 // encoding names 576 char* s = (char*)p; 577 for (int32_t i = 0; i < sel->encodingsCount; ++i) { 578 sel->encodings[i] = s; 579 s += uprv_strlen(s) + 1; 580 } 581 p += sel->encodingStrLength; 582 583 return sel; 584 } 585 586 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just 587 // iterate over the selected encodings 588 struct Enumerator { 589 int16_t* index; 590 int16_t length; 591 int16_t cur; 592 const UConverterSelector* sel; 593 }; 594 595 U_CDECL_BEGIN 596 597 static void U_CALLCONV 598 ucnvsel_close_selector_iterator(UEnumeration *enumerator) { 599 uprv_free(((Enumerator*)(enumerator->context))->index); 600 uprv_free(enumerator->context); 601 uprv_free(enumerator); 602 } 603 604 605 static int32_t U_CALLCONV 606 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { 607 // check if already failed 608 if (U_FAILURE(*status)) { 609 return 0; 610 } 611 return ((Enumerator*)(enumerator->context))->length; 612 } 613 614 615 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, 616 int32_t* resultLength, 617 UErrorCode* status) { 618 // check if already failed 619 if (U_FAILURE(*status)) { 620 return NULL; 621 } 622 623 int16_t cur = ((Enumerator*)(enumerator->context))->cur; 624 const UConverterSelector* sel; 625 const char* result; 626 if (cur >= ((Enumerator*)(enumerator->context))->length) { 627 return NULL; 628 } 629 sel = ((Enumerator*)(enumerator->context))->sel; 630 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; 631 ((Enumerator*)(enumerator->context))->cur++; 632 if (resultLength) { 633 *resultLength = (int32_t)uprv_strlen(result); 634 } 635 return result; 636 } 637 638 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, 639 UErrorCode* status) { 640 // check if already failed 641 if (U_FAILURE(*status)) { 642 return ; 643 } 644 ((Enumerator*)(enumerator->context))->cur = 0; 645 } 646 647 U_CDECL_END 648 649 650 static const UEnumeration defaultEncodings = { 651 NULL, 652 NULL, 653 ucnvsel_close_selector_iterator, 654 ucnvsel_count_encodings, 655 uenum_unextDefault, 656 ucnvsel_next_encoding, 657 ucnvsel_reset_iterator 658 }; 659 660 661 // internal fn to intersect two sets of masks 662 // returns whether the mask has reduced to all zeros 663 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { 664 int32_t i; 665 uint32_t oredDest = 0; 666 for (i = 0 ; i < len ; ++i) { 667 oredDest |= (dest[i] &= source1[i]); 668 } 669 return oredDest == 0; 670 } 671 672 // internal fn to count how many 1's are there in a mask 673 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 674 static int16_t countOnes(uint32_t* mask, int32_t len) { 675 int32_t i, totalOnes = 0; 676 for (i = 0 ; i < len ; ++i) { 677 uint32_t ent = mask[i]; 678 for (; ent; totalOnes++) 679 { 680 ent &= ent - 1; // clear the least significant bit set 681 } 682 } 683 return totalOnes; 684 } 685 686 687 /* internal function! */ 688 static UEnumeration *selectForMask(const UConverterSelector* sel, 689 uint32_t *mask, UErrorCode *status) { 690 // this is the context we will use. Store a table of indices to which 691 // encodings are legit. 692 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); 693 if (result == NULL) { 694 uprv_free(mask); 695 *status = U_MEMORY_ALLOCATION_ERROR; 696 return NULL; 697 } 698 result->index = NULL; // this will be allocated later! 699 result->length = result->cur = 0; 700 result->sel = sel; 701 702 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); 703 if (en == NULL) { 704 // TODO(markus): Combine Enumerator and UEnumeration into one struct. 705 uprv_free(mask); 706 uprv_free(result); 707 *status = U_MEMORY_ALLOCATION_ERROR; 708 return NULL; 709 } 710 memcpy(en, &defaultEncodings, sizeof(UEnumeration)); 711 en->context = result; 712 713 int32_t columns = (sel->encodingsCount+31)/32; 714 int16_t numOnes = countOnes(mask, columns); 715 // now, we know the exact space we need for index 716 if (numOnes > 0) { 717 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); 718 719 int32_t i, j; 720 int16_t k = 0; 721 for (j = 0 ; j < columns; j++) { 722 uint32_t v = mask[j]; 723 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { 724 if ((v & 1) != 0) { 725 result->index[result->length++] = k; 726 } 727 v >>= 1; 728 } 729 } 730 } //otherwise, index will remain NULL (and will never be touched by 731 //the enumerator code anyway) 732 uprv_free(mask); 733 return en; 734 } 735 736 /* check a string against the selector - UTF16 version */ 737 U_CAPI UEnumeration * U_EXPORT2 738 ucnvsel_selectForString(const UConverterSelector* sel, 739 const UChar *s, int32_t length, UErrorCode *status) { 740 // check if already failed 741 if (U_FAILURE(*status)) { 742 return NULL; 743 } 744 // ensure args make sense! 745 if (sel == NULL || (s == NULL && length != 0)) { 746 *status = U_ILLEGAL_ARGUMENT_ERROR; 747 return NULL; 748 } 749 750 int32_t columns = (sel->encodingsCount+31)/32; 751 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 752 if (mask == NULL) { 753 *status = U_MEMORY_ALLOCATION_ERROR; 754 return NULL; 755 } 756 uprv_memset(mask, ~0, columns *4); 757 758 const UChar *limit; 759 if (length >= 0) { 760 limit = s + length; 761 } else { 762 limit = NULL; 763 } 764 765 while (limit == NULL ? *s != 0 : s != limit) { 766 UChar32 c; 767 uint16_t pvIndex; 768 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); 769 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 770 break; 771 } 772 } 773 return selectForMask(sel, mask, status); 774 } 775 776 /* check a string against the selector - UTF8 version */ 777 U_CAPI UEnumeration * U_EXPORT2 778 ucnvsel_selectForUTF8(const UConverterSelector* sel, 779 const char *s, int32_t length, UErrorCode *status) { 780 // check if already failed 781 if (U_FAILURE(*status)) { 782 return NULL; 783 } 784 // ensure args make sense! 785 if (sel == NULL || (s == NULL && length != 0)) { 786 *status = U_ILLEGAL_ARGUMENT_ERROR; 787 return NULL; 788 } 789 790 int32_t columns = (sel->encodingsCount+31)/32; 791 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 792 if (mask == NULL) { 793 *status = U_MEMORY_ALLOCATION_ERROR; 794 return NULL; 795 } 796 uprv_memset(mask, ~0, columns *4); 797 798 if (length < 0) { 799 length = (int32_t)uprv_strlen(s); 800 } 801 const char *limit = s + length; 802 803 while (s != limit) { 804 uint16_t pvIndex; 805 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); 806 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 807 break; 808 } 809 } 810 return selectForMask(sel, mask, status); 811 } 812