1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2008-2011, International Business Machines 5 * Corporation, Google and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 */ 9 // Author : eldawy (at) google.com (Mohamed Eldawy) 10 // ucnvsel.cpp 11 // 12 // Purpose: To generate a list of encodings capable of handling 13 // a given Unicode text 14 // 15 // Started 09-April-2008 16 17 /** 18 * \file 19 * 20 * This is an implementation of an encoding selector. 21 * The goal is, given a unicode string, find the encodings 22 * this string can be mapped to. To make processing faster 23 * a trie is built when you call ucnvsel_open() that 24 * stores all encodings a codepoint can map to 25 */ 26 27 #include "unicode/ucnvsel.h" 28 29 #if !UCONFIG_NO_CONVERSION 30 31 #include <string.h> 32 33 #include "unicode/uchar.h" 34 #include "unicode/uniset.h" 35 #include "unicode/ucnv.h" 36 #include "unicode/ustring.h" 37 #include "unicode/uchriter.h" 38 #include "utrie2.h" 39 #include "propsvec.h" 40 #include "uassert.h" 41 #include "ucmndata.h" 42 #include "uenumimp.h" 43 #include "cmemory.h" 44 #include "cstring.h" 45 46 U_NAMESPACE_USE 47 48 struct UConverterSelector { 49 UTrie2 *trie; // 16 bit trie containing offsets into pv 50 uint32_t* pv; // table of bits! 51 int32_t pvCount; 52 char** encodings; // which encodings did user ask to use? 53 int32_t encodingsCount; 54 int32_t encodingStrLength; 55 uint8_t* swapped; 56 UBool ownPv, ownEncodingStrings; 57 }; 58 59 static void generateSelectorData(UConverterSelector* result, 60 UPropsVectors *upvec, 61 const USet* excludedCodePoints, 62 const UConverterUnicodeSet whichSet, 63 UErrorCode* status) { 64 if (U_FAILURE(*status)) { 65 return; 66 } 67 68 int32_t columns = (result->encodingsCount+31)/32; 69 70 // set errorValue to all-ones 71 for (int32_t col = 0; col < columns; col++) { 72 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, 73 col, ~0, ~0, status); 74 } 75 76 for (int32_t i = 0; i < result->encodingsCount; ++i) { 77 uint32_t mask; 78 uint32_t column; 79 int32_t item_count; 80 int32_t j; 81 UConverter* test_converter = ucnv_open(result->encodings[i], status); 82 if (U_FAILURE(*status)) { 83 return; 84 } 85 USet* unicode_point_set; 86 unicode_point_set = uset_open(1, 0); // empty set 87 88 ucnv_getUnicodeSet(test_converter, unicode_point_set, 89 whichSet, status); 90 if (U_FAILURE(*status)) { 91 ucnv_close(test_converter); 92 return; 93 } 94 95 column = i / 32; 96 mask = 1 << (i%32); 97 // now iterate over intervals on set i! 98 item_count = uset_getItemCount(unicode_point_set); 99 100 for (j = 0; j < item_count; ++j) { 101 UChar32 start_char; 102 UChar32 end_char; 103 UErrorCode smallStatus = U_ZERO_ERROR; 104 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, 105 &smallStatus); 106 if (U_FAILURE(smallStatus)) { 107 // this will be reached for the converters that fill the set with 108 // strings. Those should be ignored by our system 109 } else { 110 upvec_setValue(upvec, start_char, end_char, column, ~0, mask, 111 status); 112 } 113 } 114 ucnv_close(test_converter); 115 uset_close(unicode_point_set); 116 if (U_FAILURE(*status)) { 117 return; 118 } 119 } 120 121 // handle excluded encodings! Simply set their values to all 1's in the upvec 122 if (excludedCodePoints) { 123 int32_t item_count = uset_getItemCount(excludedCodePoints); 124 for (int32_t j = 0; j < item_count; ++j) { 125 UChar32 start_char; 126 UChar32 end_char; 127 128 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, 129 status); 130 for (int32_t col = 0; col < columns; col++) { 131 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, 132 status); 133 } 134 } 135 } 136 137 // alright. Now, let's put things in the same exact form you'd get when you 138 // unserialize things. 139 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); 140 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); 141 result->pvCount *= columns; // number of uint32_t = rows * columns 142 result->ownPv = TRUE; 143 } 144 145 /* open a selector. If converterListSize is 0, build for all converters. 146 If excludedCodePoints is NULL, don't exclude any codepoints */ 147 U_CAPI UConverterSelector* U_EXPORT2 148 ucnvsel_open(const char* const* converterList, int32_t converterListSize, 149 const USet* excludedCodePoints, 150 const UConverterUnicodeSet whichSet, UErrorCode* status) { 151 // check if already failed 152 if (U_FAILURE(*status)) { 153 return NULL; 154 } 155 // ensure args make sense! 156 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { 157 *status = U_ILLEGAL_ARGUMENT_ERROR; 158 return NULL; 159 } 160 161 // allocate a new converter 162 LocalUConverterSelectorPointer newSelector( 163 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector))); 164 if (newSelector.isNull()) { 165 *status = U_MEMORY_ALLOCATION_ERROR; 166 return NULL; 167 } 168 uprv_memset(newSelector.getAlias(), 0, sizeof(UConverterSelector)); 169 170 if (converterListSize == 0) { 171 converterList = NULL; 172 converterListSize = ucnv_countAvailable(); 173 } 174 newSelector->encodings = 175 (char**)uprv_malloc(converterListSize * sizeof(char*)); 176 if (!newSelector->encodings) { 177 *status = U_MEMORY_ALLOCATION_ERROR; 178 return NULL; 179 } 180 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() 181 182 // make a backup copy of the list of converters 183 int32_t totalSize = 0; 184 int32_t i; 185 for (i = 0; i < converterListSize; i++) { 186 totalSize += 187 (int32_t)uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; 188 } 189 // 4-align the totalSize to 4-align the size of the serialized form 190 int32_t encodingStrPadding = totalSize & 3; 191 if (encodingStrPadding != 0) { 192 encodingStrPadding = 4 - encodingStrPadding; 193 } 194 newSelector->encodingStrLength = totalSize += encodingStrPadding; 195 char* allStrings = (char*) uprv_malloc(totalSize); 196 if (!allStrings) { 197 *status = U_MEMORY_ALLOCATION_ERROR; 198 return NULL; 199 } 200 201 for (i = 0; i < converterListSize; i++) { 202 newSelector->encodings[i] = allStrings; 203 uprv_strcpy(newSelector->encodings[i], 204 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); 205 allStrings += uprv_strlen(newSelector->encodings[i]) + 1; 206 } 207 while (encodingStrPadding > 0) { 208 *allStrings++ = 0; 209 --encodingStrPadding; 210 } 211 212 newSelector->ownEncodingStrings = TRUE; 213 newSelector->encodingsCount = converterListSize; 214 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); 215 generateSelectorData(newSelector.getAlias(), upvec, excludedCodePoints, whichSet, status); 216 upvec_close(upvec); 217 218 if (U_FAILURE(*status)) { 219 return NULL; 220 } 221 222 return newSelector.orphan(); 223 } 224 225 /* close opened selector */ 226 U_CAPI void U_EXPORT2 227 ucnvsel_close(UConverterSelector *sel) { 228 if (!sel) { 229 return; 230 } 231 if (sel->ownEncodingStrings) { 232 uprv_free(sel->encodings[0]); 233 } 234 uprv_free(sel->encodings); 235 if (sel->ownPv) { 236 uprv_free(sel->pv); 237 } 238 utrie2_close(sel->trie); 239 uprv_free(sel->swapped); 240 uprv_free(sel); 241 } 242 243 static const UDataInfo dataInfo = { 244 sizeof(UDataInfo), 245 0, 246 247 U_IS_BIG_ENDIAN, 248 U_CHARSET_FAMILY, 249 U_SIZEOF_UCHAR, 250 0, 251 252 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ 253 { 1, 0, 0, 0 }, /* formatVersion */ 254 { 0, 0, 0, 0 } /* dataVersion */ 255 }; 256 257 enum { 258 UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes 259 UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors 260 UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names 261 UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding 262 UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader 263 UCNVSEL_INDEX_COUNT = 16 264 }; 265 266 /* 267 * Serialized form of a UConverterSelector, formatVersion 1: 268 * 269 * The serialized form begins with a standard ICU DataHeader with a UDataInfo 270 * as the template above. 271 * This is followed by: 272 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above 273 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes 274 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors 275 * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding 276 */ 277 278 /* serialize a selector */ 279 U_CAPI int32_t U_EXPORT2 280 ucnvsel_serialize(const UConverterSelector* sel, 281 void* buffer, int32_t bufferCapacity, UErrorCode* status) { 282 // check if already failed 283 if (U_FAILURE(*status)) { 284 return 0; 285 } 286 // ensure args make sense! 287 uint8_t *p = (uint8_t *)buffer; 288 if (bufferCapacity < 0 || 289 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 290 ) { 291 *status = U_ILLEGAL_ARGUMENT_ERROR; 292 return 0; 293 } 294 // add up the size of the serialized form 295 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); 296 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 297 return 0; 298 } 299 *status = U_ZERO_ERROR; 300 301 DataHeader header; 302 uprv_memset(&header, 0, sizeof(header)); 303 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); 304 header.dataHeader.magic1 = 0xda; 305 header.dataHeader.magic2 = 0x27; 306 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); 307 308 int32_t indexes[UCNVSEL_INDEX_COUNT] = { 309 serializedTrieSize, 310 sel->pvCount, 311 sel->encodingsCount, 312 sel->encodingStrLength 313 }; 314 315 int32_t totalSize = 316 header.dataHeader.headerSize + 317 (int32_t)sizeof(indexes) + 318 serializedTrieSize + 319 sel->pvCount * 4 + 320 sel->encodingStrLength; 321 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; 322 if (totalSize > bufferCapacity) { 323 *status = U_BUFFER_OVERFLOW_ERROR; 324 return totalSize; 325 } 326 // ok, save! 327 int32_t length = header.dataHeader.headerSize; 328 uprv_memcpy(p, &header, sizeof(header)); 329 uprv_memset(p + sizeof(header), 0, length - sizeof(header)); 330 p += length; 331 332 length = (int32_t)sizeof(indexes); 333 uprv_memcpy(p, indexes, length); 334 p += length; 335 336 utrie2_serialize(sel->trie, p, serializedTrieSize, status); 337 p += serializedTrieSize; 338 339 length = sel->pvCount * 4; 340 uprv_memcpy(p, sel->pv, length); 341 p += length; 342 343 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); 344 p += sel->encodingStrLength; 345 346 return totalSize; 347 } 348 349 /** 350 * swap a selector into the desired Endianness and Asciiness of 351 * the system. Just as FYI, selectors are always saved in the format 352 * of the system that created them. They are only converted if used 353 * on another system. In other words, selectors created on different 354 * system can be different even if the params are identical (endianness 355 * and Asciiness differences only) 356 * 357 * @param ds pointer to data swapper containing swapping info 358 * @param inData pointer to incoming data 359 * @param length length of inData in bytes 360 * @param outData pointer to output data. Capacity should 361 * be at least equal to capacity of inData 362 * @param status an in/out ICU UErrorCode 363 * @return 0 on failure, number of bytes swapped on success 364 * number of bytes swapped can be smaller than length 365 */ 366 static int32_t 367 ucnvsel_swap(const UDataSwapper *ds, 368 const void *inData, int32_t length, 369 void *outData, UErrorCode *status) { 370 /* udata_swapDataHeader checks the arguments */ 371 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); 372 if(U_FAILURE(*status)) { 373 return 0; 374 } 375 376 /* check data format and format version */ 377 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); 378 if(!( 379 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ 380 pInfo->dataFormat[1] == 0x53 && 381 pInfo->dataFormat[2] == 0x65 && 382 pInfo->dataFormat[3] == 0x6c 383 )) { 384 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", 385 pInfo->dataFormat[0], pInfo->dataFormat[1], 386 pInfo->dataFormat[2], pInfo->dataFormat[3]); 387 *status = U_INVALID_FORMAT_ERROR; 388 return 0; 389 } 390 if(pInfo->formatVersion[0] != 1) { 391 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", 392 pInfo->formatVersion[0]); 393 *status = U_UNSUPPORTED_ERROR; 394 return 0; 395 } 396 397 if(length >= 0) { 398 length -= headerSize; 399 if(length < 16*4) { 400 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", 401 length); 402 *status = U_INDEX_OUTOFBOUNDS_ERROR; 403 return 0; 404 } 405 } 406 407 const uint8_t *inBytes = (const uint8_t *)inData + headerSize; 408 uint8_t *outBytes = (uint8_t *)outData + headerSize; 409 410 /* read the indexes */ 411 const int32_t *inIndexes = (const int32_t *)inBytes; 412 int32_t indexes[16]; 413 int32_t i; 414 for(i = 0; i < 16; ++i) { 415 indexes[i] = udata_readInt32(ds, inIndexes[i]); 416 } 417 418 /* get the total length of the data */ 419 int32_t size = indexes[UCNVSEL_INDEX_SIZE]; 420 if(length >= 0) { 421 if(length < size) { 422 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", 423 length); 424 *status = U_INDEX_OUTOFBOUNDS_ERROR; 425 return 0; 426 } 427 428 /* copy the data for inaccessible bytes */ 429 if(inBytes != outBytes) { 430 uprv_memcpy(outBytes, inBytes, size); 431 } 432 433 int32_t offset = 0, count; 434 435 /* swap the int32_t indexes[] */ 436 count = UCNVSEL_INDEX_COUNT*4; 437 ds->swapArray32(ds, inBytes, count, outBytes, status); 438 offset += count; 439 440 /* swap the UTrie2 */ 441 count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; 442 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); 443 offset += count; 444 445 /* swap the uint32_t pv[] */ 446 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; 447 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); 448 offset += count; 449 450 /* swap the encoding names */ 451 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 452 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); 453 offset += count; 454 455 U_ASSERT(offset == size); 456 } 457 458 return headerSize + size; 459 } 460 461 /* unserialize a selector */ 462 U_CAPI UConverterSelector* U_EXPORT2 463 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { 464 // check if already failed 465 if (U_FAILURE(*status)) { 466 return NULL; 467 } 468 // ensure args make sense! 469 const uint8_t *p = (const uint8_t *)buffer; 470 if (length <= 0 || 471 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 472 ) { 473 *status = U_ILLEGAL_ARGUMENT_ERROR; 474 return NULL; 475 } 476 // header 477 if (length < 32) { 478 // not even enough space for a minimal header 479 *status = U_INDEX_OUTOFBOUNDS_ERROR; 480 return NULL; 481 } 482 const DataHeader *pHeader = (const DataHeader *)p; 483 if (!( 484 pHeader->dataHeader.magic1==0xda && 485 pHeader->dataHeader.magic2==0x27 && 486 pHeader->info.dataFormat[0] == 0x43 && 487 pHeader->info.dataFormat[1] == 0x53 && 488 pHeader->info.dataFormat[2] == 0x65 && 489 pHeader->info.dataFormat[3] == 0x6c 490 )) { 491 /* header not valid or dataFormat not recognized */ 492 *status = U_INVALID_FORMAT_ERROR; 493 return NULL; 494 } 495 if (pHeader->info.formatVersion[0] != 1) { 496 *status = U_UNSUPPORTED_ERROR; 497 return NULL; 498 } 499 uint8_t* swapped = NULL; 500 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || 501 pHeader->info.charsetFamily != U_CHARSET_FAMILY 502 ) { 503 // swap the data 504 UDataSwapper *ds = 505 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); 506 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); 507 if (U_FAILURE(*status)) { 508 udata_closeSwapper(ds); 509 return NULL; 510 } 511 if (length < totalSize) { 512 udata_closeSwapper(ds); 513 *status = U_INDEX_OUTOFBOUNDS_ERROR; 514 return NULL; 515 } 516 swapped = (uint8_t*)uprv_malloc(totalSize); 517 if (swapped == NULL) { 518 udata_closeSwapper(ds); 519 *status = U_MEMORY_ALLOCATION_ERROR; 520 return NULL; 521 } 522 ucnvsel_swap(ds, p, length, swapped, status); 523 udata_closeSwapper(ds); 524 if (U_FAILURE(*status)) { 525 uprv_free(swapped); 526 return NULL; 527 } 528 p = swapped; 529 pHeader = (const DataHeader *)p; 530 } 531 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { 532 // not even enough space for the header and the indexes 533 uprv_free(swapped); 534 *status = U_INDEX_OUTOFBOUNDS_ERROR; 535 return NULL; 536 } 537 p += pHeader->dataHeader.headerSize; 538 length -= pHeader->dataHeader.headerSize; 539 // indexes 540 const int32_t *indexes = (const int32_t *)p; 541 if (length < indexes[UCNVSEL_INDEX_SIZE]) { 542 uprv_free(swapped); 543 *status = U_INDEX_OUTOFBOUNDS_ERROR; 544 return NULL; 545 } 546 p += UCNVSEL_INDEX_COUNT * 4; 547 // create and populate the selector object 548 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 549 char **encodings = 550 (char **)uprv_malloc( 551 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); 552 if (sel == NULL || encodings == NULL) { 553 uprv_free(swapped); 554 uprv_free(sel); 555 uprv_free(encodings); 556 *status = U_MEMORY_ALLOCATION_ERROR; 557 return NULL; 558 } 559 uprv_memset(sel, 0, sizeof(UConverterSelector)); 560 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; 561 sel->encodings = encodings; 562 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; 563 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 564 sel->swapped = swapped; 565 // trie 566 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 567 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, 568 status); 569 p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; 570 if (U_FAILURE(*status)) { 571 ucnvsel_close(sel); 572 return NULL; 573 } 574 // bit vectors 575 sel->pv = (uint32_t *)p; 576 p += sel->pvCount * 4; 577 // encoding names 578 char* s = (char*)p; 579 for (int32_t i = 0; i < sel->encodingsCount; ++i) { 580 sel->encodings[i] = s; 581 s += uprv_strlen(s) + 1; 582 } 583 p += sel->encodingStrLength; 584 585 return sel; 586 } 587 588 // a bunch of functions for the enumeration thingie! Nothing fancy here. Just 589 // iterate over the selected encodings 590 struct Enumerator { 591 int16_t* index; 592 int16_t length; 593 int16_t cur; 594 const UConverterSelector* sel; 595 }; 596 597 U_CDECL_BEGIN 598 599 static void U_CALLCONV 600 ucnvsel_close_selector_iterator(UEnumeration *enumerator) { 601 uprv_free(((Enumerator*)(enumerator->context))->index); 602 uprv_free(enumerator->context); 603 uprv_free(enumerator); 604 } 605 606 607 static int32_t U_CALLCONV 608 ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { 609 // check if already failed 610 if (U_FAILURE(*status)) { 611 return 0; 612 } 613 return ((Enumerator*)(enumerator->context))->length; 614 } 615 616 617 static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, 618 int32_t* resultLength, 619 UErrorCode* status) { 620 // check if already failed 621 if (U_FAILURE(*status)) { 622 return NULL; 623 } 624 625 int16_t cur = ((Enumerator*)(enumerator->context))->cur; 626 const UConverterSelector* sel; 627 const char* result; 628 if (cur >= ((Enumerator*)(enumerator->context))->length) { 629 return NULL; 630 } 631 sel = ((Enumerator*)(enumerator->context))->sel; 632 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; 633 ((Enumerator*)(enumerator->context))->cur++; 634 if (resultLength) { 635 *resultLength = (int32_t)uprv_strlen(result); 636 } 637 return result; 638 } 639 640 static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, 641 UErrorCode* status) { 642 // check if already failed 643 if (U_FAILURE(*status)) { 644 return ; 645 } 646 ((Enumerator*)(enumerator->context))->cur = 0; 647 } 648 649 U_CDECL_END 650 651 652 static const UEnumeration defaultEncodings = { 653 NULL, 654 NULL, 655 ucnvsel_close_selector_iterator, 656 ucnvsel_count_encodings, 657 uenum_unextDefault, 658 ucnvsel_next_encoding, 659 ucnvsel_reset_iterator 660 }; 661 662 663 // internal fn to intersect two sets of masks 664 // returns whether the mask has reduced to all zeros 665 static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { 666 int32_t i; 667 uint32_t oredDest = 0; 668 for (i = 0 ; i < len ; ++i) { 669 oredDest |= (dest[i] &= source1[i]); 670 } 671 return oredDest == 0; 672 } 673 674 // internal fn to count how many 1's are there in a mask 675 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 676 static int16_t countOnes(uint32_t* mask, int32_t len) { 677 int32_t i, totalOnes = 0; 678 for (i = 0 ; i < len ; ++i) { 679 uint32_t ent = mask[i]; 680 for (; ent; totalOnes++) 681 { 682 ent &= ent - 1; // clear the least significant bit set 683 } 684 } 685 return totalOnes; 686 } 687 688 689 /* internal function! */ 690 static UEnumeration *selectForMask(const UConverterSelector* sel, 691 uint32_t *mask, UErrorCode *status) { 692 // this is the context we will use. Store a table of indices to which 693 // encodings are legit. 694 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); 695 if (result == NULL) { 696 uprv_free(mask); 697 *status = U_MEMORY_ALLOCATION_ERROR; 698 return NULL; 699 } 700 result->index = NULL; // this will be allocated later! 701 result->length = result->cur = 0; 702 result->sel = sel; 703 704 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); 705 if (en == NULL) { 706 // TODO(markus): Combine Enumerator and UEnumeration into one struct. 707 uprv_free(mask); 708 uprv_free(result); 709 *status = U_MEMORY_ALLOCATION_ERROR; 710 return NULL; 711 } 712 memcpy(en, &defaultEncodings, sizeof(UEnumeration)); 713 en->context = result; 714 715 int32_t columns = (sel->encodingsCount+31)/32; 716 int16_t numOnes = countOnes(mask, columns); 717 // now, we know the exact space we need for index 718 if (numOnes > 0) { 719 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); 720 721 int32_t i, j; 722 int16_t k = 0; 723 for (j = 0 ; j < columns; j++) { 724 uint32_t v = mask[j]; 725 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { 726 if ((v & 1) != 0) { 727 result->index[result->length++] = k; 728 } 729 v >>= 1; 730 } 731 } 732 } //otherwise, index will remain NULL (and will never be touched by 733 //the enumerator code anyway) 734 uprv_free(mask); 735 return en; 736 } 737 738 /* check a string against the selector - UTF16 version */ 739 U_CAPI UEnumeration * U_EXPORT2 740 ucnvsel_selectForString(const UConverterSelector* sel, 741 const UChar *s, int32_t length, UErrorCode *status) { 742 // check if already failed 743 if (U_FAILURE(*status)) { 744 return NULL; 745 } 746 // ensure args make sense! 747 if (sel == NULL || (s == NULL && length != 0)) { 748 *status = U_ILLEGAL_ARGUMENT_ERROR; 749 return NULL; 750 } 751 752 int32_t columns = (sel->encodingsCount+31)/32; 753 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 754 if (mask == NULL) { 755 *status = U_MEMORY_ALLOCATION_ERROR; 756 return NULL; 757 } 758 uprv_memset(mask, ~0, columns *4); 759 760 if(s!=NULL) { 761 const UChar *limit; 762 if (length >= 0) { 763 limit = s + length; 764 } else { 765 limit = NULL; 766 } 767 768 while (limit == NULL ? *s != 0 : s != limit) { 769 UChar32 c; 770 uint16_t pvIndex; 771 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); 772 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 773 break; 774 } 775 } 776 } 777 return selectForMask(sel, mask, status); 778 } 779 780 /* check a string against the selector - UTF8 version */ 781 U_CAPI UEnumeration * U_EXPORT2 782 ucnvsel_selectForUTF8(const UConverterSelector* sel, 783 const char *s, int32_t length, UErrorCode *status) { 784 // check if already failed 785 if (U_FAILURE(*status)) { 786 return NULL; 787 } 788 // ensure args make sense! 789 if (sel == NULL || (s == NULL && length != 0)) { 790 *status = U_ILLEGAL_ARGUMENT_ERROR; 791 return NULL; 792 } 793 794 int32_t columns = (sel->encodingsCount+31)/32; 795 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 796 if (mask == NULL) { 797 *status = U_MEMORY_ALLOCATION_ERROR; 798 return NULL; 799 } 800 uprv_memset(mask, ~0, columns *4); 801 802 if (length < 0) { 803 length = (int32_t)uprv_strlen(s); 804 } 805 806 if(s!=NULL) { 807 const char *limit = s + length; 808 809 while (s != limit) { 810 uint16_t pvIndex; 811 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); 812 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 813 break; 814 } 815 } 816 } 817 return selectForMask(sel, mask, status); 818 } 819 820 #endif // !UCONFIG_NO_CONVERSION 821