1 /* 2 ***************************************************************************** 3 * Copyright (C) 1996-2006, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ***************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_NORMALIZATION 11 12 #include "unicode/uset.h" 13 #include "unicode/ustring.h" 14 #include "hash.h" 15 #include "unormimp.h" 16 #include "unicode/caniter.h" 17 #include "unicode/normlzr.h" 18 #include "unicode/uchar.h" 19 #include "cmemory.h" 20 21 /** 22 * This class allows one to iterate through all the strings that are canonically equivalent to a given 23 * string. For example, here are some sample results: 24 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 25 1: \u0041\u030A\u0064\u0307\u0327 26 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 27 2: \u0041\u030A\u0064\u0327\u0307 28 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 29 3: \u0041\u030A\u1E0B\u0327 30 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 31 4: \u0041\u030A\u1E11\u0307 32 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 33 5: \u00C5\u0064\u0307\u0327 34 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 35 6: \u00C5\u0064\u0327\u0307 36 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 37 7: \u00C5\u1E0B\u0327 38 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 39 8: \u00C5\u1E11\u0307 40 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 41 9: \u212B\u0064\u0307\u0327 42 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 43 10: \u212B\u0064\u0327\u0307 44 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 45 11: \u212B\u1E0B\u0327 46 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 47 12: \u212B\u1E11\u0307 48 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 49 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 50 * since it has not been optimized for that situation. 51 *@author M. Davis 52 *@draft 53 */ 54 55 // public 56 57 U_NAMESPACE_BEGIN 58 59 // TODO: add boilerplate methods. 60 61 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 62 63 /** 64 *@param source string to get results for 65 */ 66 CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 67 pieces(NULL), 68 pieces_length(0), 69 pieces_lengths(NULL), 70 current(NULL), 71 current_length(0) 72 { 73 if(U_SUCCESS(status)) { 74 setSource(sourceStr, status); 75 } 76 } 77 78 CanonicalIterator::~CanonicalIterator() { 79 cleanPieces(); 80 } 81 82 void CanonicalIterator::cleanPieces() { 83 int32_t i = 0; 84 if(pieces != NULL) { 85 for(i = 0; i < pieces_length; i++) { 86 if(pieces[i] != NULL) { 87 delete[] pieces[i]; 88 } 89 } 90 uprv_free(pieces); 91 pieces = NULL; 92 pieces_length = 0; 93 } 94 if(pieces_lengths != NULL) { 95 uprv_free(pieces_lengths); 96 pieces_lengths = NULL; 97 } 98 if(current != NULL) { 99 uprv_free(current); 100 current = NULL; 101 current_length = 0; 102 } 103 } 104 105 /** 106 *@return gets the source: NOTE: it is the NFD form of source 107 */ 108 UnicodeString CanonicalIterator::getSource() { 109 return source; 110 } 111 112 /** 113 * Resets the iterator so that one can start again from the beginning. 114 */ 115 void CanonicalIterator::reset() { 116 done = FALSE; 117 for (int i = 0; i < current_length; ++i) { 118 current[i] = 0; 119 } 120 } 121 122 /** 123 *@return the next string that is canonically equivalent. The value null is returned when 124 * the iteration is done. 125 */ 126 UnicodeString CanonicalIterator::next() { 127 int32_t i = 0; 128 129 if (done) { 130 buffer.setToBogus(); 131 return buffer; 132 } 133 134 // delete old contents 135 buffer.remove(); 136 137 // construct return value 138 139 for (i = 0; i < pieces_length; ++i) { 140 buffer.append(pieces[i][current[i]]); 141 } 142 //String result = buffer.toString(); // not needed 143 144 // find next value for next time 145 146 for (i = current_length - 1; ; --i) { 147 if (i < 0) { 148 done = TRUE; 149 break; 150 } 151 current[i]++; 152 if (current[i] < pieces_lengths[i]) break; // got sequence 153 current[i] = 0; 154 } 155 return buffer; 156 } 157 158 /** 159 *@param set the source string to iterate against. This allows the same iterator to be used 160 * while changing the source string, saving object creation. 161 */ 162 void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 163 int32_t list_length = 0; 164 UChar32 cp = 0; 165 int32_t start = 0; 166 int32_t i = 0; 167 UnicodeString *list = NULL; 168 169 Normalizer::normalize(newSource, UNORM_NFD, 0, source, status); 170 if(U_FAILURE(status)) { 171 return; 172 } 173 done = FALSE; 174 175 cleanPieces(); 176 177 // catch degenerate case 178 if (newSource.length() == 0) { 179 pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 180 pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 181 pieces_length = 1; 182 current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 183 current_length = 1; 184 if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 185 status = U_MEMORY_ALLOCATION_ERROR; 186 goto CleanPartialInitialization; 187 } 188 current[0] = 0; 189 pieces[0] = new UnicodeString[1]; 190 pieces_lengths[0] = 1; 191 if (pieces[0] == 0) { 192 status = U_MEMORY_ALLOCATION_ERROR; 193 goto CleanPartialInitialization; 194 } 195 return; 196 } 197 198 199 list = new UnicodeString[source.length()]; 200 if (list == 0) { 201 status = U_MEMORY_ALLOCATION_ERROR; 202 goto CleanPartialInitialization; 203 } 204 205 // i should initialy be the number of code units at the 206 // start of the string 207 i = UTF16_CHAR_LENGTH(source.char32At(0)); 208 //int32_t i = 1; 209 // find the segments 210 // This code iterates through the source string and 211 // extracts segments that end up on a codepoint that 212 // doesn't start any decompositions. (Analysis is done 213 // on the NFD form - see above). 214 for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) { 215 cp = source.char32At(i); 216 if (unorm_isCanonSafeStart(cp)) { 217 source.extract(start, i-start, list[list_length++]); // add up to i 218 start = i; 219 } 220 } 221 source.extract(start, i-start, list[list_length++]); // add last one 222 223 224 // allocate the arrays, and find the strings that are CE to each segment 225 pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 226 pieces_length = list_length; 227 pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 228 current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 229 current_length = list_length; 230 if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 231 status = U_MEMORY_ALLOCATION_ERROR; 232 goto CleanPartialInitialization; 233 } 234 235 for (i = 0; i < current_length; i++) { 236 current[i] = 0; 237 } 238 // for each segment, get all the combinations that can produce 239 // it after NFD normalization 240 for (i = 0; i < pieces_length; ++i) { 241 //if (PROGRESS) printf("SEGMENT\n"); 242 pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 243 } 244 245 delete[] list; 246 return; 247 // Common section to cleanup all local variables and reset object variables. 248 CleanPartialInitialization: 249 if (list != NULL) { 250 delete[] list; 251 } 252 cleanPieces(); 253 } 254 255 /** 256 * Dumb recursive implementation of permutation. 257 * TODO: optimize 258 * @param source the string to find permutations for 259 * @return the results in a set. 260 */ 261 void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 262 if(U_FAILURE(status)) { 263 return; 264 } 265 //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 266 int32_t i = 0; 267 268 // optimization: 269 // if zero or one character, just return a set with it 270 // we check for length < 2 to keep from counting code points all the time 271 if (source.length() <= 2 && source.countChar32() <= 1) { 272 UnicodeString *toPut = new UnicodeString(source); 273 /* test for NULL */ 274 if (toPut == 0) { 275 status = U_MEMORY_ALLOCATION_ERROR; 276 return; 277 } 278 result->put(source, toPut, status); 279 return; 280 } 281 282 // otherwise iterate through the string, and recursively permute all the other characters 283 UChar32 cp; 284 Hashtable subpermute(status); 285 if(U_FAILURE(status)) { 286 return; 287 } 288 subpermute.setValueDeleter(uhash_deleteUnicodeString); 289 290 for (i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) { 291 cp = source.char32At(i); 292 const UHashElement *ne = NULL; 293 int32_t el = -1; 294 UnicodeString subPermuteString = source; 295 296 // optimization: 297 // if the character is canonical combining class zero, 298 // don't permute it 299 if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 300 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 301 continue; 302 } 303 304 subpermute.removeAll(); 305 306 // see what the permutations of the characters before and after this one are 307 //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 308 permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 309 /* Test for buffer overflows */ 310 if(U_FAILURE(status)) { 311 return; 312 } 313 // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 314 // of source at this point. 315 316 // prefix this character to all of them 317 ne = subpermute.nextElement(el); 318 while (ne != NULL) { 319 UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 320 UnicodeString *chStr = new UnicodeString(cp); 321 //test for NULL 322 if (chStr == NULL) { 323 status = U_MEMORY_ALLOCATION_ERROR; 324 return; 325 } 326 chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 327 //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 328 result->put(*chStr, chStr, status); 329 ne = subpermute.nextElement(el); 330 } 331 } 332 //return result; 333 } 334 335 // privates 336 337 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 338 UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 339 Hashtable result(status); 340 Hashtable permutations(status); 341 Hashtable basic(status); 342 if (U_FAILURE(status)) { 343 return 0; 344 } 345 result.setValueDeleter(uhash_deleteUnicodeString); 346 permutations.setValueDeleter(uhash_deleteUnicodeString); 347 basic.setValueDeleter(uhash_deleteUnicodeString); 348 349 UChar USeg[256]; 350 int32_t segLen = segment.extract(USeg, 256, status); 351 getEquivalents2(&basic, USeg, segLen, status); 352 353 // now get all the permutations 354 // add only the ones that are canonically equivalent 355 // TODO: optimize by not permuting any class zero. 356 357 const UHashElement *ne = NULL; 358 int32_t el = -1; 359 //Iterator it = basic.iterator(); 360 ne = basic.nextElement(el); 361 //while (it.hasNext()) 362 while (ne != NULL) { 363 //String item = (String) it.next(); 364 UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 365 366 permutations.removeAll(); 367 permute(item, CANITER_SKIP_ZEROES, &permutations, status); 368 const UHashElement *ne2 = NULL; 369 int32_t el2 = -1; 370 //Iterator it2 = permutations.iterator(); 371 ne2 = permutations.nextElement(el2); 372 //while (it2.hasNext()) 373 while (ne2 != NULL) { 374 //String possible = (String) it2.next(); 375 //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 376 UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 377 UnicodeString attempt; 378 Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status); 379 380 // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 381 if (attempt==segment) { 382 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 383 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 384 result.put(possible, new UnicodeString(possible), status); //add(possible); 385 } else { 386 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 387 } 388 389 ne2 = permutations.nextElement(el2); 390 } 391 ne = basic.nextElement(el); 392 } 393 394 /* Test for buffer overflows */ 395 if(U_FAILURE(status)) { 396 return 0; 397 } 398 // convert into a String[] to clean up storage 399 //String[] finalResult = new String[result.size()]; 400 UnicodeString *finalResult = NULL; 401 int32_t resultCount; 402 if((resultCount = result.count())) { 403 finalResult = new UnicodeString[resultCount]; 404 if (finalResult == 0) { 405 status = U_MEMORY_ALLOCATION_ERROR; 406 return NULL; 407 } 408 } 409 else { 410 status = U_ILLEGAL_ARGUMENT_ERROR; 411 return NULL; 412 } 413 //result.toArray(finalResult); 414 result_len = 0; 415 el = -1; 416 ne = result.nextElement(el); 417 while(ne != NULL) { 418 finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 419 ne = result.nextElement(el); 420 } 421 422 423 return finalResult; 424 } 425 426 Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 427 428 if (U_FAILURE(status)) { 429 return NULL; 430 } 431 432 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 433 434 UnicodeString toPut(segment, segLen); 435 436 fillinResult->put(toPut, new UnicodeString(toPut), status); 437 438 USerializedSet starts; 439 440 // cycle through all the characters 441 UChar32 cp, end = 0; 442 int32_t i = 0, j; 443 for (i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) { 444 // see if any character is at the start of some decomposition 445 UTF_GET_CHAR(segment, 0, i, segLen, cp); 446 if (!unorm_getCanonStartSet(cp, &starts)) { 447 continue; 448 } 449 // if so, see which decompositions match 450 for(j = 0, cp = end+1; cp <= end || uset_getSerializedRange(&starts, j++, &cp, &end); ++cp) { 451 Hashtable remainder(status); 452 remainder.setValueDeleter(uhash_deleteUnicodeString); 453 if (extract(&remainder, cp, segment, segLen, i, status) == NULL) { 454 continue; 455 } 456 457 // there were some matches, so add all the possibilities to the set. 458 UnicodeString prefix(segment, i); 459 prefix += cp; 460 461 int32_t el = -1; 462 const UHashElement *ne = remainder.nextElement(el); 463 while (ne != NULL) { 464 UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 465 UnicodeString *toAdd = new UnicodeString(prefix); 466 /* test for NULL */ 467 if (toAdd == 0) { 468 status = U_MEMORY_ALLOCATION_ERROR; 469 return NULL; 470 } 471 *toAdd += item; 472 fillinResult->put(*toAdd, toAdd, status); 473 474 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 475 476 ne = remainder.nextElement(el); 477 } 478 } 479 } 480 481 /* Test for buffer overflows */ 482 if(U_FAILURE(status)) { 483 return NULL; 484 } 485 return fillinResult; 486 } 487 488 /** 489 * See if the decomposition of cp2 is at segment starting at segmentPos 490 * (with canonical rearrangment!) 491 * If so, take the remainder, and return the equivalents 492 */ 493 Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 494 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 495 //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 496 //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 497 498 if (U_FAILURE(status)) { 499 return NULL; 500 } 501 502 const int32_t bufSize = 256; 503 int32_t bufLen = 0; 504 UChar temp[bufSize]; 505 506 int32_t inputLen = 0, decompLen; 507 UChar stackBuffer[4]; 508 const UChar *decomp; 509 510 U16_APPEND_UNSAFE(temp, inputLen, comp); 511 decomp = unorm_getCanonicalDecomposition(comp, stackBuffer, &decompLen); 512 if(decomp == NULL) { 513 /* copy temp */ 514 stackBuffer[0] = temp[0]; 515 if(inputLen > 1) { 516 stackBuffer[1] = temp[1]; 517 } 518 decomp = stackBuffer; 519 decompLen = inputLen; 520 } 521 522 UChar *buff = temp+inputLen; 523 524 // See if it matches the start of segment (at segmentPos) 525 UBool ok = FALSE; 526 UChar32 cp; 527 int32_t decompPos = 0; 528 UChar32 decompCp; 529 UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp); 530 531 int32_t i; 532 UBool overflow = FALSE; 533 534 i = segmentPos; 535 while(i < segLen) { 536 UTF_NEXT_CHAR(segment, i, segLen, cp); 537 538 if (cp == decompCp) { // if equal, eat another cp from decomp 539 540 //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 541 542 if (decompPos == decompLen) { // done, have all decomp characters! 543 //u_strcat(buff+bufLen, segment+i); 544 uprv_memcpy(buff+bufLen, segment+i, (segLen-i)*sizeof(UChar)); 545 bufLen+=segLen-i; 546 547 ok = TRUE; 548 break; 549 } 550 UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp); 551 } else { 552 //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 553 554 // brute force approach 555 556 U16_APPEND(buff, bufLen, bufSize, cp, overflow); 557 558 if(overflow) { 559 /* 560 * ### TODO handle buffer overflow 561 * The buffer is large, but an overflow may still happen with 562 * unusual input (many combining marks?). 563 * Reallocate buffer and continue. 564 * markus 20020929 565 */ 566 567 overflow = FALSE; 568 } 569 570 /* TODO: optimize 571 // since we know that the classes are monotonically increasing, after zero 572 // e.g. 0 5 7 9 0 3 573 // we can do an optimization 574 // there are only a few cases that work: zero, less, same, greater 575 // if both classes are the same, we fail 576 // if the decomp class < the segment class, we fail 577 578 segClass = getClass(cp); 579 if (decompClass <= segClass) return null; 580 */ 581 } 582 } 583 if (!ok) 584 return NULL; // we failed, characters left over 585 586 //if (PROGRESS) printf("Matches\n"); 587 588 if (bufLen == 0) { 589 fillinResult->put(UnicodeString(), new UnicodeString(), status); 590 return fillinResult; // succeed, but no remainder 591 } 592 593 // brute force approach 594 // check to make sure result is canonically equivalent 595 int32_t tempLen = inputLen + bufLen; 596 597 UChar trial[bufSize]; 598 unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status); 599 600 if(U_FAILURE(status) 601 || uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0) 602 { 603 return NULL; 604 } 605 606 return getEquivalents2(fillinResult, buff, bufLen, status); 607 } 608 609 U_NAMESPACE_END 610 611 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 612