1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ***************************************************************************** 5 * Copyright (C) 1996-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ***************************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_NORMALIZATION 13 14 #include "unicode/caniter.h" 15 #include "unicode/normalizer2.h" 16 #include "unicode/uchar.h" 17 #include "unicode/uniset.h" 18 #include "unicode/usetiter.h" 19 #include "unicode/ustring.h" 20 #include "unicode/utf16.h" 21 #include "cmemory.h" 22 #include "hash.h" 23 #include "normalizer2impl.h" 24 25 /** 26 * This class allows one to iterate through all the strings that are canonically equivalent to a given 27 * string. For example, here are some sample results: 28 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 29 1: \u0041\u030A\u0064\u0307\u0327 30 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 31 2: \u0041\u030A\u0064\u0327\u0307 32 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 33 3: \u0041\u030A\u1E0B\u0327 34 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 35 4: \u0041\u030A\u1E11\u0307 36 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 37 5: \u00C5\u0064\u0307\u0327 38 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 39 6: \u00C5\u0064\u0327\u0307 40 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 41 7: \u00C5\u1E0B\u0327 42 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 43 8: \u00C5\u1E11\u0307 44 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 45 9: \u212B\u0064\u0307\u0327 46 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 47 10: \u212B\u0064\u0327\u0307 48 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 49 11: \u212B\u1E0B\u0327 50 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 51 12: \u212B\u1E11\u0307 52 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 53 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 54 * since it has not been optimized for that situation. 55 *@author M. Davis 56 *@draft 57 */ 58 59 // public 60 61 U_NAMESPACE_BEGIN 62 63 // TODO: add boilerplate methods. 64 65 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator) 66 67 /** 68 *@param source string to get results for 69 */ 70 CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) : 71 pieces(NULL), 72 pieces_length(0), 73 pieces_lengths(NULL), 74 current(NULL), 75 current_length(0), 76 nfd(*Normalizer2::getNFDInstance(status)), 77 nfcImpl(*Normalizer2Factory::getNFCImpl(status)) 78 { 79 if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { 80 setSource(sourceStr, status); 81 } 82 } 83 84 CanonicalIterator::~CanonicalIterator() { 85 cleanPieces(); 86 } 87 88 void CanonicalIterator::cleanPieces() { 89 int32_t i = 0; 90 if(pieces != NULL) { 91 for(i = 0; i < pieces_length; i++) { 92 if(pieces[i] != NULL) { 93 delete[] pieces[i]; 94 } 95 } 96 uprv_free(pieces); 97 pieces = NULL; 98 pieces_length = 0; 99 } 100 if(pieces_lengths != NULL) { 101 uprv_free(pieces_lengths); 102 pieces_lengths = NULL; 103 } 104 if(current != NULL) { 105 uprv_free(current); 106 current = NULL; 107 current_length = 0; 108 } 109 } 110 111 /** 112 *@return gets the source: NOTE: it is the NFD form of source 113 */ 114 UnicodeString CanonicalIterator::getSource() { 115 return source; 116 } 117 118 /** 119 * Resets the iterator so that one can start again from the beginning. 120 */ 121 void CanonicalIterator::reset() { 122 done = FALSE; 123 for (int i = 0; i < current_length; ++i) { 124 current[i] = 0; 125 } 126 } 127 128 /** 129 *@return the next string that is canonically equivalent. The value null is returned when 130 * the iteration is done. 131 */ 132 UnicodeString CanonicalIterator::next() { 133 int32_t i = 0; 134 135 if (done) { 136 buffer.setToBogus(); 137 return buffer; 138 } 139 140 // delete old contents 141 buffer.remove(); 142 143 // construct return value 144 145 for (i = 0; i < pieces_length; ++i) { 146 buffer.append(pieces[i][current[i]]); 147 } 148 //String result = buffer.toString(); // not needed 149 150 // find next value for next time 151 152 for (i = current_length - 1; ; --i) { 153 if (i < 0) { 154 done = TRUE; 155 break; 156 } 157 current[i]++; 158 if (current[i] < pieces_lengths[i]) break; // got sequence 159 current[i] = 0; 160 } 161 return buffer; 162 } 163 164 /** 165 *@param set the source string to iterate against. This allows the same iterator to be used 166 * while changing the source string, saving object creation. 167 */ 168 void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { 169 int32_t list_length = 0; 170 UChar32 cp = 0; 171 int32_t start = 0; 172 int32_t i = 0; 173 UnicodeString *list = NULL; 174 175 nfd.normalize(newSource, source, status); 176 if(U_FAILURE(status)) { 177 return; 178 } 179 done = FALSE; 180 181 cleanPieces(); 182 183 // catch degenerate case 184 if (newSource.length() == 0) { 185 pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *)); 186 pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 187 pieces_length = 1; 188 current = (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 189 current_length = 1; 190 if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 191 status = U_MEMORY_ALLOCATION_ERROR; 192 goto CleanPartialInitialization; 193 } 194 current[0] = 0; 195 pieces[0] = new UnicodeString[1]; 196 pieces_lengths[0] = 1; 197 if (pieces[0] == 0) { 198 status = U_MEMORY_ALLOCATION_ERROR; 199 goto CleanPartialInitialization; 200 } 201 return; 202 } 203 204 205 list = new UnicodeString[source.length()]; 206 if (list == 0) { 207 status = U_MEMORY_ALLOCATION_ERROR; 208 goto CleanPartialInitialization; 209 } 210 211 // i should initialy be the number of code units at the 212 // start of the string 213 i = U16_LENGTH(source.char32At(0)); 214 //int32_t i = 1; 215 // find the segments 216 // This code iterates through the source string and 217 // extracts segments that end up on a codepoint that 218 // doesn't start any decompositions. (Analysis is done 219 // on the NFD form - see above). 220 for (; i < source.length(); i += U16_LENGTH(cp)) { 221 cp = source.char32At(i); 222 if (nfcImpl.isCanonSegmentStarter(cp)) { 223 source.extract(start, i-start, list[list_length++]); // add up to i 224 start = i; 225 } 226 } 227 source.extract(start, i-start, list[list_length++]); // add last one 228 229 230 // allocate the arrays, and find the strings that are CE to each segment 231 pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *)); 232 pieces_length = list_length; 233 pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 234 current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t)); 235 current_length = list_length; 236 if (pieces == NULL || pieces_lengths == NULL || current == NULL) { 237 status = U_MEMORY_ALLOCATION_ERROR; 238 goto CleanPartialInitialization; 239 } 240 241 for (i = 0; i < current_length; i++) { 242 current[i] = 0; 243 } 244 // for each segment, get all the combinations that can produce 245 // it after NFD normalization 246 for (i = 0; i < pieces_length; ++i) { 247 //if (PROGRESS) printf("SEGMENT\n"); 248 pieces[i] = getEquivalents(list[i], pieces_lengths[i], status); 249 } 250 251 delete[] list; 252 return; 253 // Common section to cleanup all local variables and reset object variables. 254 CleanPartialInitialization: 255 if (list != NULL) { 256 delete[] list; 257 } 258 cleanPieces(); 259 } 260 261 /** 262 * Dumb recursive implementation of permutation. 263 * TODO: optimize 264 * @param source the string to find permutations for 265 * @return the results in a set. 266 */ 267 void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { 268 if(U_FAILURE(status)) { 269 return; 270 } 271 //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 272 int32_t i = 0; 273 274 // optimization: 275 // if zero or one character, just return a set with it 276 // we check for length < 2 to keep from counting code points all the time 277 if (source.length() <= 2 && source.countChar32() <= 1) { 278 UnicodeString *toPut = new UnicodeString(source); 279 /* test for NULL */ 280 if (toPut == 0) { 281 status = U_MEMORY_ALLOCATION_ERROR; 282 return; 283 } 284 result->put(source, toPut, status); 285 return; 286 } 287 288 // otherwise iterate through the string, and recursively permute all the other characters 289 UChar32 cp; 290 Hashtable subpermute(status); 291 if(U_FAILURE(status)) { 292 return; 293 } 294 subpermute.setValueDeleter(uprv_deleteUObject); 295 296 for (i = 0; i < source.length(); i += U16_LENGTH(cp)) { 297 cp = source.char32At(i); 298 const UHashElement *ne = NULL; 299 int32_t el = UHASH_FIRST; 300 UnicodeString subPermuteString = source; 301 302 // optimization: 303 // if the character is canonical combining class zero, 304 // don't permute it 305 if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { 306 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 307 continue; 308 } 309 310 subpermute.removeAll(); 311 312 // see what the permutations of the characters before and after this one are 313 //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 314 permute(subPermuteString.replace(i, U16_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status); 315 /* Test for buffer overflows */ 316 if(U_FAILURE(status)) { 317 return; 318 } 319 // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents 320 // of source at this point. 321 322 // prefix this character to all of them 323 ne = subpermute.nextElement(el); 324 while (ne != NULL) { 325 UnicodeString *permRes = (UnicodeString *)(ne->value.pointer); 326 UnicodeString *chStr = new UnicodeString(cp); 327 //test for NULL 328 if (chStr == NULL) { 329 status = U_MEMORY_ALLOCATION_ERROR; 330 return; 331 } 332 chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer)); 333 //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr)); 334 result->put(*chStr, chStr, status); 335 ne = subpermute.nextElement(el); 336 } 337 } 338 //return result; 339 } 340 341 // privates 342 343 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 344 UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { 345 Hashtable result(status); 346 Hashtable permutations(status); 347 Hashtable basic(status); 348 if (U_FAILURE(status)) { 349 return 0; 350 } 351 result.setValueDeleter(uprv_deleteUObject); 352 permutations.setValueDeleter(uprv_deleteUObject); 353 basic.setValueDeleter(uprv_deleteUObject); 354 355 UChar USeg[256]; 356 int32_t segLen = segment.extract(USeg, 256, status); 357 getEquivalents2(&basic, USeg, segLen, status); 358 359 // now get all the permutations 360 // add only the ones that are canonically equivalent 361 // TODO: optimize by not permuting any class zero. 362 363 const UHashElement *ne = NULL; 364 int32_t el = UHASH_FIRST; 365 //Iterator it = basic.iterator(); 366 ne = basic.nextElement(el); 367 //while (it.hasNext()) 368 while (ne != NULL) { 369 //String item = (String) it.next(); 370 UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 371 372 permutations.removeAll(); 373 permute(item, CANITER_SKIP_ZEROES, &permutations, status); 374 const UHashElement *ne2 = NULL; 375 int32_t el2 = UHASH_FIRST; 376 //Iterator it2 = permutations.iterator(); 377 ne2 = permutations.nextElement(el2); 378 //while (it2.hasNext()) 379 while (ne2 != NULL) { 380 //String possible = (String) it2.next(); 381 //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 382 UnicodeString possible(*((UnicodeString *)(ne2->value.pointer))); 383 UnicodeString attempt; 384 nfd.normalize(possible, attempt, status); 385 386 // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 387 if (attempt==segment) { 388 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 389 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 390 result.put(possible, new UnicodeString(possible), status); //add(possible); 391 } else { 392 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 393 } 394 395 ne2 = permutations.nextElement(el2); 396 } 397 ne = basic.nextElement(el); 398 } 399 400 /* Test for buffer overflows */ 401 if(U_FAILURE(status)) { 402 return 0; 403 } 404 // convert into a String[] to clean up storage 405 //String[] finalResult = new String[result.size()]; 406 UnicodeString *finalResult = NULL; 407 int32_t resultCount; 408 if((resultCount = result.count())) { 409 finalResult = new UnicodeString[resultCount]; 410 if (finalResult == 0) { 411 status = U_MEMORY_ALLOCATION_ERROR; 412 return NULL; 413 } 414 } 415 else { 416 status = U_ILLEGAL_ARGUMENT_ERROR; 417 return NULL; 418 } 419 //result.toArray(finalResult); 420 result_len = 0; 421 el = UHASH_FIRST; 422 ne = result.nextElement(el); 423 while(ne != NULL) { 424 finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer)); 425 ne = result.nextElement(el); 426 } 427 428 429 return finalResult; 430 } 431 432 Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { 433 434 if (U_FAILURE(status)) { 435 return NULL; 436 } 437 438 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 439 440 UnicodeString toPut(segment, segLen); 441 442 fillinResult->put(toPut, new UnicodeString(toPut), status); 443 444 UnicodeSet starts; 445 446 // cycle through all the characters 447 UChar32 cp; 448 for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) { 449 // see if any character is at the start of some decomposition 450 U16_GET(segment, 0, i, segLen, cp); 451 if (!nfcImpl.getCanonStartSet(cp, starts)) { 452 continue; 453 } 454 // if so, see which decompositions match 455 UnicodeSetIterator iter(starts); 456 while (iter.next()) { 457 UChar32 cp2 = iter.getCodepoint(); 458 Hashtable remainder(status); 459 remainder.setValueDeleter(uprv_deleteUObject); 460 if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { 461 continue; 462 } 463 464 // there were some matches, so add all the possibilities to the set. 465 UnicodeString prefix(segment, i); 466 prefix += cp2; 467 468 int32_t el = UHASH_FIRST; 469 const UHashElement *ne = remainder.nextElement(el); 470 while (ne != NULL) { 471 UnicodeString item = *((UnicodeString *)(ne->value.pointer)); 472 UnicodeString *toAdd = new UnicodeString(prefix); 473 /* test for NULL */ 474 if (toAdd == 0) { 475 status = U_MEMORY_ALLOCATION_ERROR; 476 return NULL; 477 } 478 *toAdd += item; 479 fillinResult->put(*toAdd, toAdd, status); 480 481 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 482 483 ne = remainder.nextElement(el); 484 } 485 } 486 } 487 488 /* Test for buffer overflows */ 489 if(U_FAILURE(status)) { 490 return NULL; 491 } 492 return fillinResult; 493 } 494 495 /** 496 * See if the decomposition of cp2 is at segment starting at segmentPos 497 * (with canonical rearrangment!) 498 * If so, take the remainder, and return the equivalents 499 */ 500 Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 501 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 502 //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 503 //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 504 505 if (U_FAILURE(status)) { 506 return NULL; 507 } 508 509 UnicodeString temp(comp); 510 int32_t inputLen=temp.length(); 511 UnicodeString decompString; 512 nfd.normalize(temp, decompString, status); 513 if (U_FAILURE(status)) { 514 return NULL; 515 } 516 if (decompString.isBogus()) { 517 status = U_MEMORY_ALLOCATION_ERROR; 518 return NULL; 519 } 520 const UChar *decomp=decompString.getBuffer(); 521 int32_t decompLen=decompString.length(); 522 523 // See if it matches the start of segment (at segmentPos) 524 UBool ok = FALSE; 525 UChar32 cp; 526 int32_t decompPos = 0; 527 UChar32 decompCp; 528 U16_NEXT(decomp, decompPos, decompLen, decompCp); 529 530 int32_t i = segmentPos; 531 while(i < segLen) { 532 U16_NEXT(segment, i, segLen, cp); 533 534 if (cp == decompCp) { // if equal, eat another cp from decomp 535 536 //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp)))); 537 538 if (decompPos == decompLen) { // done, have all decomp characters! 539 temp.append(segment+i, segLen-i); 540 ok = TRUE; 541 break; 542 } 543 U16_NEXT(decomp, decompPos, decompLen, decompCp); 544 } else { 545 //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 546 547 // brute force approach 548 temp.append(cp); 549 550 /* TODO: optimize 551 // since we know that the classes are monotonically increasing, after zero 552 // e.g. 0 5 7 9 0 3 553 // we can do an optimization 554 // there are only a few cases that work: zero, less, same, greater 555 // if both classes are the same, we fail 556 // if the decomp class < the segment class, we fail 557 558 segClass = getClass(cp); 559 if (decompClass <= segClass) return null; 560 */ 561 } 562 } 563 if (!ok) 564 return NULL; // we failed, characters left over 565 566 //if (PROGRESS) printf("Matches\n"); 567 568 if (inputLen == temp.length()) { 569 fillinResult->put(UnicodeString(), new UnicodeString(), status); 570 return fillinResult; // succeed, but no remainder 571 } 572 573 // brute force approach 574 // check to make sure result is canonically equivalent 575 UnicodeString trial; 576 nfd.normalize(temp, trial, status); 577 if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { 578 return NULL; 579 } 580 581 return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status); 582 } 583 584 U_NAMESPACE_END 585 586 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 587