Home | History | Annotate | Download | only in intltest
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 1999-2014, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   12/09/99    aliu        Ported from Java.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_COLLATION
     14 
     15 #include "thcoll.h"
     16 #include "unicode/utypes.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/localpointer.h"
     19 #include "unicode/sortkey.h"
     20 #include "unicode/tblcoll.h"
     21 #include "unicode/ustring.h"
     22 #include "cstring.h"
     23 #include "filestrm.h"
     24 #include "textfile.h"
     25 
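/**
 * Historical name of the dictionary file for the TestDictionary test, which
 * expects its data file to be present in the directory
 * $ICU/source/test/testdata.  The macro below is disabled; TestDictionary
 * currently opens "riwords.txt" with the "UTF8" encoding directly.
 */
//#define TEST_FILE           "th18057.txt"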
     31 
/**
 * Upper bound on the number of individual comparison failures that
 * TestDictionary reports in detail.  If this number is < 0, all
 * failures are shown.
 */
#define MAX_FAILURES_TO_SHOW -1
     37 
     38 CollationThaiTest::CollationThaiTest() {
     39     UErrorCode status = U_ZERO_ERROR;
     40     coll = Collator::createInstance(Locale("th", "TH", ""), status);
     41     if (coll && U_SUCCESS(status)) {
     42         //coll->setStrength(Collator::TERTIARY);
     43     } else {
     44         delete coll;
     45         coll = 0;
     46     }
     47 }
     48 
/**
 * Releases the collator owned by this test object.  `coll` may be null
 * when the constructor could not create it; deleting null is a no-op.
 */
CollationThaiTest::~CollationThaiTest() {
    delete coll;
}
     52 
     53 void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &name,
     54                                        char* /*par*/) {
     55 
     56     if((!coll) && exec) {
     57       dataerrln(__FILE__ " cannot test - failed to create collator.");
     58       name = "some test";
     59       return;
     60     }
     61 
     62     switch (index) {
     63         TESTCASE(0,TestDictionary);
     64         TESTCASE(1,TestCornerCases);
     65         TESTCASE(2,TestNamesList);
     66         TESTCASE(3,TestInvalidThai);
     67         TESTCASE(4,TestReordering);
     68         default: name = ""; break;
     69     }
     70 }
     71 
     72 /**
     73  * Read the external names list, and confirms that the collator
     74  * gets the same results when comparing lines one to another
     75  * using regular and iterative comparison.
     76  */
     77 void CollationThaiTest::TestNamesList(void) {
     78     if (coll == 0) {
     79         errln("Error: could not construct Thai collator");
     80         return;
     81     }
     82 
     83     UErrorCode ec = U_ZERO_ERROR;
     84     TextFile names("TestNames_Thai.txt", "UTF16LE", ec);
     85     if (U_FAILURE(ec)) {
     86         logln("Can't open TestNames_Thai.txt: %s; skipping test",
     87               u_errorName(ec));
     88         return;
     89     }
     90 
     91     //
     92     // Loop through each word in the dictionary and compare it to the previous
     93     // word.  They should be in sorted order.
     94     //
     95     UnicodeString lastWord, word;
     96     //int32_t failed = 0;
     97     int32_t wordCount = 0;
     98     while (names.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
     99 
    100         // Show the first 8 words being compared, so we can see what's happening
    101         ++wordCount;
    102         if (wordCount <= 8) {
    103             UnicodeString str;
    104             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
    105         }
    106 
    107         if (lastWord.length() > 0) {
    108             Collator::EComparisonResult result = coll->compare(lastWord, word);
    109             doTest(coll, lastWord, word, result);
    110         }
    111         lastWord = word;
    112     }
    113 
    114     assertSuccess("readLine", ec);
    115 
    116     logln((UnicodeString)"Words checked: " + wordCount);
    117 }
    118 
    119 /**
    120  * Read the external dictionary file, which is already in proper
    121  * sorted order, and confirm that the collator compares each line as
    122  * preceding the following line.
    123  */
    124 void CollationThaiTest::TestDictionary(void) {
    125     if (coll == 0) {
    126         errln("Error: could not construct Thai collator");
    127         return;
    128     }
    129 
    130     UErrorCode ec = U_ZERO_ERROR;
    131     TextFile riwords("riwords.txt", "UTF8", ec);
    132     if (U_FAILURE(ec)) {
    133         logln("Can't open riwords.txt: %s; skipping test",
    134               u_errorName(ec));
    135         return;
    136     }
    137 
    138     //
    139     // Loop through each word in the dictionary and compare it to the previous
    140     // word.  They should be in sorted order.
    141     //
    142     UnicodeString lastWord, word;
    143     int32_t failed = 0;
    144     int32_t wordCount = 0;
    145     while (riwords.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
    146 
    147         // Show the first 8 words being compared, so we can see what's happening
    148         ++wordCount;
    149         if (wordCount <= 8) {
    150             UnicodeString str;
    151             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
    152         }
    153 
    154         if (lastWord.length() > 0) {
    155             int32_t result = coll->compare(lastWord, word);
    156 
    157             if (result > 0) {
    158                 failed++;
    159                 if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
    160                     UnicodeString str;
    161                     UnicodeString msg =
    162                         UnicodeString("--------------------------------------------\n")
    163                         + riwords.getLineNumber()
    164                         + " compare(" + IntlTest::prettify(lastWord, str);
    165                     msg += UnicodeString(", ")
    166                         + IntlTest::prettify(word, str) + ") returned " + result
    167                         + ", expected -1\n";
    168                     UErrorCode status = U_ZERO_ERROR;
    169                     CollationKey k1, k2;
    170                     coll->getCollationKey(lastWord, k1, status);
    171                     coll->getCollationKey(word, k2, status);
    172                     if (U_FAILURE(status)) {
    173                         errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    174                         return;
    175                     }
    176                     msg.append("key1: ").append(prettify(k1, str)).append("\n");
    177                     msg.append("key2: ").append(prettify(k2, str));
    178                     errln(msg);
    179                 }
    180             }
    181         }
    182         lastWord = word;
    183     }
    184 
    185     assertSuccess("readLine", ec);
    186 
    187     if (failed != 0) {
    188         if (failed > MAX_FAILURES_TO_SHOW) {
    189             errln((UnicodeString)"Too many failures; only the first " +
    190                   MAX_FAILURES_TO_SHOW + " failures were shown");
    191         }
    192         errln((UnicodeString)"Summary: " + failed + " of " + (riwords.getLineNumber() - 1) +
    193               " comparisons failed");
    194     }
    195 
    196     logln((UnicodeString)"Words checked: " + wordCount);
    197 }
    198 
    199 /**
    200  * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
    201  * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
    202  */
    203 void CollationThaiTest::TestCornerCases(void) {
    204     const char* TESTS[] = {
    205         // Shorter words precede longer
    206         "\\u0e01",                               "<",    "\\u0e01\\u0e01",
    207 
    208         // Tone marks are considered after letters (i.e. are primary ignorable)
    209         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e49\\u0e32",
    210 
    211         // ditto for other over-marks
    212         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e4c",
    213 
    214         // commonly used mark-in-context order.
    215         // In effect, marks are sorted after each syllable.
    216         "\\u0e01\\u0e32\\u0e01\\u0e49\\u0e32",   "<",    "\\u0e01\\u0e48\\u0e32\\u0e01\\u0e49\\u0e32",
    217 
    218         // Hyphens and other punctuation follow whitespace but come before letters
    219         "\\u0e01\\u0e32",                        "=",    "\\u0e01\\u0e32-",
    220         "\\u0e01\\u0e32-",                       "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
    221 
    222         // Doubler follows an indentical word without the doubler
    223         "\\u0e01\\u0e32",                        "=",    "\\u0e01\\u0e32\\u0e46",
    224         "\\u0e01\\u0e32\\u0e46",                 "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
    225 
    226 
    227         // \\u0e45 after either \\u0e24 or \\u0e26 is treated as a single
    228         // combining character, similar to "c < ch" in traditional spanish.
    229         // TODO: beef up this case
    230         "\\u0e24\\u0e29\\u0e35",                 "<",    "\\u0e24\\u0e45\\u0e29\\u0e35",
    231         "\\u0e26\\u0e29\\u0e35",                 "<",    "\\u0e26\\u0e45\\u0e29\\u0e35",
    232 
    233         // Vowels reorder, should compare \\u0e2d and \\u0e34
    234         "\\u0e40\\u0e01\\u0e2d",                 "<",    "\\u0e40\\u0e01\\u0e34",
    235 
    236         // Tones are compared after the rest of the word (e.g. primary ignorable)
    237         "\\u0e01\\u0e32\\u0e01\\u0e48\\u0e32",   "<",    "\\u0e01\\u0e49\\u0e32\\u0e01\\u0e32",
    238 
    239         // Periods are ignored entirely
    240         "\\u0e01.\\u0e01.",                      "<",    "\\u0e01\\u0e32",
    241     };
    242     const int32_t TESTS_length = (int32_t)(sizeof(TESTS)/sizeof(TESTS[0]));
    243 
    244     if (coll == 0) {
    245         errln("Error: could not construct Thai collator");
    246         return;
    247     }
    248     compareArray(*coll, TESTS, TESTS_length);
    249 }
    250 
    251 //------------------------------------------------------------------------
    252 // Internal utilities
    253 //------------------------------------------------------------------------
    254 
    255 void CollationThaiTest::compareArray(Collator& c, const char* tests[],
    256                                      int32_t testsLength) {
    257     for (int32_t i = 0; i < testsLength; i += 3) {
    258 
    259         Collator::EComparisonResult expect;
    260         if (tests[i+1][0] == '<') {
    261           expect = Collator::LESS;
    262         } else if (tests[i+1][0] == '>') {
    263           expect = Collator::GREATER;
    264         } else if (tests[i+1][0] == '=') {
    265           expect = Collator::EQUAL;
    266         } else {
    267             // expect = Integer.decode(tests[i+1]).intValue();
    268             errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
    269             return;
    270         }
    271 
    272         UnicodeString s1, s2;
    273         parseChars(s1, tests[i]);
    274         parseChars(s2, tests[i+2]);
    275 
    276         doTest(&c, s1, s2, expect);
    277 #if 0
    278         UErrorCode status = U_ZERO_ERROR;
    279         int32_t result = c.compare(s1, s2);
    280         if (sign(result) != sign(expect))
    281         {
    282             UnicodeString t1, t2;
    283             errln(UnicodeString("") +
    284                   i/3 + ": compare(" + IntlTest::prettify(s1, t1)
    285                   + " , " + IntlTest::prettify(s2, t2)
    286                   + ") got " + result + "; expected " + expect);
    287 
    288             CollationKey k1, k2;
    289             c.getCollationKey(s1, k1, status);
    290             c.getCollationKey(s2, k2, status);
    291             if (U_FAILURE(status)) {
    292                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    293                 return;
    294             }
    295             errln((UnicodeString)"  key1: " + prettify(k1, t1) );
    296             errln((UnicodeString)"  key2: " + prettify(k2, t2) );
    297         }
    298         else
    299         {
    300             // Collator.compare worked OK; now try the collation keys
    301             CollationKey k1, k2;
    302             c.getCollationKey(s1, k1, status);
    303             c.getCollationKey(s2, k2, status);
    304             if (U_FAILURE(status)) {
    305                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    306                 return;
    307             }
    308 
    309             result = k1.compareTo(k2);
    310             if (sign(result) != sign(expect)) {
    311                 UnicodeString t1, t2;
    312                 errln(UnicodeString("") +
    313                       i/3 + ": key(" + IntlTest::prettify(s1, t1)
    314                       + ").compareTo(key(" + IntlTest::prettify(s2, t2)
    315                       + ")) got " + result + "; expected " + expect);
    316 
    317                 errln((UnicodeString)"  " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
    318             }
    319         }
    320 #endif
    321     }
    322 }
    323 
    324 int8_t CollationThaiTest::sign(int32_t i) {
    325     if (i < 0) return -1;
    326     if (i > 0) return 1;
    327     return 0;
    328 }
    329 
    330 /**
    331  * Set a UnicodeString corresponding to the given string.  Use
    332  * UnicodeString and the default converter, unless we see the sequence
    333  * "\\u", in which case we interpret the subsequent escape.
    334  */
    335 UnicodeString& CollationThaiTest::parseChars(UnicodeString& result,
    336                                              const char* chars) {
    337     return result = CharsToUnicodeString(chars);
    338 }
    339 
    340 UCollator *thaiColl = NULL;
    341 
    342 U_CDECL_BEGIN
    343 static int U_CALLCONV
    344 StrCmp(const void *p1, const void *p2) {
    345   return ucol_strcoll(thaiColl, *(UChar **) p1, -1,  *(UChar **)p2, -1);
    346 }
    347 U_CDECL_END
    348 
    349 
    350 #define LINES 6
    351 
    352 void CollationThaiTest::TestInvalidThai(void) {
    353   const char *tests[LINES] = {
    354     "\\u0E44\\u0E01\\u0E44\\u0E01",
    355     "\\u0E44\\u0E01\\u0E01\\u0E44",
    356     "\\u0E01\\u0E44\\u0E01\\u0E44",
    357     "\\u0E01\\u0E01\\u0E44\\u0E44",
    358     "\\u0E44\\u0E44\\u0E01\\u0E01",
    359     "\\u0E01\\u0E44\\u0E44\\u0E01",
    360   };
    361 
    362   UChar strings[LINES][20];
    363 
    364   UChar *toSort[LINES];
    365 
    366   int32_t i = 0, j = 0, len = 0;
    367 
    368   UErrorCode coll_status = U_ZERO_ERROR;
    369   UnicodeString iteratorText;
    370 
    371   thaiColl = ucol_open ("th_TH", &coll_status);
    372   if (U_FAILURE(coll_status)) {
    373     errln("Error opening Thai collator: %s", u_errorName(coll_status));
    374     return;
    375   }
    376 
    377   CollationElementIterator* c = ((RuleBasedCollator *)coll)->createCollationElementIterator( iteratorText );
    378 
    379   for(i = 0; i < (int32_t)(sizeof(tests)/sizeof(tests[0])); i++) {
    380     len = u_unescape(tests[i], strings[i], 20);
    381     strings[i][len] = 0;
    382     toSort[i] = strings[i];
    383   }
    384 
    385   qsort (toSort, LINES, sizeof (UChar *), StrCmp);
    386 
    387   for (i=0; i < LINES; i++)
    388   {
    389     logln("%i", i);
    390       for (j=i+1; j < LINES; j++) {
    391           if (ucol_strcoll (thaiColl, toSort[i], -1, toSort[j], -1) == UCOL_GREATER)
    392           {
    393               // inconsistency ordering found!
    394             errln("Inconsistent ordering between strings %i and %i", i, j);
    395           }
    396       }
    397       iteratorText.setTo(toSort[i]);
    398       c->setText(iteratorText, coll_status);
    399       backAndForth(*c);
    400   }
    401 
    402 
    403   ucol_close(thaiColl);
    404   delete c;
    405 }
    406 
    407 void CollationThaiTest::TestReordering(void) {
    408   // Until UCA 4.1, the collation code swapped Thai/Lao prevowels with the following consonants,
    409   // resulting in consonant+prevowel == prevowel+consonant.
    410   // From UCA 5.0 on, there are order-reversing contractions for prevowel+consonant.
    411   // From UCA 5.0 until UCA 6.1, there was a tertiary difference between
    412   // consonant+prevowel and prevowel+consonant.
    413   // In UCA 6.2, they compare equal again.
    414   // The test was modified to using a collator with strength=secondary,
    415   // ignoring possible tertiary differences.
    416   const char *tests[] = {
    417     "\\u0E41c\\u0301",       "=", "\\u0E41\\u0107", // composition
    418     "\\u0E41\\U0001D7CE",    "<", "\\u0E41\\U0001D7CF", // supplementaries
    419     "\\u0E41\\U0001D15F",    "=", "\\u0E41\\U0001D158\\U0001D165", // supplementary composition decomps to supplementary
    420     "\\u0E41\\U0002F802",    "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
    421     "\\u0E41\\u0301",        "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
    422     "\\u0E41\\u0301\\u0316", "=", "\\u0E41\\u0316\\u0301",
    423 
    424     "\\u0e24\\u0e41",        "=", "\\u0e41\\u0e24", // exiting contraction bug
    425     "\\u0e3f\\u0e3f\\u0e24\\u0e41", "=", "\\u0e3f\\u0e3f\\u0e41\\u0e24",
    426 
    427     "abc\\u0E41c\\u0301",       "=", "abc\\u0E41\\u0107", // composition
    428     "abc\\u0E41\\U0001D000",    "<", "abc\\u0E41\\U0001D001", // supplementaries
    429     "abc\\u0E41\\U0001D15F",    "=", "abc\\u0E41\\U0001D158\\U0001D165", // supplementary composition decomps to supplementary
    430     "abc\\u0E41\\U0002F802",    "=", "abc\\u0E41\\u4E41", // supplementary composition decomps to BMP
    431     "abc\\u0E41\\u0301",        "=", "abc\\u0E41\\u0301", // unsafe (just checking backwards iteration)
    432     "abc\\u0E41\\u0301\\u0316", "=", "abc\\u0E41\\u0316\\u0301",
    433 
    434     "\\u0E41c\\u0301abc",       "=", "\\u0E41\\u0107abc", // composition
    435     "\\u0E41\\U0001D000abc",    "<", "\\u0E41\\U0001D001abc", // supplementaries
    436     "\\u0E41\\U0001D15Fabc",    "=", "\\u0E41\\U0001D158\\U0001D165abc", // supplementary composition decomps to supplementary
    437     "\\u0E41\\U0002F802abc",    "=", "\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
    438     "\\u0E41\\u0301abc",        "=", "\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
    439     "\\u0E41\\u0301\\u0316abc", "=", "\\u0E41\\u0316\\u0301abc",
    440 
    441     "abc\\u0E41c\\u0301abc",       "=", "abc\\u0E41\\u0107abc", // composition
    442     "abc\\u0E41\\U0001D000abc",    "<", "abc\\u0E41\\U0001D001abc", // supplementaries
    443     "abc\\u0E41\\U0001D15Fabc",    "=", "abc\\u0E41\\U0001D158\\U0001D165abc", // supplementary composition decomps to supplementary
    444     "abc\\u0E41\\U0002F802abc",    "=", "abc\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
    445     "abc\\u0E41\\u0301abc",        "=", "abc\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
    446     "abc\\u0E41\\u0301\\u0316abc", "=", "abc\\u0E41\\u0316\\u0301abc",
    447   };
    448 
    449   LocalPointer<Collator> coll2(coll->clone());
    450   UErrorCode status = U_ZERO_ERROR;
    451   coll2->setAttribute(UCOL_STRENGTH, UCOL_SECONDARY, status);
    452   if(U_FAILURE(status)) {
    453     errln("Unable to set the Thai collator clone to secondary strength");
    454     return;
    455   }
    456   compareArray(*coll2, tests, sizeof(tests)/sizeof(tests[0]));
    457 
    458   const char *rule = "& c < ab";
    459   const char *testcontraction[] = { "\\u0E41ab", ">", "\\u0E41c"}; // After UCA 4.1 Thai are normal so won't break a contraction
    460   UnicodeString rules;
    461   parseChars(rules, rule);
    462   LocalPointer<RuleBasedCollator> rcoll(new RuleBasedCollator(rules, status), status);
    463   if(U_SUCCESS(status)) {
    464     compareArray(*rcoll, testcontraction, 3);
    465   } else {
    466     errln("Couldn't instantiate collator from rules");
    467   }
    468 
    469 }
    470 
    471 
    472 #endif /* #if !UCONFIG_NO_COLLATION */
    473