Home | History | Annotate | Download | only in intltest
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 1999-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   12/09/99    aliu        Ported from Java.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_COLLATION
     14 
     15 #include "thcoll.h"
     16 #include "unicode/utypes.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/sortkey.h"
     19 #include "unicode/ustring.h"
     20 #include "cstring.h"
     21 #include "filestrm.h"
     22 #include "textfile.h"
     23 
     24 /**
     25  * The TestDictionary test expects a file of this name, with this
     26  * encoding, to be present in the directory $ICU/source/test/testdata.
     27  */
     28 //#define TEST_FILE           "th18057.txt"
     29 
     30 /**
     31  * This is the most failures we show in TestDictionary.  If this number
     32  * is < 0, we show all failures.
     33  */
     34 #define MAX_FAILURES_TO_SHOW -1
     35 
     36 CollationThaiTest::CollationThaiTest() {
     37     UErrorCode status = U_ZERO_ERROR;
     38     coll = Collator::createInstance(Locale("th", "TH", ""), status);
     39     if (coll && U_SUCCESS(status)) {
     40         //coll->setStrength(Collator::TERTIARY);
     41     } else {
     42         delete coll;
     43         coll = 0;
     44     }
     45 }
     46 
     47 CollationThaiTest::~CollationThaiTest() {
     48     delete coll;
     49 }
     50 
     51 void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &name,
     52                                        char* /*par*/) {
     53 
     54     if((!coll) && exec) {
     55       dataerrln(__FILE__ " cannot test - failed to create collator.");
     56       name = "some test";
     57       return;
     58     }
     59 
     60     switch (index) {
     61         TESTCASE(0,TestDictionary);
     62         TESTCASE(1,TestCornerCases);
     63         TESTCASE(2,TestNamesList);
     64         TESTCASE(3,TestInvalidThai);
     65         TESTCASE(4,TestReordering);
     66         default: name = ""; break;
     67     }
     68 }
     69 
     70 /**
     71  * Read the external names list, and confirms that the collator
     72  * gets the same results when comparing lines one to another
     73  * using regular and iterative comparison.
     74  */
     75 void CollationThaiTest::TestNamesList(void) {
     76     if (coll == 0) {
     77         errln("Error: could not construct Thai collator");
     78         return;
     79     }
     80 
     81     UErrorCode ec = U_ZERO_ERROR;
     82     TextFile names("TestNames_Thai.txt", "UTF16LE", ec);
     83     if (U_FAILURE(ec)) {
     84         logln("Can't open TestNames_Thai.txt: %s; skipping test",
     85               u_errorName(ec));
     86         return;
     87     }
     88 
     89     //
     90     // Loop through each word in the dictionary and compare it to the previous
     91     // word.  They should be in sorted order.
     92     //
     93     UnicodeString lastWord, word;
     94     //int32_t failed = 0;
     95     int32_t wordCount = 0;
     96     while (names.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
     97 
     98         // Show the first 8 words being compared, so we can see what's happening
     99         ++wordCount;
    100         if (wordCount <= 8) {
    101             UnicodeString str;
    102             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
    103         }
    104 
    105         if (lastWord.length() > 0) {
    106             Collator::EComparisonResult result = coll->compare(lastWord, word);
    107             doTest(coll, lastWord, word, result);
    108         }
    109         lastWord = word;
    110     }
    111 
    112     assertSuccess("readLine", ec);
    113 
    114     logln((UnicodeString)"Words checked: " + wordCount);
    115 }
    116 
    117 /**
    118  * Read the external dictionary file, which is already in proper
    119  * sorted order, and confirm that the collator compares each line as
    120  * preceding the following line.
    121  */
    122 void CollationThaiTest::TestDictionary(void) {
    123     if (coll == 0) {
    124         errln("Error: could not construct Thai collator");
    125         return;
    126     }
    127 
    128     UErrorCode ec = U_ZERO_ERROR;
    129     TextFile riwords("riwords.txt", "UTF8", ec);
    130     if (U_FAILURE(ec)) {
    131         logln("Can't open riwords.txt: %s; skipping test",
    132               u_errorName(ec));
    133         return;
    134     }
    135 
    136     //
    137     // Loop through each word in the dictionary and compare it to the previous
    138     // word.  They should be in sorted order.
    139     //
    140     UnicodeString lastWord, word;
    141     int32_t failed = 0;
    142     int32_t wordCount = 0;
    143     while (riwords.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
    144 
    145         // Show the first 8 words being compared, so we can see what's happening
    146         ++wordCount;
    147         if (wordCount <= 8) {
    148             UnicodeString str;
    149             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
    150         }
    151 
    152         if (lastWord.length() > 0) {
    153             // line enabled for j2720
    154             doTest(coll, lastWord, word, Collator::LESS);
    155             int32_t result = coll->compare(lastWord, word);
    156 
    157             if (result >= 0) {
    158                 failed++;
    159                 if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
    160                     UnicodeString str;
    161                     UnicodeString msg =
    162                         UnicodeString("--------------------------------------------\n")
    163                         + riwords.getLineNumber()
    164                         + " compare(" + IntlTest::prettify(lastWord, str);
    165                     msg += UnicodeString(", ")
    166                         + IntlTest::prettify(word, str) + ") returned " + result
    167                         + ", expected -1\n";
    168                     UErrorCode status = U_ZERO_ERROR;
    169                     CollationKey k1, k2;
    170                     coll->getCollationKey(lastWord, k1, status);
    171                     coll->getCollationKey(word, k2, status);
    172                     if (U_FAILURE(status)) {
    173                         errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    174                         return;
    175                     }
    176                     msg.append("key1: ").append(prettify(k1, str)).append("\n");
    177                     msg.append("key2: ").append(prettify(k2, str));
    178                     errln(msg);
    179                 }
    180             }
    181         }
    182         lastWord = word;
    183     }
    184 
    185     assertSuccess("readLine", ec);
    186 
    187     if (failed != 0) {
    188         if (failed > MAX_FAILURES_TO_SHOW) {
    189             errln((UnicodeString)"Too many failures; only the first " +
    190                   MAX_FAILURES_TO_SHOW + " failures were shown");
    191         }
    192         errln((UnicodeString)"Summary: " + failed + " of " + (riwords.getLineNumber() - 1) +
    193               " comparisons failed");
    194     }
    195 
    196     logln((UnicodeString)"Words checked: " + wordCount);
    197 }
    198 
    199 /**
    200  * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
    201  * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
    202  */
    203 void CollationThaiTest::TestCornerCases(void) {
    204     const char* TESTS[] = {
    205         // Shorter words precede longer
    206         "\\u0e01",                               "<",    "\\u0e01\\u0e01",
    207 
    208         // Tone marks are considered after letters (i.e. are primary ignorable)
    209         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e49\\u0e32",
    210 
    211         // ditto for other over-marks
    212         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e4c",
    213 
    214         // commonly used mark-in-context order.
    215         // In effect, marks are sorted after each syllable.
    216         "\\u0e01\\u0e32\\u0e01\\u0e49\\u0e32",   "<",    "\\u0e01\\u0e48\\u0e32\\u0e01\\u0e49\\u0e32",
    217 
    218         // Hyphens and other punctuation follow whitespace but come before letters
    219         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32-",
    220         "\\u0e01\\u0e32-",                       "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
    221 
    222         // Doubler follows an indentical word without the doubler
    223         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e46",
    224         "\\u0e01\\u0e32\\u0e46",                 "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
    225 
    226 
    227         // \\u0e45 after either \\u0e24 or \\u0e26 is treated as a single
    228         // combining character, similar to "c < ch" in traditional spanish.
    229         // TODO: beef up this case
    230         "\\u0e24\\u0e29\\u0e35",                 "<",    "\\u0e24\\u0e45\\u0e29\\u0e35",
    231         "\\u0e26\\u0e29\\u0e35",                 "<",    "\\u0e26\\u0e45\\u0e29\\u0e35",
    232 
    233         // Vowels reorder, should compare \\u0e2d and \\u0e34
    234         "\\u0e40\\u0e01\\u0e2d",                 "<",    "\\u0e40\\u0e01\\u0e34",
    235 
    236         // Tones are compared after the rest of the word (e.g. primary ignorable)
    237         "\\u0e01\\u0e32\\u0e01\\u0e48\\u0e32",   "<",    "\\u0e01\\u0e49\\u0e32\\u0e01\\u0e32",
    238 
    239         // Periods are ignored entirely
    240         "\\u0e01.\\u0e01.",                      "<",    "\\u0e01\\u0e32",
    241     };
    242     const int32_t TESTS_length = (int32_t)(sizeof(TESTS)/sizeof(TESTS[0]));
    243 
    244     if (coll == 0) {
    245         errln("Error: could not construct Thai collator");
    246         return;
    247     }
    248     compareArray(*coll, TESTS, TESTS_length);
    249 }
    250 
    251 //------------------------------------------------------------------------
    252 // Internal utilities
    253 //------------------------------------------------------------------------
    254 
    255 void CollationThaiTest::compareArray(Collator& c, const char* tests[],
    256                                      int32_t testsLength) {
    257     for (int32_t i = 0; i < testsLength; i += 3) {
    258 
    259         Collator::EComparisonResult expect;
    260         if (tests[i+1][0] == '<') {
    261           expect = Collator::LESS;
    262         } else if (tests[i+1][0] == '>') {
    263           expect = Collator::GREATER;
    264         } else if (tests[i+1][0] == '=') {
    265           expect = Collator::EQUAL;
    266         } else {
    267             // expect = Integer.decode(tests[i+1]).intValue();
    268             errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
    269             return;
    270         }
    271 
    272         UnicodeString s1, s2;
    273         parseChars(s1, tests[i]);
    274         parseChars(s2, tests[i+2]);
    275 
    276         doTest(&c, s1, s2, expect);
    277 #if 0
    278         UErrorCode status = U_ZERO_ERROR;
    279         int32_t result = c.compare(s1, s2);
    280         if (sign(result) != sign(expect))
    281         {
    282             UnicodeString t1, t2;
    283             errln(UnicodeString("") +
    284                   i/3 + ": compare(" + IntlTest::prettify(s1, t1)
    285                   + " , " + IntlTest::prettify(s2, t2)
    286                   + ") got " + result + "; expected " + expect);
    287 
    288             CollationKey k1, k2;
    289             c.getCollationKey(s1, k1, status);
    290             c.getCollationKey(s2, k2, status);
    291             if (U_FAILURE(status)) {
    292                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    293                 return;
    294             }
    295             errln((UnicodeString)"  key1: " + prettify(k1, t1) );
    296             errln((UnicodeString)"  key2: " + prettify(k2, t2) );
    297         }
    298         else
    299         {
    300             // Collator.compare worked OK; now try the collation keys
    301             CollationKey k1, k2;
    302             c.getCollationKey(s1, k1, status);
    303             c.getCollationKey(s2, k2, status);
    304             if (U_FAILURE(status)) {
    305                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    306                 return;
    307             }
    308 
    309             result = k1.compareTo(k2);
    310             if (sign(result) != sign(expect)) {
    311                 UnicodeString t1, t2;
    312                 errln(UnicodeString("") +
    313                       i/3 + ": key(" + IntlTest::prettify(s1, t1)
    314                       + ").compareTo(key(" + IntlTest::prettify(s2, t2)
    315                       + ")) got " + result + "; expected " + expect);
    316 
    317                 errln((UnicodeString)"  " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
    318             }
    319         }
    320 #endif
    321     }
    322 }
    323 
    324 int8_t CollationThaiTest::sign(int32_t i) {
    325     if (i < 0) return -1;
    326     if (i > 0) return 1;
    327     return 0;
    328 }
    329 
    330 /**
    331  * Set a UnicodeString corresponding to the given string.  Use
    332  * UnicodeString and the default converter, unless we see the sequence
    333  * "\\u", in which case we interpret the subsequent escape.
    334  */
    335 UnicodeString& CollationThaiTest::parseChars(UnicodeString& result,
    336                                              const char* chars) {
    337     return result = CharsToUnicodeString(chars);
    338 }
    339 
    340 UCollator *thaiColl = NULL;
    341 
    342 U_CDECL_BEGIN
    343 static int U_CALLCONV
    344 StrCmp(const void *p1, const void *p2) {
    345   return ucol_strcoll(thaiColl, *(UChar **) p1, -1,  *(UChar **)p2, -1);
    346 }
    347 U_CDECL_END
    348 
    349 
    350 #define LINES 6
    351 
    352 void CollationThaiTest::TestInvalidThai(void) {
    353   const char *tests[LINES] = {
    354     "\\u0E44\\u0E01\\u0E44\\u0E01",
    355     "\\u0E44\\u0E01\\u0E01\\u0E44",
    356     "\\u0E01\\u0E44\\u0E01\\u0E44",
    357     "\\u0E01\\u0E01\\u0E44\\u0E44",
    358     "\\u0E44\\u0E44\\u0E01\\u0E01",
    359     "\\u0E01\\u0E44\\u0E44\\u0E01",
    360   };
    361 
    362   UChar strings[LINES][20];
    363 
    364   UChar *toSort[LINES];
    365 
    366   int32_t i = 0, j = 0, len = 0;
    367 
    368   UErrorCode coll_status = U_ZERO_ERROR;
    369   UnicodeString iteratorText;
    370 
    371   thaiColl = ucol_open ("th_TH", &coll_status);
    372   if (U_FAILURE(coll_status)) {
    373     errln("Error opening Thai collator: %s", u_errorName(coll_status));
    374     return;
    375   }
    376 
    377   CollationElementIterator* c = ((RuleBasedCollator *)coll)->createCollationElementIterator( iteratorText );
    378 
    379   for(i = 0; i < (int32_t)(sizeof(tests)/sizeof(tests[0])); i++) {
    380     len = u_unescape(tests[i], strings[i], 20);
    381     strings[i][len] = 0;
    382     toSort[i] = strings[i];
    383   }
    384 
    385   qsort (toSort, LINES, sizeof (UChar *), StrCmp);
    386 
    387   for (i=0; i < LINES; i++)
    388   {
    389     logln("%i", i);
    390       for (j=i+1; j < LINES; j++) {
    391           if (ucol_strcoll (thaiColl, toSort[i], -1, toSort[j], -1) == UCOL_GREATER)
    392           {
    393               // inconsistency ordering found!
    394             errln("Inconsistent ordering between strings %i and %i", i, j);
    395           }
    396       }
    397       iteratorText.setTo(toSort[i]);
    398       c->setText(iteratorText, coll_status);
    399       backAndForth(*c);
    400   }
    401 
    402 
    403   ucol_close(thaiColl);
    404   delete c;
    405 }
    406 
    407 void CollationThaiTest::TestReordering(void) {
    408   const char *tests[] = {
    409                           "\\u0E41c\\u0301",       "=", "\\u0E41\\u0107", // composition
    410                           "\\u0E41\\uD835\\uDFCE", "<", "\\u0E41\\uD835\\uDFCF", // supplementaries
    411                           "\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
    412                           "\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
    413                           "\\u0E41\\u0301",        "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
    414                           "\\u0E41\\u0301\\u0316", "=", "\\u0E41\\u0316\\u0301",
    415                           // after UCA 4.1, the two lines below are not equal anymore do not have equal sign
    416                           "\\u0e24\\u0e41",        "<", "\\u0e41\\u0e24", // exiting contraction bug
    417                           "\\u0e3f\\u0e3f\\u0e24\\u0e41", "<", "\\u0e3f\\u0e3f\\u0e41\\u0e24",
    418 
    419                           "abc\\u0E41c\\u0301",       "=", "abc\\u0E41\\u0107", // composition
    420                           "abc\\u0E41\\uD834\\uDC00", "<", "abc\\u0E41\\uD834\\uDC01", // supplementaries
    421                           "abc\\u0E41\\uD834\\uDD5F", "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
    422                           "abc\\u0E41\\uD87E\\uDC02", "=", "abc\\u0E41\\u4E41", // supplementary composition decomps to BMP
    423                           "abc\\u0E41\\u0301",        "=", "abc\\u0E41\\u0301", // unsafe (just checking backwards iteration)
    424                           "abc\\u0E41\\u0301\\u0316", "=", "abc\\u0E41\\u0316\\u0301",
    425 
    426                           "\\u0E41c\\u0301abc",       "=", "\\u0E41\\u0107abc", // composition
    427                           "\\u0E41\\uD834\\uDC00abc", "<", "\\u0E41\\uD834\\uDC01abc", // supplementaries
    428                           "\\u0E41\\uD834\\uDD5Fabc", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
    429                           "\\u0E41\\uD87E\\uDC02abc", "=", "\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
    430                           "\\u0E41\\u0301abc",        "=", "\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
    431                           "\\u0E41\\u0301\\u0316abc", "=", "\\u0E41\\u0316\\u0301abc",
    432 
    433                           "abc\\u0E41c\\u0301abc",       "=", "abc\\u0E41\\u0107abc", // composition
    434                           "abc\\u0E41\\uD834\\uDC00abc", "<", "abc\\u0E41\\uD834\\uDC01abc", // supplementaries
    435                           "abc\\u0E41\\uD834\\uDD5Fabc", "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
    436                           "abc\\u0E41\\uD87E\\uDC02abc", "=", "abc\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
    437                           "abc\\u0E41\\u0301abc",        "=", "abc\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
    438                           "abc\\u0E41\\u0301\\u0316abc", "=", "abc\\u0E41\\u0316\\u0301abc",
    439                         };
    440 
    441   compareArray(*coll, tests, sizeof(tests)/sizeof(tests[0]));
    442 
    443   const char *rule = "& c < ab";
    444   const char *testcontraction[] = { "\\u0E41ab", ">", "\\u0E41c"}; // After UCA 4.1 Thai are normal so won't break a contraction
    445   UnicodeString rules;
    446   UErrorCode status = U_ZERO_ERROR;
    447   parseChars(rules, rule);
    448   RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
    449   if(U_SUCCESS(status)) {
    450     compareArray(*rcoll, testcontraction, 3);
    451     delete rcoll;
    452   } else {
    453     errln("Couldn't instantiate collator from rules");
    454   }
    455 
    456 }
    457 
    458 
    459 #endif /* #if !UCONFIG_NO_COLLATION */
    460