Home | History | Annotate | Download | only in intltest
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   12/09/99    aliu        Ported from Java.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_COLLATION
     16 
     17 #include "thcoll.h"
     18 #include "unicode/utypes.h"
     19 #include "unicode/coll.h"
     20 #include "unicode/localpointer.h"
     21 #include "unicode/sortkey.h"
     22 #include "unicode/tblcoll.h"
     23 #include "unicode/ustring.h"
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "filestrm.h"
     27 #include "textfile.h"
     28 
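/**
 * Name of a dictionary data file for TestDictionary, expected in the
 * directory $ICU/source/test/testdata.  The definition below is commented
 * out; TestDictionary in this file opens "riwords.txt" (read as UTF8)
 * directly instead of using this macro.
 */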
     33 //#define TEST_FILE           "th18057.txt"
     34 
     35 /**
     36  * This is the most failures we show in TestDictionary.  If this number
     37  * is < 0, we show all failures.
     38  */
     39 #define MAX_FAILURES_TO_SHOW -1
     40 
     41 CollationThaiTest::CollationThaiTest() {
     42     UErrorCode status = U_ZERO_ERROR;
     43     coll = Collator::createInstance(Locale("th", "TH", ""), status);
     44     if (coll && U_SUCCESS(status)) {
     45         //coll->setStrength(Collator::TERTIARY);
     46     } else {
     47         delete coll;
     48         coll = 0;
     49     }
     50 }
     51 
     52 CollationThaiTest::~CollationThaiTest() {
     53     delete coll;
     54 }
     55 
     56 void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &name,
     57                                        char* /*par*/) {
     58 
     59     if((!coll) && exec) {
     60       dataerrln(__FILE__ " cannot test - failed to create collator.");
     61       name = "some test";
     62       return;
     63     }
     64 
     65     switch (index) {
     66         TESTCASE(0,TestDictionary);
     67         TESTCASE(1,TestCornerCases);
     68         TESTCASE(2,TestNamesList);
     69         TESTCASE(3,TestInvalidThai);
     70         TESTCASE(4,TestReordering);
     71         default: name = ""; break;
     72     }
     73 }
     74 
     75 /**
     76  * Read the external names list, and confirms that the collator
     77  * gets the same results when comparing lines one to another
     78  * using regular and iterative comparison.
     79  */
     80 void CollationThaiTest::TestNamesList(void) {
     81     if (coll == 0) {
     82         errln("Error: could not construct Thai collator");
     83         return;
     84     }
     85 
     86     UErrorCode ec = U_ZERO_ERROR;
     87     TextFile names("TestNames_Thai.txt", "UTF16LE", ec);
     88     if (U_FAILURE(ec)) {
     89         logln("Can't open TestNames_Thai.txt: %s; skipping test",
     90               u_errorName(ec));
     91         return;
     92     }
     93 
     94     //
     95     // Loop through each word in the dictionary and compare it to the previous
     96     // word.  They should be in sorted order.
     97     //
     98     UnicodeString lastWord, word;
     99     //int32_t failed = 0;
    100     int32_t wordCount = 0;
    101     while (names.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
    102 
    103         // Show the first 8 words being compared, so we can see what's happening
    104         ++wordCount;
    105         if (wordCount <= 8) {
    106             UnicodeString str;
    107             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
    108         }
    109 
    110         if (lastWord.length() > 0) {
    111             Collator::EComparisonResult result = coll->compare(lastWord, word);
    112             doTest(coll, lastWord, word, result);
    113         }
    114         lastWord = word;
    115     }
    116 
    117     assertSuccess("readLine", ec);
    118 
    119     logln((UnicodeString)"Words checked: " + wordCount);
    120 }
    121 
    122 /**
    123  * Read the external dictionary file, which is already in proper
    124  * sorted order, and confirm that the collator compares each line as
    125  * preceding the following line.
    126  */
    127 void CollationThaiTest::TestDictionary(void) {
    128     if (coll == 0) {
    129         errln("Error: could not construct Thai collator");
    130         return;
    131     }
    132 
    133     UErrorCode ec = U_ZERO_ERROR;
    134     TextFile riwords("riwords.txt", "UTF8", ec);
    135     if (U_FAILURE(ec)) {
    136         logln("Can't open riwords.txt: %s; skipping test",
    137               u_errorName(ec));
    138         return;
    139     }
    140 
    141     //
    142     // Loop through each word in the dictionary and compare it to the previous
    143     // word.  They should be in sorted order.
    144     //
    145     UnicodeString lastWord, word;
    146     int32_t failed = 0;
    147     int32_t wordCount = 0;
    148     while (riwords.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
    149 
    150         // Show the first 8 words being compared, so we can see what's happening
    151         ++wordCount;
    152         if (wordCount <= 8) {
    153             UnicodeString str;
    154             logln((UnicodeString)"Word " + wordCount + ": " + IntlTest::prettify(word, str));
    155         }
    156 
    157         if (lastWord.length() > 0) {
    158             int32_t result = coll->compare(lastWord, word);
    159 
    160             if (result > 0) {
    161                 failed++;
    162                 if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
    163                     UnicodeString str;
    164                     UnicodeString msg =
    165                         UnicodeString("--------------------------------------------\n")
    166                         + riwords.getLineNumber()
    167                         + " compare(" + IntlTest::prettify(lastWord, str);
    168                     msg += UnicodeString(", ")
    169                         + IntlTest::prettify(word, str) + ") returned " + result
    170                         + ", expected -1\n";
    171                     UErrorCode status = U_ZERO_ERROR;
    172                     CollationKey k1, k2;
    173                     coll->getCollationKey(lastWord, k1, status);
    174                     coll->getCollationKey(word, k2, status);
    175                     if (U_FAILURE(status)) {
    176                         errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    177                         return;
    178                     }
    179                     msg.append("key1: ").append(prettify(k1, str)).append("\n");
    180                     msg.append("key2: ").append(prettify(k2, str));
    181                     errln(msg);
    182                 }
    183             }
    184         }
    185         lastWord = word;
    186     }
    187 
    188     assertSuccess("readLine", ec);
    189 
    190     if (failed != 0) {
    191         if (failed > MAX_FAILURES_TO_SHOW) {
    192             errln((UnicodeString)"Too many failures; only the first " +
    193                   MAX_FAILURES_TO_SHOW + " failures were shown");
    194         }
    195         errln((UnicodeString)"Summary: " + failed + " of " + (riwords.getLineNumber() - 1) +
    196               " comparisons failed");
    197     }
    198 
    199     logln((UnicodeString)"Words checked: " + wordCount);
    200 }
    201 
    202 /**
    203  * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
    204  * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
    205  */
    206 void CollationThaiTest::TestCornerCases(void) {
    207     const char* TESTS[] = {
    208         // Shorter words precede longer
    209         "\\u0e01",                               "<",    "\\u0e01\\u0e01",
    210 
    211         // Tone marks are considered after letters (i.e. are primary ignorable)
    212         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e49\\u0e32",
    213 
    214         // ditto for other over-marks
    215         "\\u0e01\\u0e32",                        "<",    "\\u0e01\\u0e32\\u0e4c",
    216 
    217         // commonly used mark-in-context order.
    218         // In effect, marks are sorted after each syllable.
    219         "\\u0e01\\u0e32\\u0e01\\u0e49\\u0e32",   "<",    "\\u0e01\\u0e48\\u0e32\\u0e01\\u0e49\\u0e32",
    220 
    221         // Hyphens and other punctuation follow whitespace but come before letters
    222         "\\u0e01\\u0e32",                        "=",    "\\u0e01\\u0e32-",
    223         "\\u0e01\\u0e32-",                       "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
    224 
    225         // Doubler follows an indentical word without the doubler
    226         "\\u0e01\\u0e32",                        "=",    "\\u0e01\\u0e32\\u0e46",
    227         "\\u0e01\\u0e32\\u0e46",                 "<",    "\\u0e01\\u0e32\\u0e01\\u0e32",
    228 
    229 
    230         // \\u0e45 after either \\u0e24 or \\u0e26 is treated as a single
    231         // combining character, similar to "c < ch" in traditional spanish.
    232         // TODO: beef up this case
    233         "\\u0e24\\u0e29\\u0e35",                 "<",    "\\u0e24\\u0e45\\u0e29\\u0e35",
    234         "\\u0e26\\u0e29\\u0e35",                 "<",    "\\u0e26\\u0e45\\u0e29\\u0e35",
    235 
    236         // Vowels reorder, should compare \\u0e2d and \\u0e34
    237         "\\u0e40\\u0e01\\u0e2d",                 "<",    "\\u0e40\\u0e01\\u0e34",
    238 
    239         // Tones are compared after the rest of the word (e.g. primary ignorable)
    240         "\\u0e01\\u0e32\\u0e01\\u0e48\\u0e32",   "<",    "\\u0e01\\u0e49\\u0e32\\u0e01\\u0e32",
    241 
    242         // Periods are ignored entirely
    243         "\\u0e01.\\u0e01.",                      "<",    "\\u0e01\\u0e32",
    244     };
    245     const int32_t TESTS_length = UPRV_LENGTHOF(TESTS);
    246 
    247     if (coll == 0) {
    248         errln("Error: could not construct Thai collator");
    249         return;
    250     }
    251     compareArray(*coll, TESTS, TESTS_length);
    252 }
    253 
    254 //------------------------------------------------------------------------
    255 // Internal utilities
    256 //------------------------------------------------------------------------
    257 
    258 void CollationThaiTest::compareArray(Collator& c, const char* tests[],
    259                                      int32_t testsLength) {
    260     for (int32_t i = 0; i < testsLength; i += 3) {
    261 
    262         Collator::EComparisonResult expect;
    263         if (tests[i+1][0] == '<') {
    264           expect = Collator::LESS;
    265         } else if (tests[i+1][0] == '>') {
    266           expect = Collator::GREATER;
    267         } else if (tests[i+1][0] == '=') {
    268           expect = Collator::EQUAL;
    269         } else {
    270             // expect = Integer.decode(tests[i+1]).intValue();
    271             errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
    272             return;
    273         }
    274 
    275         UnicodeString s1, s2;
    276         parseChars(s1, tests[i]);
    277         parseChars(s2, tests[i+2]);
    278 
    279         doTest(&c, s1, s2, expect);
    280 #if 0
    281         UErrorCode status = U_ZERO_ERROR;
    282         int32_t result = c.compare(s1, s2);
    283         if (sign(result) != sign(expect))
    284         {
    285             UnicodeString t1, t2;
    286             errln(UnicodeString("") +
    287                   i/3 + ": compare(" + IntlTest::prettify(s1, t1)
    288                   + " , " + IntlTest::prettify(s2, t2)
    289                   + ") got " + result + "; expected " + expect);
    290 
    291             CollationKey k1, k2;
    292             c.getCollationKey(s1, k1, status);
    293             c.getCollationKey(s2, k2, status);
    294             if (U_FAILURE(status)) {
    295                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    296                 return;
    297             }
    298             errln((UnicodeString)"  key1: " + prettify(k1, t1) );
    299             errln((UnicodeString)"  key2: " + prettify(k2, t2) );
    300         }
    301         else
    302         {
    303             // Collator.compare worked OK; now try the collation keys
    304             CollationKey k1, k2;
    305             c.getCollationKey(s1, k1, status);
    306             c.getCollationKey(s2, k2, status);
    307             if (U_FAILURE(status)) {
    308                 errln((UnicodeString)"Fail: getCollationKey returned " + u_errorName(status));
    309                 return;
    310             }
    311 
    312             result = k1.compareTo(k2);
    313             if (sign(result) != sign(expect)) {
    314                 UnicodeString t1, t2;
    315                 errln(UnicodeString("") +
    316                       i/3 + ": key(" + IntlTest::prettify(s1, t1)
    317                       + ").compareTo(key(" + IntlTest::prettify(s2, t2)
    318                       + ")) got " + result + "; expected " + expect);
    319 
    320                 errln((UnicodeString)"  " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
    321             }
    322         }
    323 #endif
    324     }
    325 }
    326 
    327 int8_t CollationThaiTest::sign(int32_t i) {
    328     if (i < 0) return -1;
    329     if (i > 0) return 1;
    330     return 0;
    331 }
    332 
    333 /**
    334  * Set a UnicodeString corresponding to the given string.  Use
    335  * UnicodeString and the default converter, unless we see the sequence
    336  * "\\u", in which case we interpret the subsequent escape.
    337  */
    338 UnicodeString& CollationThaiTest::parseChars(UnicodeString& result,
    339                                              const char* chars) {
    340     return result = CharsToUnicodeString(chars);
    341 }
    342 
    343 UCollator *thaiColl = NULL;
    344 
    345 U_CDECL_BEGIN
    346 static int U_CALLCONV
    347 StrCmp(const void *p1, const void *p2) {
    348   return ucol_strcoll(thaiColl, *(UChar **) p1, -1,  *(UChar **)p2, -1);
    349 }
    350 U_CDECL_END
    351 
    352 
    353 #define LINES 6
    354 
    355 void CollationThaiTest::TestInvalidThai(void) {
    356   const char *tests[LINES] = {
    357     "\\u0E44\\u0E01\\u0E44\\u0E01",
    358     "\\u0E44\\u0E01\\u0E01\\u0E44",
    359     "\\u0E01\\u0E44\\u0E01\\u0E44",
    360     "\\u0E01\\u0E01\\u0E44\\u0E44",
    361     "\\u0E44\\u0E44\\u0E01\\u0E01",
    362     "\\u0E01\\u0E44\\u0E44\\u0E01",
    363   };
    364 
    365   UChar strings[LINES][20];
    366 
    367   UChar *toSort[LINES];
    368 
    369   int32_t i = 0, j = 0, len = 0;
    370 
    371   UErrorCode coll_status = U_ZERO_ERROR;
    372   UnicodeString iteratorText;
    373 
    374   thaiColl = ucol_open ("th_TH", &coll_status);
    375   if (U_FAILURE(coll_status)) {
    376     errln("Error opening Thai collator: %s", u_errorName(coll_status));
    377     return;
    378   }
    379 
    380   CollationElementIterator* c = ((RuleBasedCollator *)coll)->createCollationElementIterator( iteratorText );
    381 
    382   for(i = 0; i < UPRV_LENGTHOF(tests); i++) {
    383     len = u_unescape(tests[i], strings[i], 20);
    384     strings[i][len] = 0;
    385     toSort[i] = strings[i];
    386   }
    387 
    388   qsort (toSort, LINES, sizeof (UChar *), StrCmp);
    389 
    390   for (i=0; i < LINES; i++)
    391   {
    392     logln("%i", i);
    393       for (j=i+1; j < LINES; j++) {
    394           if (ucol_strcoll (thaiColl, toSort[i], -1, toSort[j], -1) == UCOL_GREATER)
    395           {
    396               // inconsistency ordering found!
    397             errln("Inconsistent ordering between strings %i and %i", i, j);
    398           }
    399       }
    400       iteratorText.setTo(toSort[i]);
    401       c->setText(iteratorText, coll_status);
    402       backAndForth(*c);
    403   }
    404 
    405 
    406   ucol_close(thaiColl);
    407   delete c;
    408 }
    409 
    410 void CollationThaiTest::TestReordering(void) {
    411   // Until UCA 4.1, the collation code swapped Thai/Lao prevowels with the following consonants,
    412   // resulting in consonant+prevowel == prevowel+consonant.
    413   // From UCA 5.0 on, there are order-reversing contractions for prevowel+consonant.
    414   // From UCA 5.0 until UCA 6.1, there was a tertiary difference between
    415   // consonant+prevowel and prevowel+consonant.
    416   // In UCA 6.2, they compare equal again.
    417   // The test was modified to using a collator with strength=secondary,
    418   // ignoring possible tertiary differences.
    419   const char *tests[] = {
    420     "\\u0E41c\\u0301",       "=", "\\u0E41\\u0107", // composition
    421     "\\u0E41\\U0001D7CE",    "<", "\\u0E41\\U0001D7CF", // supplementaries
    422     "\\u0E41\\U0001D15F",    "=", "\\u0E41\\U0001D158\\U0001D165", // supplementary composition decomps to supplementary
    423     "\\u0E41\\U0002F802",    "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
    424     "\\u0E41\\u0301",        "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
    425     "\\u0E41\\u0301\\u0316", "=", "\\u0E41\\u0316\\u0301",
    426 
    427     "\\u0e24\\u0e41",        "=", "\\u0e41\\u0e24", // exiting contraction bug
    428     "\\u0e3f\\u0e3f\\u0e24\\u0e41", "=", "\\u0e3f\\u0e3f\\u0e41\\u0e24",
    429 
    430     "abc\\u0E41c\\u0301",       "=", "abc\\u0E41\\u0107", // composition
    431     "abc\\u0E41\\U0001D000",    "<", "abc\\u0E41\\U0001D001", // supplementaries
    432     "abc\\u0E41\\U0001D15F",    "=", "abc\\u0E41\\U0001D158\\U0001D165", // supplementary composition decomps to supplementary
    433     "abc\\u0E41\\U0002F802",    "=", "abc\\u0E41\\u4E41", // supplementary composition decomps to BMP
    434     "abc\\u0E41\\u0301",        "=", "abc\\u0E41\\u0301", // unsafe (just checking backwards iteration)
    435     "abc\\u0E41\\u0301\\u0316", "=", "abc\\u0E41\\u0316\\u0301",
    436 
    437     "\\u0E41c\\u0301abc",       "=", "\\u0E41\\u0107abc", // composition
    438     "\\u0E41\\U0001D000abc",    "<", "\\u0E41\\U0001D001abc", // supplementaries
    439     "\\u0E41\\U0001D15Fabc",    "=", "\\u0E41\\U0001D158\\U0001D165abc", // supplementary composition decomps to supplementary
    440     "\\u0E41\\U0002F802abc",    "=", "\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
    441     "\\u0E41\\u0301abc",        "=", "\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
    442     "\\u0E41\\u0301\\u0316abc", "=", "\\u0E41\\u0316\\u0301abc",
    443 
    444     "abc\\u0E41c\\u0301abc",       "=", "abc\\u0E41\\u0107abc", // composition
    445     "abc\\u0E41\\U0001D000abc",    "<", "abc\\u0E41\\U0001D001abc", // supplementaries
    446     "abc\\u0E41\\U0001D15Fabc",    "=", "abc\\u0E41\\U0001D158\\U0001D165abc", // supplementary composition decomps to supplementary
    447     "abc\\u0E41\\U0002F802abc",    "=", "abc\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
    448     "abc\\u0E41\\u0301abc",        "=", "abc\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
    449     "abc\\u0E41\\u0301\\u0316abc", "=", "abc\\u0E41\\u0316\\u0301abc",
    450   };
    451 
    452   LocalPointer<Collator> coll2(coll->clone());
    453   UErrorCode status = U_ZERO_ERROR;
    454   coll2->setAttribute(UCOL_STRENGTH, UCOL_SECONDARY, status);
    455   if(U_FAILURE(status)) {
    456     errln("Unable to set the Thai collator clone to secondary strength");
    457     return;
    458   }
    459   compareArray(*coll2, tests, UPRV_LENGTHOF(tests));
    460 
    461   const char *rule = "& c < ab";
    462   const char *testcontraction[] = { "\\u0E41ab", ">", "\\u0E41c"}; // After UCA 4.1 Thai are normal so won't break a contraction
    463   UnicodeString rules;
    464   parseChars(rules, rule);
    465   LocalPointer<RuleBasedCollator> rcoll(new RuleBasedCollator(rules, status), status);
    466   if(U_SUCCESS(status)) {
    467     compareArray(*rcoll, testcontraction, 3);
    468   } else {
    469     errln("Couldn't instantiate collator from rules");
    470   }
    471 
    472 }
    473 
    474 
    475 #endif /* #if !UCONFIG_NO_COLLATION */
    476