Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1997-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 #include "unicode/ustring.h"
      8 #include "unicode/uchar.h"
      9 #include "unicode/uniset.h"
     10 #include "unicode/putil.h"
     11 #include "cstring.h"
     12 #include "hash.h"
     13 #include "normalizer2impl.h"
     14 #include "uparse.h"
     15 #include "ucdtest.h"
     16 
     17 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0]))
     18 
     19 static const char *ignorePropNames[]={
     20     "FC_NFKC",
     21     "NFD_QC",
     22     "NFC_QC",
     23     "NFKD_QC",
     24     "NFKC_QC",
     25     "Expands_On_NFD",
     26     "Expands_On_NFC",
     27     "Expands_On_NFKD",
     28     "Expands_On_NFKC",
     29     "NFKC_CF"
     30 };
     31 
     32 UnicodeTest::UnicodeTest()
     33 {
     34     UErrorCode errorCode=U_ZERO_ERROR;
     35     unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
     36     if(U_FAILURE(errorCode)) {
     37         delete unknownPropertyNames;
     38         unknownPropertyNames=NULL;
     39     }
     40     // Ignore some property names altogether.
     41     for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) {
     42         unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
     43     }
     44 }
     45 
     46 UnicodeTest::~UnicodeTest()
     47 {
     48     delete unknownPropertyNames;
     49 }
     50 
     51 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     52 {
     53     if (exec) logln("TestSuite UnicodeTest: ");
     54     switch (index) {
     55         case 0: name = "TestAdditionalProperties"; if(exec) TestAdditionalProperties(); break;
     56         case 1: name = "TestBinaryValues"; if(exec) TestBinaryValues(); break;
     57         case 2: name = "TestConsistency"; if(exec) TestConsistency(); break;
     58         default: name = ""; break; //needed to end loop
     59     }
     60 }
     61 
     62 //====================================================
     63 // private data used by the tests
     64 //====================================================
     65 
     66 // test DerivedCoreProperties.txt -------------------------------------------
     67 
     68 // copied from genprops.c
     69 static int32_t
     70 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
     71     const char *t, *z;
     72     int32_t i, j;
     73 
     74     s=u_skipWhitespace(s);
     75     for(i=0; i<countTokens; ++i) {
     76         t=tokens[i];
     77         if(t!=NULL) {
     78             for(j=0;; ++j) {
     79                 if(t[j]!=0) {
     80                     if(s[j]!=t[j]) {
     81                         break;
     82                     }
     83                 } else {
     84                     z=u_skipWhitespace(s+j);
     85                     if(*z==';' || *z==0) {
     86                         return i;
     87                     } else {
     88                         break;
     89                     }
     90                 }
     91             }
     92         }
     93     }
     94     return -1;
     95 }
     96 
     97 static const char *const
     98 derivedPropsNames[]={
     99     "Math",
    100     "Alphabetic",
    101     "Lowercase",
    102     "Uppercase",
    103     "ID_Start",
    104     "ID_Continue",
    105     "XID_Start",
    106     "XID_Continue",
    107     "Default_Ignorable_Code_Point",
    108     "Full_Composition_Exclusion",
    109     "Grapheme_Extend",
    110     "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
    111     "Grapheme_Base",
    112     "Cased",
    113     "Case_Ignorable",
    114     "Changes_When_Lowercased",
    115     "Changes_When_Uppercased",
    116     "Changes_When_Titlecased",
    117     "Changes_When_Casefolded",
    118     "Changes_When_Casemapped",
    119     "Changes_When_NFKC_Casefolded"
    120 };
    121 
    122 static const UProperty
    123 derivedPropsIndex[]={
    124     UCHAR_MATH,
    125     UCHAR_ALPHABETIC,
    126     UCHAR_LOWERCASE,
    127     UCHAR_UPPERCASE,
    128     UCHAR_ID_START,
    129     UCHAR_ID_CONTINUE,
    130     UCHAR_XID_START,
    131     UCHAR_XID_CONTINUE,
    132     UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
    133     UCHAR_FULL_COMPOSITION_EXCLUSION,
    134     UCHAR_GRAPHEME_EXTEND,
    135     UCHAR_GRAPHEME_LINK,
    136     UCHAR_GRAPHEME_BASE,
    137     UCHAR_CASED,
    138     UCHAR_CASE_IGNORABLE,
    139     UCHAR_CHANGES_WHEN_LOWERCASED,
    140     UCHAR_CHANGES_WHEN_UPPERCASED,
    141     UCHAR_CHANGES_WHEN_TITLECASED,
    142     UCHAR_CHANGES_WHEN_CASEFOLDED,
    143     UCHAR_CHANGES_WHEN_CASEMAPPED,
    144     UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
    145 };
    146 
    147 static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 };
    148 
    149 enum { MAX_ERRORS=50 };
    150 
    151 U_CFUNC void U_CALLCONV
    152 derivedPropsLineFn(void *context,
    153                    char *fields[][2], int32_t /* fieldCount */,
    154                    UErrorCode *pErrorCode)
    155 {
    156     UnicodeTest *me=(UnicodeTest *)context;
    157     uint32_t start, end;
    158     int32_t i;
    159 
    160     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    161     if(U_FAILURE(*pErrorCode)) {
    162         me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
    163         return;
    164     }
    165 
    166     /* parse derived binary property name, ignore unknown names */
    167     i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]);
    168     if(i<0) {
    169         UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
    170         propName.trim();
    171         if(me->unknownPropertyNames->find(propName)==NULL) {
    172             UErrorCode errorCode=U_ZERO_ERROR;
    173             me->unknownPropertyNames->puti(propName, 1, errorCode);
    174             me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
    175         }
    176         return;
    177     }
    178 
    179     me->derivedProps[i].add(start, end);
    180 }
    181 
    182 void UnicodeTest::TestAdditionalProperties() {
    183 #if !UCONFIG_NO_NORMALIZATION
    184     // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
    185     if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) {
    186         errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
    187               LENGTHOF(derivedPropsNames));
    188         return;
    189     }
    190     if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) {
    191         errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n");
    192         return;
    193     }
    194 
    195     char newPath[256];
    196     char backupPath[256];
    197     char *fields[2][2];
    198     UErrorCode errorCode=U_ZERO_ERROR;
    199 
    200     /* Look inside ICU_DATA first */
    201     strcpy(newPath, pathToDataDirectory());
    202     strcat(newPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
    203 
    204     // As a fallback, try to guess where the source data was located
    205     // at the time ICU was built, and look there.
    206 #   ifdef U_TOPSRCDIR
    207         strcpy(backupPath, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
    208 #   else
    209         strcpy(backupPath, loadTestData(errorCode));
    210         strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
    211 #   endif
    212     strcat(backupPath, U_FILE_SEP_STRING);
    213     strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
    214 
    215     char *path=newPath;
    216     u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
    217 
    218     if(errorCode==U_FILE_ACCESS_ERROR) {
    219         errorCode=U_ZERO_ERROR;
    220         path=backupPath;
    221         u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
    222     }
    223     if(U_FAILURE(errorCode)) {
    224         errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
    225         return;
    226     }
    227     char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt");
    228     strcpy(basename, "DerivedNormalizationProps.txt");
    229     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
    230     if(U_FAILURE(errorCode)) {
    231         errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
    232         return;
    233     }
    234 
    235     // now we have all derived core properties in the UnicodeSets
    236     // run them all through the API
    237     int32_t rangeCount, range;
    238     uint32_t i;
    239     UChar32 start, end;
    240 
    241     // test all TRUE properties
    242     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
    243         rangeCount=derivedProps[i].getRangeCount();
    244         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
    245             start=derivedProps[i].getRangeStart(range);
    246             end=derivedProps[i].getRangeEnd(range);
    247             for(; start<=end; ++start) {
    248                 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
    249                     dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]);
    250                     if(++numErrors[i]>=MAX_ERRORS) {
    251                       dataerrln("Too many errors, moving to the next test");
    252                       break;
    253                     }
    254                 }
    255             }
    256         }
    257     }
    258 
    259     // invert all properties
    260     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
    261         derivedProps[i].complement();
    262     }
    263 
    264     // test all FALSE properties
    265     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
    266         rangeCount=derivedProps[i].getRangeCount();
    267         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
    268             start=derivedProps[i].getRangeStart(range);
    269             end=derivedProps[i].getRangeEnd(range);
    270             for(; start<=end; ++start) {
    271                 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
    272                     errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]);
    273                     if(++numErrors[i]>=MAX_ERRORS) {
    274                       errln("Too many errors, moving to the next test");
    275                       break;
    276                     }
    277                 }
    278             }
    279         }
    280     }
    281 #endif /* !UCONFIG_NO_NORMALIZATION */
    282 }
    283 
    284 void UnicodeTest::TestBinaryValues() {
    285     /*
    286      * Unicode 5.1 explicitly defines binary property value aliases.
    287      * Verify that they are all recognized.
    288      */
    289     UErrorCode errorCode=U_ZERO_ERROR;
    290     UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
    291     if(U_FAILURE(errorCode)) {
    292         dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
    293         return;
    294     }
    295 
    296     static const char *const falseValues[]={ "N", "No", "F", "False" };
    297     static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
    298     int32_t i;
    299     for(i=0; i<LENGTHOF(falseValues); ++i) {
    300         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
    301         pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
    302         errorCode=U_ZERO_ERROR;
    303         UnicodeSet set(pattern, errorCode);
    304         if(U_FAILURE(errorCode)) {
    305             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
    306             continue;
    307         }
    308         set.complement();
    309         if(set!=alpha) {
    310             errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
    311         }
    312     }
    313     for(i=0; i<LENGTHOF(trueValues); ++i) {
    314         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
    315         pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
    316         errorCode=U_ZERO_ERROR;
    317         UnicodeSet set(pattern, errorCode);
    318         if(U_FAILURE(errorCode)) {
    319             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
    320             continue;
    321         }
    322         if(set!=alpha) {
    323             errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
    324         }
    325     }
    326 }
    327 
    328 void UnicodeTest::TestConsistency() {
    329 #if !UCONFIG_NO_NORMALIZATION
    330     /*
    331      * Test for an example that getCanonStartSet() delivers
    332      * all characters that compose from the input one,
    333      * even in multiple steps.
    334      * For example, the set for "I" (0049) should contain both
    335      * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
    336      * In general, the set for the middle such character should be a subset
    337      * of the set for the first.
    338      */
    339     IcuTestErrorCode errorCode(*this, "TestConsistency");
    340     const Normalizer2 *nfd=Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
    341     const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
    342     if(errorCode.isFailure()) {
    343         dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
    344                   errorCode.errorName());
    345         errorCode.reset();
    346         return;
    347     }
    348 
    349     UnicodeSet set1, set2;
    350     if (nfcImpl->getCanonStartSet(0x49, set1)) {
    351         /* enumerate all characters that are plausible to be latin letters */
    352         for(UChar start=0xa0; start<0x2000; ++start) {
    353             UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
    354             if(decomp.length()>1 && decomp[0]==0x49) {
    355                 set2.add(start);
    356             }
    357         }
    358 
    359         if (set1!=set2) {
    360             errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
    361         }
    362         // This was available in cucdtst.c but the test had to move to intltest
    363         // because the new internal normalization functions are in C++.
    364         //compareUSets(set1, set2,
    365         //             "[canon start set of 0049]", "[all c with canon decomp with 0049]",
    366         //             TRUE);
    367     } else {
    368         errln("NFC.getCanonStartSet() returned FALSE");
    369     }
    370 #endif
    371 }
    372