Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1997-2011, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 #include "unicode/ustring.h"
      8 #include "unicode/uchar.h"
      9 #include "unicode/uniset.h"
     10 #include "unicode/putil.h"
     11 #include "cstring.h"
     12 #include "hash.h"
     13 #include "patternprops.h"
     14 #include "normalizer2impl.h"
     15 #include "uparse.h"
     16 #include "ucdtest.h"
     17 
     18 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0]))
     19 
     20 static const char *ignorePropNames[]={
     21     "FC_NFKC",
     22     "NFD_QC",
     23     "NFC_QC",
     24     "NFKD_QC",
     25     "NFKC_QC",
     26     "Expands_On_NFD",
     27     "Expands_On_NFC",
     28     "Expands_On_NFKD",
     29     "Expands_On_NFKC",
     30     "NFKC_CF"
     31 };
     32 
     33 UnicodeTest::UnicodeTest()
     34 {
     35     UErrorCode errorCode=U_ZERO_ERROR;
     36     unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
     37     if(U_FAILURE(errorCode)) {
     38         delete unknownPropertyNames;
     39         unknownPropertyNames=NULL;
     40     }
     41     // Ignore some property names altogether.
     42     for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) {
     43         unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
     44     }
     45 }
     46 
     47 UnicodeTest::~UnicodeTest()
     48 {
     49     delete unknownPropertyNames;
     50 }
     51 
     52 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     53 {
     54     if(exec) {
     55         logln("TestSuite UnicodeTest: ");
     56     }
     57     TESTCASE_AUTO_BEGIN;
     58     TESTCASE_AUTO(TestAdditionalProperties);
     59     TESTCASE_AUTO(TestBinaryValues);
     60     TESTCASE_AUTO(TestConsistency);
     61     TESTCASE_AUTO(TestPatternProperties);
     62     TESTCASE_AUTO_END;
     63 }
     64 
     65 //====================================================
     66 // private data used by the tests
     67 //====================================================
     68 
     69 // test DerivedCoreProperties.txt -------------------------------------------
     70 
     71 // copied from genprops.c
     72 static int32_t
     73 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
     74     const char *t, *z;
     75     int32_t i, j;
     76 
     77     s=u_skipWhitespace(s);
     78     for(i=0; i<countTokens; ++i) {
     79         t=tokens[i];
     80         if(t!=NULL) {
     81             for(j=0;; ++j) {
     82                 if(t[j]!=0) {
     83                     if(s[j]!=t[j]) {
     84                         break;
     85                     }
     86                 } else {
     87                     z=u_skipWhitespace(s+j);
     88                     if(*z==';' || *z==0) {
     89                         return i;
     90                     } else {
     91                         break;
     92                     }
     93                 }
     94             }
     95         }
     96     }
     97     return -1;
     98 }
     99 
    100 static const char *const
    101 derivedPropsNames[]={
    102     "Math",
    103     "Alphabetic",
    104     "Lowercase",
    105     "Uppercase",
    106     "ID_Start",
    107     "ID_Continue",
    108     "XID_Start",
    109     "XID_Continue",
    110     "Default_Ignorable_Code_Point",
    111     "Full_Composition_Exclusion",
    112     "Grapheme_Extend",
    113     "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
    114     "Grapheme_Base",
    115     "Cased",
    116     "Case_Ignorable",
    117     "Changes_When_Lowercased",
    118     "Changes_When_Uppercased",
    119     "Changes_When_Titlecased",
    120     "Changes_When_Casefolded",
    121     "Changes_When_Casemapped",
    122     "Changes_When_NFKC_Casefolded"
    123 };
    124 
    125 static const UProperty
    126 derivedPropsIndex[]={
    127     UCHAR_MATH,
    128     UCHAR_ALPHABETIC,
    129     UCHAR_LOWERCASE,
    130     UCHAR_UPPERCASE,
    131     UCHAR_ID_START,
    132     UCHAR_ID_CONTINUE,
    133     UCHAR_XID_START,
    134     UCHAR_XID_CONTINUE,
    135     UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
    136     UCHAR_FULL_COMPOSITION_EXCLUSION,
    137     UCHAR_GRAPHEME_EXTEND,
    138     UCHAR_GRAPHEME_LINK,
    139     UCHAR_GRAPHEME_BASE,
    140     UCHAR_CASED,
    141     UCHAR_CASE_IGNORABLE,
    142     UCHAR_CHANGES_WHEN_LOWERCASED,
    143     UCHAR_CHANGES_WHEN_UPPERCASED,
    144     UCHAR_CHANGES_WHEN_TITLECASED,
    145     UCHAR_CHANGES_WHEN_CASEFOLDED,
    146     UCHAR_CHANGES_WHEN_CASEMAPPED,
    147     UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
    148 };
    149 
    150 static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 };
    151 
    152 enum { MAX_ERRORS=50 };
    153 
    154 U_CFUNC void U_CALLCONV
    155 derivedPropsLineFn(void *context,
    156                    char *fields[][2], int32_t /* fieldCount */,
    157                    UErrorCode *pErrorCode)
    158 {
    159     UnicodeTest *me=(UnicodeTest *)context;
    160     uint32_t start, end;
    161     int32_t i;
    162 
    163     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    164     if(U_FAILURE(*pErrorCode)) {
    165         me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
    166         return;
    167     }
    168 
    169     /* parse derived binary property name, ignore unknown names */
    170     i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]);
    171     if(i<0) {
    172         UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
    173         propName.trim();
    174         if(me->unknownPropertyNames->find(propName)==NULL) {
    175             UErrorCode errorCode=U_ZERO_ERROR;
    176             me->unknownPropertyNames->puti(propName, 1, errorCode);
    177             me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
    178         }
    179         return;
    180     }
    181 
    182     me->derivedProps[i].add(start, end);
    183 }
    184 
    185 void UnicodeTest::TestAdditionalProperties() {
    186 #if !UCONFIG_NO_NORMALIZATION
    187     // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
    188     if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) {
    189         errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
    190               LENGTHOF(derivedPropsNames));
    191         return;
    192     }
    193     if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) {
    194         errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n");
    195         return;
    196     }
    197 
    198     char newPath[256];
    199     char backupPath[256];
    200     char *fields[2][2];
    201     UErrorCode errorCode=U_ZERO_ERROR;
    202 
    203     /* Look inside ICU_DATA first */
    204     strcpy(newPath, pathToDataDirectory());
    205     strcat(newPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
    206 
    207     // As a fallback, try to guess where the source data was located
    208     // at the time ICU was built, and look there.
    209 #   ifdef U_TOPSRCDIR
    210         strcpy(backupPath, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
    211 #   else
    212         strcpy(backupPath, loadTestData(errorCode));
    213         strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
    214 #   endif
    215     strcat(backupPath, U_FILE_SEP_STRING);
    216     strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
    217 
    218     char *path=newPath;
    219     u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
    220 
    221     if(errorCode==U_FILE_ACCESS_ERROR) {
    222         errorCode=U_ZERO_ERROR;
    223         path=backupPath;
    224         u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
    225     }
    226     if(U_FAILURE(errorCode)) {
    227         errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
    228         return;
    229     }
    230     char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt");
    231     strcpy(basename, "DerivedNormalizationProps.txt");
    232     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
    233     if(U_FAILURE(errorCode)) {
    234         errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
    235         return;
    236     }
    237 
    238     // now we have all derived core properties in the UnicodeSets
    239     // run them all through the API
    240     int32_t rangeCount, range;
    241     uint32_t i;
    242     UChar32 start, end;
    243 
    244     // test all TRUE properties
    245     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
    246         rangeCount=derivedProps[i].getRangeCount();
    247         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
    248             start=derivedProps[i].getRangeStart(range);
    249             end=derivedProps[i].getRangeEnd(range);
    250             for(; start<=end; ++start) {
    251                 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
    252                     dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]);
    253                     if(++numErrors[i]>=MAX_ERRORS) {
    254                       dataerrln("Too many errors, moving to the next test");
    255                       break;
    256                     }
    257                 }
    258             }
    259         }
    260     }
    261 
    262     // invert all properties
    263     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
    264         derivedProps[i].complement();
    265     }
    266 
    267     // test all FALSE properties
    268     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
    269         rangeCount=derivedProps[i].getRangeCount();
    270         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
    271             start=derivedProps[i].getRangeStart(range);
    272             end=derivedProps[i].getRangeEnd(range);
    273             for(; start<=end; ++start) {
    274                 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
    275                     errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]);
    276                     if(++numErrors[i]>=MAX_ERRORS) {
    277                       errln("Too many errors, moving to the next test");
    278                       break;
    279                     }
    280                 }
    281             }
    282         }
    283     }
    284 #endif /* !UCONFIG_NO_NORMALIZATION */
    285 }
    286 
    287 void UnicodeTest::TestBinaryValues() {
    288     /*
    289      * Unicode 5.1 explicitly defines binary property value aliases.
    290      * Verify that they are all recognized.
    291      */
    292     UErrorCode errorCode=U_ZERO_ERROR;
    293     UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
    294     if(U_FAILURE(errorCode)) {
    295         dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
    296         return;
    297     }
    298 
    299     static const char *const falseValues[]={ "N", "No", "F", "False" };
    300     static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
    301     int32_t i;
    302     for(i=0; i<LENGTHOF(falseValues); ++i) {
    303         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
    304         pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
    305         errorCode=U_ZERO_ERROR;
    306         UnicodeSet set(pattern, errorCode);
    307         if(U_FAILURE(errorCode)) {
    308             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
    309             continue;
    310         }
    311         set.complement();
    312         if(set!=alpha) {
    313             errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
    314         }
    315     }
    316     for(i=0; i<LENGTHOF(trueValues); ++i) {
    317         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
    318         pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
    319         errorCode=U_ZERO_ERROR;
    320         UnicodeSet set(pattern, errorCode);
    321         if(U_FAILURE(errorCode)) {
    322             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
    323             continue;
    324         }
    325         if(set!=alpha) {
    326             errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
    327         }
    328     }
    329 }
    330 
    331 void UnicodeTest::TestConsistency() {
    332 #if !UCONFIG_NO_NORMALIZATION
    333     /*
    334      * Test for an example that getCanonStartSet() delivers
    335      * all characters that compose from the input one,
    336      * even in multiple steps.
    337      * For example, the set for "I" (0049) should contain both
    338      * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
    339      * In general, the set for the middle such character should be a subset
    340      * of the set for the first.
    341      */
    342     IcuTestErrorCode errorCode(*this, "TestConsistency");
    343     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
    344     const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
    345     if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
    346         dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
    347                   errorCode.errorName());
    348         errorCode.reset();
    349         return;
    350     }
    351 
    352     UnicodeSet set1, set2;
    353     if (nfcImpl->getCanonStartSet(0x49, set1)) {
    354         /* enumerate all characters that are plausible to be latin letters */
    355         for(UChar start=0xa0; start<0x2000; ++start) {
    356             UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
    357             if(decomp.length()>1 && decomp[0]==0x49) {
    358                 set2.add(start);
    359             }
    360         }
    361 
    362         if (set1!=set2) {
    363             errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
    364         }
    365         // This was available in cucdtst.c but the test had to move to intltest
    366         // because the new internal normalization functions are in C++.
    367         //compareUSets(set1, set2,
    368         //             "[canon start set of 0049]", "[all c with canon decomp with 0049]",
    369         //             TRUE);
    370     } else {
    371         errln("NFC.getCanonStartSet() returned FALSE");
    372     }
    373 #endif
    374 }
    375 
    376 /**
    377  * Test various implementations of Pattern_Syntax & Pattern_White_Space.
    378  */
    379 void UnicodeTest::TestPatternProperties() {
    380     IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
    381     UnicodeSet syn_pp;
    382     UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
    383     UnicodeSet syn_list(
    384         "[!-/\\:-@\\[-\\^`\\{-~"
    385         "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
    386         "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
    387         "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
    388     UnicodeSet ws_pp;
    389     UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
    390     UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
    391     UnicodeSet syn_ws_pp;
    392     UnicodeSet syn_ws_prop(syn_prop);
    393     syn_ws_prop.addAll(ws_prop);
    394     for(UChar32 c=0; c<=0xffff; ++c) {
    395         if(PatternProps::isSyntax(c)) {
    396             syn_pp.add(c);
    397         }
    398         if(PatternProps::isWhiteSpace(c)) {
    399             ws_pp.add(c);
    400         }
    401         if(PatternProps::isSyntaxOrWhiteSpace(c)) {
    402             syn_ws_pp.add(c);
    403         }
    404     }
    405     compareUSets(syn_pp, syn_prop,
    406                  "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE);
    407     compareUSets(syn_pp, syn_list,
    408                  "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE);
    409     compareUSets(ws_pp, ws_prop,
    410                  "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE);
    411     compareUSets(ws_pp, ws_list,
    412                  "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE);
    413     compareUSets(syn_ws_pp, syn_ws_prop,
    414                  "PatternProps.isSyntaxOrWhiteSpace()",
    415                  "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE);
    416 }
    417 
    418 // So far only minimal port of Java & cucdtst.c compareUSets().
    419 UBool
    420 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
    421                           const char *a_name, const char *b_name,
    422                           UBool diffIsError) {
    423     UBool same= a==b;
    424     if(!same && diffIsError) {
    425         errln("Sets are different: %s vs. %s\n", a_name, b_name);
    426     }
    427     return same;
    428 }
    429