Home | History | Annotate | Download | only in gencase
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gencase.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug28
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
     18 *   It then writes a binary file containing the properties
     19 *   that is designed to be used directly for random-access to
     20 *   the properties of each Unicode character.
     21 */
     22 
     23 #include <stdio.h>
     24 #include "unicode/utypes.h"
     25 #include "unicode/uchar.h"
     26 #include "unicode/uset.h"
     27 #include "unicode/putil.h"
     28 #include "unicode/uclean.h"
     29 #include "cmemory.h"
     30 #include "cstring.h"
     31 #include "uarrsort.h"
     32 #include "unewdata.h"
     33 #include "uoptions.h"
     34 #include "uparse.h"
     35 #include "uprops.h"
     36 #include "propsvec.h"
     37 #include "gencase.h"
     38 
/* Number of elements of a true array; must not be applied to a pointer. */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* data --------------------------------------------------------------------- */

/*
 * Properties vectors object. main() creates it with upvec_open(2, ...) and
 * binariesLineFn() stores per-range values into it with upvec_setValue().
 */
UPropsVectors *pv;

/*
 * Tool flags. main() overwrites both from the --verbose and --copyright
 * command line options after argument parsing.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/*
 * Unicode set collecting the case-sensitive characters;
 * see uchar.h UCHAR_CASE_SENSITIVE.
 * Add code points from case mappings/foldings in
 * the root locale and with default options.
 */
static USet *caseSensitive;
     54 
     55 /* prototypes --------------------------------------------------------------- */
     56 
/* Reads SpecialCasing.txt into specialCasings[]; does nothing if *pErrorCode already indicates a failure. */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);

/* Reads CaseFolding.txt into caseFoldings[]; does nothing if *pErrorCode already indicates a failure. */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);

/* Reads UnicodeData.txt and calls setProps() once per line; does nothing if *pErrorCode already indicates a failure. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
     65 
     66 /* parse files with multiple binary properties ------------------------------ */
     67 
     68 /* TODO: more common code, move functions to uparse.h|c */
     69 
     70 /* TODO: similar to genprops/props2.c but not the same */
     71 
     72 struct Binary {
     73     const char *propName;
     74     int32_t vecWord;
     75     uint32_t vecValue, vecMask;
     76 };
     77 typedef struct Binary Binary;
     78 
     79 struct Binaries {
     80     const char *ucdFile;
     81     const Binary *binaries;
     82     int32_t binariesCount;
     83 };
     84 typedef struct Binaries Binaries;
     85 
/*
 * Names recognized in PropList.txt:
 * Soft_Dotted stores UCASE_SOFT_DOTTED under UCASE_DOT_MASK in vector word 0.
 */
static const Binary
propListNames[]={
    { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
};

/* File descriptor handed to parseBinariesFile() for PropList.txt. */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};
     95 
/*
 * Names recognized in DerivedCoreProperties.txt.
 * Lowercase and Uppercase store the case type in vector word 0 under
 * UCASE_TYPE_MASK; Case_Ignorable sets the single bit
 * UGENCASE_IS_MID_LETTER_SHIFT in vector word 1.
 */
static const Binary
derCorePropsNames[]={
    { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
    { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK },
    /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */
    { "Case_Ignorable",                     1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/* File descriptor handed to parseBinariesFile() for DerivedCoreProperties.txt. */
static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
    108 
/*
 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
 * We need not distinguish between them because both add to case-ignorable.
 * We ignore all other Word_Break values.
 *
 * Both entries set the same bit (UGENCASE_IS_MID_LETTER_SHIFT) in vector word 1.
 */
static const Binary
wordBreakNames[]={
    { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
    { "MidNumLet",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/* File descriptor handed to parseBinariesFile() for WordBreakProperty.txt. */
static const Binaries
wordBreakBinaries={
    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
};
    124 
    125 static void U_CALLCONV
    126 binariesLineFn(void *context,
    127                char *fields[][2], int32_t fieldCount,
    128                UErrorCode *pErrorCode) {
    129     const Binaries *bin;
    130     char *s;
    131     uint32_t start, end;
    132     int32_t i;
    133 
    134     bin=(const Binaries *)context;
    135 
    136     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    137     if(U_FAILURE(*pErrorCode)) {
    138         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
    139         exit(*pErrorCode);
    140     }
    141 
    142     /* parse binary property name */
    143     s=(char *)u_skipWhitespace(fields[1][0]);
    144     for(i=0;; ++i) {
    145         if(i==bin->binariesCount) {
    146             /* ignore unrecognized properties */
    147             return;
    148         }
    149         if(isToken(bin->binaries[i].propName, s)) {
    150             break;
    151         }
    152     }
    153 
    154     if(bin->binaries[i].vecMask==0) {
    155         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
    156                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
    157         exit(U_INTERNAL_PROGRAM_ERROR);
    158     }
    159 
    160     upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
    161     if(U_FAILURE(*pErrorCode)) {
    162         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
    163                         bin->binaries[i].propName, u_errorName(*pErrorCode));
    164         exit(*pErrorCode);
    165     }
    166 }
    167 
    168 static void
    169 parseBinariesFile(char *filename, char *basename, const char *suffix,
    170                   const Binaries *bin,
    171                   UErrorCode *pErrorCode) {
    172     char *fields[2][2];
    173 
    174     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    175         return;
    176     }
    177 
    178     writeUCDFilename(basename, bin->ucdFile, suffix);
    179 
    180     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
    181     if(U_FAILURE(*pErrorCode)) {
    182         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
    183     }
    184 }
    185 
    186 /* -------------------------------------------------------------------------- */
    187 
/*
 * Indexes into options[] below; main() uses them to read the parsed
 * command line options. The order must match the options[] initializer.
 */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};

/* Keep these values in sync with the above enums */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),     /* UNICODE_VERSION */
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)            /* CSOURCE */
};
    213 
    214 extern int
    215 main(int argc, char* argv[]) {
    216     char filename[300];
    217     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    218     char *basename=NULL;
    219     UErrorCode errorCode=U_ZERO_ERROR;
    220 
    221     U_MAIN_INIT_ARGS(argc, argv);
    222 
    223     /* preset then read command line options */
    224     options[DESTDIR].value=u_getDataDirectory();
    225     options[SOURCEDIR].value="";
    226     options[UNICODE_VERSION].value="";
    227     options[ICUDATADIR].value=u_getDataDirectory();
    228     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    229 
    230     /* error handling, printing usage message */
    231     if(argc<0) {
    232         fprintf(stderr,
    233             "error in command line argument \"%s\"\n",
    234             argv[-argc]);
    235     }
    236     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
    237         /*
    238          * Broken into chunks because the C89 standard says the minimum
    239          * required supported string length is 509 bytes.
    240          */
    241         fprintf(stderr,
    242             "Usage: %s [-options] [suffix]\n"
    243             "\n"
    244             "read the UnicodeData.txt file and other Unicode properties files and\n"
    245             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
    246             "\n",
    247             argv[0]);
    248         fprintf(stderr,
    249             "Options:\n"
    250             "\t-h or -? or --help  this usage text\n"
    251             "\t-v or --verbose     verbose output\n"
    252             "\t-c or --copyright   include a copyright notice\n"
    253             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
    254             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
    255         fprintf(stderr,
    256             "\t-d or --destdir     destination directory, followed by the path\n"
    257             "\t-s or --sourcedir   source directory, followed by the path\n"
    258             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    259             "\t                    followed by path, defaults to %s\n"
    260             "\tsuffix              suffix that is to be appended with a '-'\n"
    261             "\t                    to the source file basenames before opening;\n"
    262             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
    263             u_getDataDirectory());
    264         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    265     }
    266 
    267     /* get the options values */
    268     beVerbose=options[VERBOSE].doesOccur;
    269     haveCopyright=options[COPYRIGHT].doesOccur;
    270     srcDir=options[SOURCEDIR].value;
    271     destDir=options[DESTDIR].value;
    272 
    273     if(argc>=2) {
    274         suffix=argv[1];
    275     } else {
    276         suffix=NULL;
    277     }
    278 
    279     if(options[UNICODE_VERSION].doesOccur) {
    280         setUnicodeVersion(options[UNICODE_VERSION].value);
    281     }
    282     /* else use the default dataVersion in store.c */
    283 
    284     if (options[ICUDATADIR].doesOccur) {
    285         u_setDataDirectory(options[ICUDATADIR].value);
    286     }
    287 
    288     /* prepare the filename beginning with the source dir */
    289     uprv_strcpy(filename, srcDir);
    290     basename=filename+uprv_strlen(filename);
    291     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
    292         *basename++=U_FILE_SEP_CHAR;
    293     }
    294 
    295     /* initialize */
    296     pv=upvec_open(2, &errorCode);
    297     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
    298 
    299     /* process SpecialCasing.txt */
    300     writeUCDFilename(basename, "SpecialCasing", suffix);
    301     parseSpecialCasing(filename, &errorCode);
    302 
    303     /* process CaseFolding.txt */
    304     writeUCDFilename(basename, "CaseFolding", suffix);
    305     parseCaseFolding(filename, &errorCode);
    306 
    307     /* process additional properties files */
    308     *basename=0;
    309 
    310     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
    311 
    312     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
    313 
    314     if(ucdVersion>=UNI_4_1) {
    315         parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
    316     }
    317 
    318     /* process UnicodeData.txt */
    319     writeUCDFilename(basename, "UnicodeData", suffix);
    320     parseDB(filename, &errorCode);
    321 
    322     /* process parsed data */
    323     makeCaseClosure();
    324 
    325     makeExceptions();
    326 
    327     if(U_SUCCESS(errorCode)) {
    328         /* write the properties data file */
    329         generateData(destDir, options[CSOURCE].doesOccur);
    330     }
    331 
    332     u_cleanup();
    333     return errorCode;
    334 }
    335 
    336 U_CFUNC void
    337 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    338     int32_t length=(int32_t)uprv_strlen(filename);
    339     uprv_strcpy(basename, filename);
    340     if(suffix!=NULL) {
    341         basename[length++]='-';
    342         uprv_strcpy(basename+length, suffix);
    343         length+=(int32_t)uprv_strlen(suffix);
    344     }
    345     uprv_strcpy(basename+length, ".txt");
    346 }
    347 
    348 /* TODO: move to toolutil */
    349 U_CFUNC UBool
    350 isToken(const char *token, const char *s) {
    351     const char *z;
    352     int32_t j;
    353 
    354     s=u_skipWhitespace(s);
    355     for(j=0;; ++j) {
    356         if(token[j]!=0) {
    357             if(s[j]!=token[j]) {
    358                 break;
    359             }
    360         } else {
    361             z=u_skipWhitespace(s+j);
    362             if(*z==';' || *z==0) {
    363                 return TRUE;
    364             } else {
    365                 break;
    366             }
    367         }
    368     }
    369 
    370     return FALSE;
    371 }
    372 
    373 static int32_t
    374 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    375     const char *t, *z;
    376     int32_t i, j;
    377 
    378     s=u_skipWhitespace(s);
    379     for(i=0; i<countTokens; ++i) {
    380         t=tokens[i];
    381         if(t!=NULL) {
    382             for(j=0;; ++j) {
    383                 if(t[j]!=0) {
    384                     if(s[j]!=t[j]) {
    385                         break;
    386                     }
    387                 } else {
    388                     z=u_skipWhitespace(s+j);
    389                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
    390                         return i;
    391                     } else {
    392                         break;
    393                     }
    394                 }
    395             }
    396         }
    397     }
    398     return -1;
    399 }
    400 
    401 static void
    402 _set_addAll(USet *set, const UChar *s, int32_t length) {
    403     UChar32 c;
    404     int32_t i;
    405 
    406     /* needs length>=0 */
    407     for(i=0; i<length; /* U16_NEXT advances i */) {
    408         U16_NEXT(s, i, length, c);
    409         uset_add(set, c);
    410     }
    411 }
    412 
    413 /* parser for SpecialCasing.txt --------------------------------------------- */
    414 
/*
 * Capacity of specialCasings[]. specialCasingLineFn() terminates the
 * program as soon as the number of stored entries reaches this value.
 */
#define MAX_SPECIAL_CASING_COUNT 500

/* Entries read from SpecialCasing.txt; the first specialCasingCount are in use. */
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
static int32_t specialCasingCount=0;
    419 
    420 static void U_CALLCONV
    421 specialCasingLineFn(void *context,
    422                     char *fields[][2], int32_t fieldCount,
    423                     UErrorCode *pErrorCode) {
    424     char *end;
    425 
    426     /* get code point */
    427     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    428     end=(char *)u_skipWhitespace(end);
    429     if(end<=fields[0][0] || end!=fields[0][1]) {
    430         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
    431         *pErrorCode=U_PARSE_ERROR;
    432         exit(U_PARSE_ERROR);
    433     }
    434 
    435     /* is this a complex mapping? */
    436     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
    437         /* there is some condition text in the fifth field */
    438         specialCasings[specialCasingCount].isComplex=TRUE;
    439 
    440         /* do not store any actual mappings for this */
    441         specialCasings[specialCasingCount].lowerCase[0]=0;
    442         specialCasings[specialCasingCount].upperCase[0]=0;
    443         specialCasings[specialCasingCount].titleCase[0]=0;
    444     } else {
    445         /* just set the "complex" flag and get the case mappings */
    446         specialCasings[specialCasingCount].isComplex=FALSE;
    447         specialCasings[specialCasingCount].lowerCase[0]=
    448             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
    449         specialCasings[specialCasingCount].upperCase[0]=
    450             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
    451         specialCasings[specialCasingCount].titleCase[0]=
    452             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
    453         if(U_FAILURE(*pErrorCode)) {
    454             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
    455             exit(*pErrorCode);
    456         }
    457 
    458         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
    459         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
    460         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
    461         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
    462     }
    463 
    464     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
    465         fprintf(stderr, "gencase: too many special casing mappings\n");
    466         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    467         exit(U_INDEX_OUTOFBOUNDS_ERROR);
    468     }
    469 }
    470 
    471 static int32_t U_CALLCONV
    472 compareSpecialCasings(const void *context, const void *left, const void *right) {
    473     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
    474 }
    475 
    476 static void
    477 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
    478     char *fields[5][2];
    479     int32_t i, j;
    480 
    481     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    482         return;
    483     }
    484 
    485     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
    486 
    487     /* sort the special casing entries by code point */
    488     if(specialCasingCount>0) {
    489         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
    490                        compareSpecialCasings, NULL, FALSE, pErrorCode);
    491     }
    492     if(U_FAILURE(*pErrorCode)) {
    493         return;
    494     }
    495 
    496     /* replace multiple entries for any code point by one "complex" one */
    497     j=0;
    498     for(i=1; i<specialCasingCount; ++i) {
    499         if(specialCasings[i-1].code==specialCasings[i].code) {
    500             /* there is a duplicate code point */
    501             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
    502             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
    503             specialCasings[i].lowerCase[0]=0;
    504             specialCasings[i].upperCase[0]=0;
    505             specialCasings[i].titleCase[0]=0;
    506             ++j;
    507         }
    508     }
    509 
    510     /* if some entries just were removed, then re-sort */
    511     if(j>0) {
    512         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
    513                        compareSpecialCasings, NULL, FALSE, pErrorCode);
    514         specialCasingCount-=j;
    515     }
    516     if(U_FAILURE(*pErrorCode)) {
    517         return;
    518     }
    519 
    520     /*
    521      * Add one complex mapping to caseSensitive that was filtered out above:
    522      * Greek final Sigma has a conditional mapping but not locale-sensitive,
    523      * and it is taken when lowercasing just U+03A3 alone.
    524      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    525      */
    526     uset_add(caseSensitive, 0x3c2);
    527 }
    528 
    529 /* parser for CaseFolding.txt ----------------------------------------------- */
    530 
/*
 * Capacity of caseFoldings[]. caseFoldingLineFn() terminates the
 * program as soon as the number of stored entries reaches this value.
 */
#define MAX_CASE_FOLDING_COUNT 2000

/* Entries read from CaseFolding.txt; the first caseFoldingCount are in use. */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
static int32_t caseFoldingCount=0;
    535 
/*
 * Line callback for u_parseDelimitedFile() reading CaseFolding.txt.
 *
 * Field 0 is the code point, field 1 the one-letter status, field 2 the mapping.
 * The line is first parsed into the slot caseFoldings[caseFoldingCount];
 * that slot only becomes a stored entry if caseFoldingCount is incremented
 * at the end of the function. Depending on the status, the line is
 *   - ignored ('L'),
 *   - merged into the immediately preceding entry for the same code point
 *     ('S' after 'F', or 'F' after 'S'),
 *   - turned into an empty marker that replaces the preceding entries
 *     for the same code point ('I' and 'T'),
 *   - or stored as a new entry (everything else).
 *
 * context and fieldCount are not used.
 * Syntax errors, out-of-order code points and a full array print a
 * message to stderr and terminate the program.
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    /* code point of the entry stored most recently; kept across calls for the order check */
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /*
     * get the mapping:
     * full[0] receives the length returned by u_parseString(), the text goes to full[1..];
     * simple is filled in by u_parseString() through its fourth argument
     * (presumably with the first code point of the mapping - confirm in uparse.h)
     */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        /* empty, longer than two units, or two units of which the first is a "single" one */
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            /* the count is not incremented: the current slot is reused by the next line */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            /* copies the length element and the 31 text elements of full[] */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* resetting prevCode lets the ascending-order check below accept the repeated code point */
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        /*
         * Note: if entries were removed above, caseFoldingCount now names the
         * first of them, so the marker is written into that slot, which keeps
         * the code and status stored there by the earlier line.
         */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* commit the slot as a stored entry; stop before the array could overflow */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
    636 
    637 static void
    638 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
    639     char *fields[3][2];
    640 
    641     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    642         return;
    643     }
    644 
    645     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
    646 }
    647 
    648 /* parser for UnicodeData.txt ----------------------------------------------- */
    649 
/* general categories */
/*
 * Two-letter general category names as they appear in UnicodeData.txt field 2.
 * unicodeDataLineFn() stores the array index of the matching name as the
 * character's gc value.
 * NOTE(review): the order presumably follows the UCharCategory constants
 * in uchar.h - confirm before reordering.
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};

/*
 * Read positions in specialCasings[] and caseFoldings[]: unicodeDataLineFn()
 * advances them as it meets the matching code points in UnicodeData.txt,
 * and parseDB() checks afterwards that both arrays were consumed completely.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
    664 
    665 static void U_CALLCONV
    666 unicodeDataLineFn(void *context,
    667                   char *fields[][2], int32_t fieldCount,
    668                   UErrorCode *pErrorCode) {
    669     Props p;
    670     char *end;
    671     static UChar32 prevCode=0;
    672     UChar32 value;
    673     int32_t i;
    674 
    675     /* reset the properties */
    676     uprv_memset(&p, 0, sizeof(Props));
    677 
    678     /* get the character code, field 0 */
    679     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    680     if(end<=fields[0][0] || end!=fields[0][1]) {
    681         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
    682         *pErrorCode=U_PARSE_ERROR;
    683         exit(U_PARSE_ERROR);
    684     }
    685 
    686     /* get general category, field 2 */
    687     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    688     if(i>=0) {
    689         p.gc=(uint8_t)i;
    690     } else {
    691         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
    692             fields[2][0], (unsigned long)p.code);
    693         *pErrorCode=U_PARSE_ERROR;
    694         exit(U_PARSE_ERROR);
    695     }
    696 
    697     /* get canonical combining class, field 3 */
    698     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
    699     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
    700         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
    701         *pErrorCode=U_PARSE_ERROR;
    702         exit(U_PARSE_ERROR);
    703     }
    704     p.cc=(uint8_t)value;
    705 
    706     /* get uppercase mapping, field 12 */
    707     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
    708     if(end!=fields[12][1]) {
    709         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
    710             (unsigned long)p.code);
    711         *pErrorCode=U_PARSE_ERROR;
    712         exit(U_PARSE_ERROR);
    713     }
    714     if(value!=0 && value!=p.code) {
    715         p.upperCase=value;
    716         uset_add(caseSensitive, p.code);
    717         uset_add(caseSensitive, value);
    718     }
    719 
    720     /* get lowercase value, field 13 */
    721     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
    722     if(end!=fields[13][1]) {
    723         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
    724             (unsigned long)p.code);
    725         *pErrorCode=U_PARSE_ERROR;
    726         exit(U_PARSE_ERROR);
    727     }
    728     if(value!=0 && value!=p.code) {
    729         p.lowerCase=value;
    730         uset_add(caseSensitive, p.code);
    731         uset_add(caseSensitive, value);
    732     }
    733 
    734     /* get titlecase value, field 14 */
    735     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
    736     if(end!=fields[14][1]) {
    737         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
    738             (unsigned long)p.code);
    739         *pErrorCode=U_PARSE_ERROR;
    740         exit(U_PARSE_ERROR);
    741     }
    742     if(value!=0 && value!=p.code) {
    743         p.titleCase=value;
    744         uset_add(caseSensitive, p.code);
    745         uset_add(caseSensitive, value);
    746     }
    747 
    748     /* set additional properties from previously parsed files */
    749     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
    750         p.specialCasing=specialCasings+specialCasingIndex++;
    751     } else {
    752         p.specialCasing=NULL;
    753     }
    754     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
    755         p.caseFolding=caseFoldings+caseFoldingIndex++;
    756 
    757         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
    758         if( p.caseFolding->status=='C' &&
    759             p.caseFolding->simple==p.lowerCase
    760         ) {
    761             p.caseFolding=NULL;
    762         }
    763     } else {
    764         p.caseFolding=NULL;
    765     }
    766 
    767     /* check for non-character code points */
    768     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
    769         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
    770                 (unsigned long)p.code);
    771         *pErrorCode=U_PARSE_ERROR;
    772         exit(U_PARSE_ERROR);
    773     }
    774 
    775     /* check that the code points (p.code) are in ascending order */
    776     if(p.code<=prevCode && p.code>0) {
    777         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
    778                 (unsigned long)p.code, (unsigned long)prevCode);
    779         *pErrorCode=U_PARSE_ERROR;
    780         exit(U_PARSE_ERROR);
    781     }
    782 
    783     /* properties for a single code point */
    784     setProps(&p);
    785 
    786     prevCode=p.code;
    787 }
    788 
    789 static void
    790 parseDB(const char *filename, UErrorCode *pErrorCode) {
    791     char *fields[15][2];
    792     UChar32 start, end;
    793     int32_t i;
    794 
    795     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    796         return;
    797     }
    798 
    799     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
    800 
    801     /* are all sub-properties consumed? */
    802     if(specialCasingIndex<specialCasingCount) {
    803         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
    804         *pErrorCode=U_PARSE_ERROR;
    805         exit(U_PARSE_ERROR);
    806     }
    807     if(caseFoldingIndex<caseFoldingCount) {
    808         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
    809         *pErrorCode=U_PARSE_ERROR;
    810         exit(U_PARSE_ERROR);
    811     }
    812 
    813     if(U_FAILURE(*pErrorCode)) {
    814         return;
    815     }
    816 
    817     for(i=0;
    818         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
    819         ++i
    820     ) {
    821         addCaseSensitive(start, end);
    822     }
    823     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
    824         *pErrorCode=U_ZERO_ERROR;
    825     }
    826 }
    827 
    828 /*
    829  * Hey, Emacs, please set the following:
    830  *
    831  * Local Variables:
    832  * indent-tabs-mode: nil
    833  * End:
    834  *
    835  */
    836