Home | History | Annotate | Download | only in gencase
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  gencase.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug28
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
     18 *   It then writes a binary file containing the properties
     19 *   that is designed to be used directly for random-access to
     20 *   the properties of each Unicode character.
     21 */
     22 
     23 #include <stdio.h>
     24 #include "unicode/utypes.h"
     25 #include "unicode/uchar.h"
     26 #include "unicode/uset.h"
     27 #include "unicode/putil.h"
     28 #include "unicode/uclean.h"
     29 #include "cmemory.h"
     30 #include "cstring.h"
     31 #include "uarrsort.h"
     32 #include "unewdata.h"
     33 #include "uoptions.h"
     34 #include "uparse.h"
     35 #include "uprops.h"
     36 #include "propsvec.h"
     37 #include "gencase.h"
     38 
     39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
     40 
     41 /* data --------------------------------------------------------------------- */
     42 
     43 UPropsVectors *pv;
     44 
     45 UBool beVerbose=FALSE, haveCopyright=TRUE;
     46 
     47 /*
     48  * Unicode set collecting the case-sensitive characters;
     49  * see uchar.h UCHAR_CASE_SENSITIVE.
     50  * Add code points from case mappings/foldings in
     51  * the root locale and with default options.
     52  */
     53 static USet *caseSensitive;
     54 
     55 /* prototypes --------------------------------------------------------------- */
     56 
     57 static void
     58 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
     59 
     60 static void
     61 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
     62 
     63 static void
     64 parseDB(const char *filename, UErrorCode *pErrorCode);
     65 
     66 /* parse files with multiple binary properties ------------------------------ */
     67 
     68 /* TODO: more common code, move functions to uparse.h|c */
     69 
     70 /* TODO: similar to genprops/props2.c but not the same */
     71 
     72 struct Binary {
     73     const char *propName;
     74     int32_t vecWord;
     75     uint32_t vecValue, vecMask;
     76 };
     77 typedef struct Binary Binary;
     78 
     79 struct Binaries {
     80     const char *ucdFile;
     81     const Binary *binaries;
     82     int32_t binariesCount;
     83 };
     84 typedef struct Binaries Binaries;
     85 
     86 static const Binary
     87 propListNames[]={
     88     { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
     89 };
     90 
     91 static const Binaries
     92 propListBinaries={
     93     "PropList", propListNames, LENGTHOF(propListNames)
     94 };
     95 
     96 static const Binary
     97 derCorePropsNames[]={
     98     { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
     99     { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK }
    100 };
    101 
    102 static const Binaries
    103 derCorePropsBinaries={
    104     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
    105 };
    106 
    107 /*
    108  * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
    109  * We need not distinguish between them because both add to case-ignorable.
    110  * We ignore all other Word_Break values.
    111  */
    112 static const Binary
    113 wordBreakNames[]={
    114     { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
    115     { "MidNumLet",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
    116 };
    117 
    118 static const Binaries
    119 wordBreakBinaries={
    120     "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
    121 };
    122 
    123 static void U_CALLCONV
    124 binariesLineFn(void *context,
    125                char *fields[][2], int32_t fieldCount,
    126                UErrorCode *pErrorCode) {
    127     const Binaries *bin;
    128     char *s;
    129     uint32_t start, end;
    130     int32_t i;
    131 
    132     bin=(const Binaries *)context;
    133 
    134     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    135     if(U_FAILURE(*pErrorCode)) {
    136         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
    137         exit(*pErrorCode);
    138     }
    139 
    140     /* parse binary property name */
    141     s=(char *)u_skipWhitespace(fields[1][0]);
    142     for(i=0;; ++i) {
    143         if(i==bin->binariesCount) {
    144             /* ignore unrecognized properties */
    145             return;
    146         }
    147         if(isToken(bin->binaries[i].propName, s)) {
    148             break;
    149         }
    150     }
    151 
    152     if(bin->binaries[i].vecMask==0) {
    153         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
    154                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
    155         exit(U_INTERNAL_PROGRAM_ERROR);
    156     }
    157 
    158     upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
    159     if(U_FAILURE(*pErrorCode)) {
    160         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
    161                         bin->binaries[i].propName, u_errorName(*pErrorCode));
    162         exit(*pErrorCode);
    163     }
    164 }
    165 
    166 static void
    167 parseBinariesFile(char *filename, char *basename, const char *suffix,
    168                   const Binaries *bin,
    169                   UErrorCode *pErrorCode) {
    170     char *fields[2][2];
    171 
    172     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    173         return;
    174     }
    175 
    176     writeUCDFilename(basename, bin->ucdFile, suffix);
    177 
    178     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
    179     if(U_FAILURE(*pErrorCode)) {
    180         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
    181     }
    182 }
    183 
    184 /* -------------------------------------------------------------------------- */
    185 
    186 enum
    187 {
    188     HELP_H,
    189     HELP_QUESTION_MARK,
    190     VERBOSE,
    191     COPYRIGHT,
    192     DESTDIR,
    193     SOURCEDIR,
    194     UNICODE_VERSION,
    195     ICUDATADIR,
    196     CSOURCE
    197 };
    198 
    199 /* Keep these values in sync with the above enums */
    200 static UOption options[]={
    201     UOPTION_HELP_H,
    202     UOPTION_HELP_QUESTION_MARK,
    203     UOPTION_VERBOSE,
    204     UOPTION_COPYRIGHT,
    205     UOPTION_DESTDIR,
    206     UOPTION_SOURCEDIR,
    207     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    208     UOPTION_ICUDATADIR,
    209     UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
    210 };
    211 
    212 extern int
    213 main(int argc, char* argv[]) {
    214     char filename[300];
    215     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    216     char *basename=NULL;
    217     UErrorCode errorCode=U_ZERO_ERROR;
    218 
    219     U_MAIN_INIT_ARGS(argc, argv);
    220 
    221     /* preset then read command line options */
    222     options[DESTDIR].value=u_getDataDirectory();
    223     options[SOURCEDIR].value="";
    224     options[UNICODE_VERSION].value="";
    225     options[ICUDATADIR].value=u_getDataDirectory();
    226     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    227 
    228     /* error handling, printing usage message */
    229     if(argc<0) {
    230         fprintf(stderr,
    231             "error in command line argument \"%s\"\n",
    232             argv[-argc]);
    233     }
    234     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
    235         /*
    236          * Broken into chucks because the C89 standard says the minimum
    237          * required supported string length is 509 bytes.
    238          */
    239         fprintf(stderr,
    240             "Usage: %s [-options] [suffix]\n"
    241             "\n"
    242             "read the UnicodeData.txt file and other Unicode properties files and\n"
    243             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
    244             "\n",
    245             argv[0]);
    246         fprintf(stderr,
    247             "Options:\n"
    248             "\t-h or -? or --help  this usage text\n"
    249             "\t-v or --verbose     verbose output\n"
    250             "\t-c or --copyright   include a copyright notice\n"
    251             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
    252             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
    253         fprintf(stderr,
    254             "\t-d or --destdir     destination directory, followed by the path\n"
    255             "\t-s or --sourcedir   source directory, followed by the path\n"
    256             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
    257             "\t                    followed by path, defaults to %s\n"
    258             "\tsuffix              suffix that is to be appended with a '-'\n"
    259             "\t                    to the source file basenames before opening;\n"
    260             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
    261             u_getDataDirectory());
    262         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    263     }
    264 
    265     /* get the options values */
    266     beVerbose=options[VERBOSE].doesOccur;
    267     haveCopyright=options[COPYRIGHT].doesOccur;
    268     srcDir=options[SOURCEDIR].value;
    269     destDir=options[DESTDIR].value;
    270 
    271     if(argc>=2) {
    272         suffix=argv[1];
    273     } else {
    274         suffix=NULL;
    275     }
    276 
    277     if(options[UNICODE_VERSION].doesOccur) {
    278         setUnicodeVersion(options[UNICODE_VERSION].value);
    279     }
    280     /* else use the default dataVersion in store.c */
    281 
    282     if (options[ICUDATADIR].doesOccur) {
    283         u_setDataDirectory(options[ICUDATADIR].value);
    284     }
    285 
    286     /* prepare the filename beginning with the source dir */
    287     uprv_strcpy(filename, srcDir);
    288     basename=filename+uprv_strlen(filename);
    289     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
    290         *basename++=U_FILE_SEP_CHAR;
    291     }
    292 
    293     /* initialize */
    294     pv=upvec_open(2, &errorCode);
    295     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
    296 
    297     /* process SpecialCasing.txt */
    298     writeUCDFilename(basename, "SpecialCasing", suffix);
    299     parseSpecialCasing(filename, &errorCode);
    300 
    301     /* process CaseFolding.txt */
    302     writeUCDFilename(basename, "CaseFolding", suffix);
    303     parseCaseFolding(filename, &errorCode);
    304 
    305     /* process additional properties files */
    306     *basename=0;
    307 
    308     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
    309 
    310     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
    311 
    312     if(ucdVersion>=UNI_4_1) {
    313         parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
    314     }
    315 
    316     /* process UnicodeData.txt */
    317     writeUCDFilename(basename, "UnicodeData", suffix);
    318     parseDB(filename, &errorCode);
    319 
    320     /* process parsed data */
    321     makeCaseClosure();
    322 
    323     makeExceptions();
    324 
    325     if(U_SUCCESS(errorCode)) {
    326         /* write the properties data file */
    327         generateData(destDir, options[CSOURCE].doesOccur);
    328     }
    329 
    330     u_cleanup();
    331     return errorCode;
    332 }
    333 
    334 U_CFUNC void
    335 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    336     int32_t length=(int32_t)uprv_strlen(filename);
    337     uprv_strcpy(basename, filename);
    338     if(suffix!=NULL) {
    339         basename[length++]='-';
    340         uprv_strcpy(basename+length, suffix);
    341         length+=(int32_t)uprv_strlen(suffix);
    342     }
    343     uprv_strcpy(basename+length, ".txt");
    344 }
    345 
    346 /* TODO: move to toolutil */
    347 U_CFUNC UBool
    348 isToken(const char *token, const char *s) {
    349     const char *z;
    350     int32_t j;
    351 
    352     s=u_skipWhitespace(s);
    353     for(j=0;; ++j) {
    354         if(token[j]!=0) {
    355             if(s[j]!=token[j]) {
    356                 break;
    357             }
    358         } else {
    359             z=u_skipWhitespace(s+j);
    360             if(*z==';' || *z==0) {
    361                 return TRUE;
    362             } else {
    363                 break;
    364             }
    365         }
    366     }
    367 
    368     return FALSE;
    369 }
    370 
    371 static int32_t
    372 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    373     const char *t, *z;
    374     int32_t i, j;
    375 
    376     s=u_skipWhitespace(s);
    377     for(i=0; i<countTokens; ++i) {
    378         t=tokens[i];
    379         if(t!=NULL) {
    380             for(j=0;; ++j) {
    381                 if(t[j]!=0) {
    382                     if(s[j]!=t[j]) {
    383                         break;
    384                     }
    385                 } else {
    386                     z=u_skipWhitespace(s+j);
    387                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
    388                         return i;
    389                     } else {
    390                         break;
    391                     }
    392                 }
    393             }
    394         }
    395     }
    396     return -1;
    397 }
    398 
    399 static void
    400 _set_addAll(USet *set, const UChar *s, int32_t length) {
    401     UChar32 c;
    402     int32_t i;
    403 
    404     /* needs length>=0 */
    405     for(i=0; i<length; /* U16_NEXT advances i */) {
    406         U16_NEXT(s, i, length, c);
    407         uset_add(set, c);
    408     }
    409 }
    410 
    411 /* parser for SpecialCasing.txt --------------------------------------------- */
    412 
/*
 * Capacity of specialCasings[]. specialCasingLineFn() terminates the program
 * as soon as specialCasingCount reaches this value.
 */
#define MAX_SPECIAL_CASING_COUNT 500

/* Entries collected from SpecialCasing.txt; sorted by code point in parseSpecialCasing(). */
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
/* Number of entries of specialCasings[] that are in use. */
static int32_t specialCasingCount=0;
    417 
    418 static void U_CALLCONV
    419 specialCasingLineFn(void *context,
    420                     char *fields[][2], int32_t fieldCount,
    421                     UErrorCode *pErrorCode) {
    422     char *end;
    423 
    424     /* get code point */
    425     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    426     end=(char *)u_skipWhitespace(end);
    427     if(end<=fields[0][0] || end!=fields[0][1]) {
    428         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
    429         *pErrorCode=U_PARSE_ERROR;
    430         exit(U_PARSE_ERROR);
    431     }
    432 
    433     /* is this a complex mapping? */
    434     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
    435         /* there is some condition text in the fifth field */
    436         specialCasings[specialCasingCount].isComplex=TRUE;
    437 
    438         /* do not store any actual mappings for this */
    439         specialCasings[specialCasingCount].lowerCase[0]=0;
    440         specialCasings[specialCasingCount].upperCase[0]=0;
    441         specialCasings[specialCasingCount].titleCase[0]=0;
    442     } else {
    443         /* just set the "complex" flag and get the case mappings */
    444         specialCasings[specialCasingCount].isComplex=FALSE;
    445         specialCasings[specialCasingCount].lowerCase[0]=
    446             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
    447         specialCasings[specialCasingCount].upperCase[0]=
    448             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
    449         specialCasings[specialCasingCount].titleCase[0]=
    450             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
    451         if(U_FAILURE(*pErrorCode)) {
    452             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
    453             exit(*pErrorCode);
    454         }
    455 
    456         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
    457         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
    458         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
    459         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
    460     }
    461 
    462     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
    463         fprintf(stderr, "gencase: too many special casing mappings\n");
    464         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    465         exit(U_INDEX_OUTOFBOUNDS_ERROR);
    466     }
    467 }
    468 
    469 static int32_t U_CALLCONV
    470 compareSpecialCasings(const void *context, const void *left, const void *right) {
    471     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
    472 }
    473 
    474 static void
    475 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
    476     char *fields[5][2];
    477     int32_t i, j;
    478 
    479     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    480         return;
    481     }
    482 
    483     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
    484 
    485     /* sort the special casing entries by code point */
    486     if(specialCasingCount>0) {
    487         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
    488                        compareSpecialCasings, NULL, FALSE, pErrorCode);
    489     }
    490     if(U_FAILURE(*pErrorCode)) {
    491         return;
    492     }
    493 
    494     /* replace multiple entries for any code point by one "complex" one */
    495     j=0;
    496     for(i=1; i<specialCasingCount; ++i) {
    497         if(specialCasings[i-1].code==specialCasings[i].code) {
    498             /* there is a duplicate code point */
    499             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
    500             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
    501             specialCasings[i].lowerCase[0]=0;
    502             specialCasings[i].upperCase[0]=0;
    503             specialCasings[i].titleCase[0]=0;
    504             ++j;
    505         }
    506     }
    507 
    508     /* if some entries just were removed, then re-sort */
    509     if(j>0) {
    510         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
    511                        compareSpecialCasings, NULL, FALSE, pErrorCode);
    512         specialCasingCount-=j;
    513     }
    514     if(U_FAILURE(*pErrorCode)) {
    515         return;
    516     }
    517 
    518     /*
    519      * Add one complex mapping to caseSensitive that was filtered out above:
    520      * Greek final Sigma has a conditional mapping but not locale-sensitive,
    521      * and it is taken when lowercasing just U+03A3 alone.
    522      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    523      */
    524     uset_add(caseSensitive, 0x3c2);
    525 }
    526 
    527 /* parser for CaseFolding.txt ----------------------------------------------- */
    528 
/*
 * Capacity of caseFoldings[]. caseFoldingLineFn() terminates the program
 * as soon as caseFoldingCount reaches this value.
 */
#define MAX_CASE_FOLDING_COUNT 2000

/* Entries collected from CaseFolding.txt, in the ascending code point order of the file. */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
/* Number of entries of caseFoldings[] that are in use. */
static int32_t caseFoldingCount=0;
    533 
    534 static void U_CALLCONV
    535 caseFoldingLineFn(void *context,
    536                   char *fields[][2], int32_t fieldCount,
    537                   UErrorCode *pErrorCode) {
    538     char *end;
    539     static UChar32 prevCode=0;
    540     int32_t count;
    541     char status;
    542 
    543     /* get code point */
    544     caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    545     end=(char *)u_skipWhitespace(end);
    546     if(end<=fields[0][0] || end!=fields[0][1]) {
    547         fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
    548         *pErrorCode=U_PARSE_ERROR;
    549         exit(U_PARSE_ERROR);
    550     }
    551 
    552     /* get the status of this mapping */
    553     caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    554     if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
    555         fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
    556         *pErrorCode=U_PARSE_ERROR;
    557         exit(U_PARSE_ERROR);
    558     }
    559 
    560     /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    561     if(status=='L') {
    562         return;
    563     }
    564 
    565     /* get the mapping */
    566     count=caseFoldings[caseFoldingCount].full[0]=
    567         (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    568     if(U_FAILURE(*pErrorCode)) {
    569         fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
    570         exit(*pErrorCode);
    571     }
    572 
    573     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    574     if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
    575         caseFoldings[caseFoldingCount].simple=0;
    576     }
    577 
    578     /* update the case-sensitive set */
    579     if(status!='T') {
    580         uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
    581         _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    582     }
    583 
    584     /* check the status */
    585     if(status=='S') {
    586         /* check if there was a full mapping for this code point before */
    587         if( caseFoldingCount>0 &&
    588             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
    589             caseFoldings[caseFoldingCount-1].status=='F'
    590         ) {
    591             /* merge the two entries */
    592             caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
    593             return;
    594         }
    595     } else if(status=='F') {
    596         /* check if there was a simple mapping for this code point before */
    597         if( caseFoldingCount>0 &&
    598             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
    599             caseFoldings[caseFoldingCount-1].status=='S'
    600         ) {
    601             /* merge the two entries */
    602             uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
    603             return;
    604         }
    605     } else if(status=='I' || status=='T') {
    606         /* check if there was a default mapping for this code point before (remove it) */
    607         while(caseFoldingCount>0 &&
    608               caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
    609         ) {
    610             prevCode=0;
    611             --caseFoldingCount;
    612         }
    613         /* store only a marker for special handling for cases like dotless i */
    614         caseFoldings[caseFoldingCount].simple=0;
    615         caseFoldings[caseFoldingCount].full[0]=0;
    616     }
    617 
    618     /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    619     if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
    620         fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
    621                 (unsigned long)caseFoldings[caseFoldingCount].code,
    622                 (unsigned long)prevCode);
    623         *pErrorCode=U_PARSE_ERROR;
    624         exit(U_PARSE_ERROR);
    625     }
    626     prevCode=caseFoldings[caseFoldingCount].code;
    627 
    628     if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
    629         fprintf(stderr, "gencase: too many case folding mappings\n");
    630         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    631         exit(U_INDEX_OUTOFBOUNDS_ERROR);
    632     }
    633 }
    634 
    635 static void
    636 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
    637     char *fields[3][2];
    638 
    639     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    640         return;
    641     }
    642 
    643     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
    644 }
    645 
    646 /* parser for UnicodeData.txt ----------------------------------------------- */
    647 
/*
 * General category abbreviations as they appear in field 2 of
 * UnicodeData.txt. unicodeDataLineFn() looks a name up with getTokenIndex()
 * and stores the array index of the match as the general category value.
 * NOTE(review): the order presumably mirrors the general category enum in
 * unicode/uchar.h (U_CHAR_CATEGORY_COUNT values) - confirm before reordering.
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};

/*
 * Read positions in specialCasings[] and caseFoldings[]. unicodeDataLineFn()
 * advances a position whenever the current UnicodeData.txt code point equals
 * the code of the entry at that position; parseDB() checks afterwards that
 * both arrays were consumed completely.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
    662 
    663 static void U_CALLCONV
    664 unicodeDataLineFn(void *context,
    665                   char *fields[][2], int32_t fieldCount,
    666                   UErrorCode *pErrorCode) {
    667     Props p;
    668     char *end;
    669     static UChar32 prevCode=0;
    670     UChar32 value;
    671     int32_t i;
    672 
    673     /* reset the properties */
    674     uprv_memset(&p, 0, sizeof(Props));
    675 
    676     /* get the character code, field 0 */
    677     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    678     if(end<=fields[0][0] || end!=fields[0][1]) {
    679         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
    680         *pErrorCode=U_PARSE_ERROR;
    681         exit(U_PARSE_ERROR);
    682     }
    683 
    684     /* get general category, field 2 */
    685     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    686     if(i>=0) {
    687         p.gc=(uint8_t)i;
    688     } else {
    689         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
    690             fields[2][0], (unsigned long)p.code);
    691         *pErrorCode=U_PARSE_ERROR;
    692         exit(U_PARSE_ERROR);
    693     }
    694 
    695     /* get canonical combining class, field 3 */
    696     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
    697     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
    698         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
    699         *pErrorCode=U_PARSE_ERROR;
    700         exit(U_PARSE_ERROR);
    701     }
    702     p.cc=(uint8_t)value;
    703 
    704     /* get uppercase mapping, field 12 */
    705     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
    706     if(end!=fields[12][1]) {
    707         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
    708             (unsigned long)p.code);
    709         *pErrorCode=U_PARSE_ERROR;
    710         exit(U_PARSE_ERROR);
    711     }
    712     if(value!=0 && value!=p.code) {
    713         p.upperCase=value;
    714         uset_add(caseSensitive, p.code);
    715         uset_add(caseSensitive, value);
    716     }
    717 
    718     /* get lowercase value, field 13 */
    719     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
    720     if(end!=fields[13][1]) {
    721         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
    722             (unsigned long)p.code);
    723         *pErrorCode=U_PARSE_ERROR;
    724         exit(U_PARSE_ERROR);
    725     }
    726     if(value!=0 && value!=p.code) {
    727         p.lowerCase=value;
    728         uset_add(caseSensitive, p.code);
    729         uset_add(caseSensitive, value);
    730     }
    731 
    732     /* get titlecase value, field 14 */
    733     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
    734     if(end!=fields[14][1]) {
    735         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
    736             (unsigned long)p.code);
    737         *pErrorCode=U_PARSE_ERROR;
    738         exit(U_PARSE_ERROR);
    739     }
    740     if(value!=0 && value!=p.code) {
    741         p.titleCase=value;
    742         uset_add(caseSensitive, p.code);
    743         uset_add(caseSensitive, value);
    744     }
    745 
    746     /* set additional properties from previously parsed files */
    747     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
    748         p.specialCasing=specialCasings+specialCasingIndex++;
    749     } else {
    750         p.specialCasing=NULL;
    751     }
    752     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
    753         p.caseFolding=caseFoldings+caseFoldingIndex++;
    754 
    755         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
    756         if( p.caseFolding->status=='C' &&
    757             p.caseFolding->simple==p.lowerCase
    758         ) {
    759             p.caseFolding=NULL;
    760         }
    761     } else {
    762         p.caseFolding=NULL;
    763     }
    764 
    765     /* check for non-character code points */
    766     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
    767         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
    768                 (unsigned long)p.code);
    769         *pErrorCode=U_PARSE_ERROR;
    770         exit(U_PARSE_ERROR);
    771     }
    772 
    773     /* check that the code points (p.code) are in ascending order */
    774     if(p.code<=prevCode && p.code>0) {
    775         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
    776                 (unsigned long)p.code, (unsigned long)prevCode);
    777         *pErrorCode=U_PARSE_ERROR;
    778         exit(U_PARSE_ERROR);
    779     }
    780 
    781     /* properties for a single code point */
    782     setProps(&p);
    783 
    784     prevCode=p.code;
    785 }
    786 
    787 static void
    788 parseDB(const char *filename, UErrorCode *pErrorCode) {
    789     char *fields[15][2];
    790     UChar32 start, end;
    791     int32_t i;
    792 
    793     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    794         return;
    795     }
    796 
    797     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
    798 
    799     /* are all sub-properties consumed? */
    800     if(specialCasingIndex<specialCasingCount) {
    801         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
    802         *pErrorCode=U_PARSE_ERROR;
    803         exit(U_PARSE_ERROR);
    804     }
    805     if(caseFoldingIndex<caseFoldingCount) {
    806         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
    807         *pErrorCode=U_PARSE_ERROR;
    808         exit(U_PARSE_ERROR);
    809     }
    810 
    811     if(U_FAILURE(*pErrorCode)) {
    812         return;
    813     }
    814 
    815     for(i=0;
    816         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
    817         ++i
    818     ) {
    819         addCaseSensitive(start, end);
    820     }
    821     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
    822         *pErrorCode=U_ZERO_ERROR;
    823     }
    824 }
    825 
    826 /*
    827  * Hey, Emacs, please set the following:
    828  *
    829  * Local Variables:
    830  * indent-tabs-mode: nil
    831  * End:
    832  *
    833  */
    834