Home | History | Annotate | Download | only in genuca
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  genuca.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created at the end of XX century
     14 *   created by: Vladimir Weinstein
     15 *
     16 *   This program reads the Fractional UCA table and generates
     17 *   internal format for UCA table as well as inverse UCA table.
     18 *   It then writes binary files containing the data: ucadata.dat
     19 *   & invuca.dat
     20 *   Change history:
     21 *   02/23/2001  grhoten                 Made it into a tool
     22 *   02/23/2001  weiv                    Moved element & table handling code to i18n
     23 *   05/09/2001  weiv                    Case bits are now in the CEs, not in front
     24 */
     25 
     26 #include "unicode/utypes.h"
     27 #include "unicode/putil.h"
     28 #include "unicode/udata.h"
     29 #include "unicode/uclean.h"
     30 #include "ucol_imp.h"
     31 #include "genuca.h"
     32 #include "uoptions.h"
     33 #include "toolutil.h"
     34 #include "unewdata.h"
     35 #include "cstring.h"
     36 #include "cmemory.h"
     37 
     38 #include <stdio.h>
     39 
/*
 * Global - verbosity.
 * When TRUE, the functions in this file print progress information to stdout.
 */
UBool VERBOSE = FALSE;

/* UCA version; filled in by readAnElement() when it sees a "[UCA version = " directive. */
static UVersionInfo UCAVersion;
     46 
     47 #if UCONFIG_NO_COLLATION
     48 
     49 /* dummy UDataInfo cf. udata.h */
     50 static UDataInfo dummyDataInfo = {
     51     sizeof(UDataInfo),
     52     0,
     53 
     54     U_IS_BIG_ENDIAN,
     55     U_CHARSET_FAMILY,
     56     U_SIZEOF_UCHAR,
     57     0,
     58 
     59     { 0, 0, 0, 0 },                 /* dummy dataFormat */
     60     { 0, 0, 0, 0 },                 /* dummy formatVersion */
     61     { 0, 0, 0, 0 }                  /* dummy dataVersion */
     62 };
     63 
     64 #else
     65 
     66 static const UDataInfo ucaDataInfo={
     67     sizeof(UDataInfo),
     68     0,
     69 
     70     U_IS_BIG_ENDIAN,
     71     U_CHARSET_FAMILY,
     72     sizeof(UChar),
     73     0,
     74 
     75     {UCA_DATA_FORMAT_0, UCA_DATA_FORMAT_1, UCA_DATA_FORMAT_2, UCA_DATA_FORMAT_3},     /* dataFormat="UCol"            */
     76     /* 03/26/2002 bumped up version since format has changed */
     77     /* 09/16/2002 bumped up version since we went from UColAttributeValue */
     78     /*            to int32_t in UColOptionSet */
     79     /* 05/13/2003 This one also updated since we added UCA and UCD versions */
     80     /*            to header */
     81     /* 09/11/2003 Adding information required by data swapper */
     82     {UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1, UCA_FORMAT_VERSION_2, UCA_FORMAT_VERSION_3},                 /* formatVersion                */
     83     {0, 0, 0, 0}                  /* dataVersion = Unicode Version*/
     84 };
     85 
     86 static const UDataInfo invUcaDataInfo={
     87     sizeof(UDataInfo),
     88     0,
     89 
     90     U_IS_BIG_ENDIAN,
     91     U_CHARSET_FAMILY,
     92     sizeof(UChar),
     93     0,
     94 
     95     {INVUCA_DATA_FORMAT_0, INVUCA_DATA_FORMAT_1, INVUCA_DATA_FORMAT_2, INVUCA_DATA_FORMAT_3},     /* dataFormat="InvC"            */
     96     /* 03/26/2002 bumped up version since format has changed */
     97     /* 04/29/2003 2.1 format - we have added UCA version to header */
     98     {INVUCA_FORMAT_VERSION_0, INVUCA_FORMAT_VERSION_1, INVUCA_FORMAT_VERSION_2, INVUCA_FORMAT_VERSION_3},                 /* formatVersion                */
     99     {0, 0, 0, 0}                  /* dataVersion = Unicode Version*/
    100 };
    101 
/*
 * Shared scratch element. readAnElement() points its working pointer at this
 * object, fills it in and returns its address, so every call overwrites the
 * result of the previous one.
 */
UCAElements le;
    103 
    104 int32_t readElement(char **from, char *to, char separator, UErrorCode *status) {
    105     if(U_FAILURE(*status)) {
    106         return 0;
    107     }
    108     char buffer[1024];
    109     int32_t i = 0;
    110     while(**from != separator) {
    111         if(**from != ' ') {
    112             *(buffer+i++) = **from;
    113         }
    114         (*from)++;
    115     }
    116     (*from)++;
    117     *(buffer + i) = 0;
    118     //*to = (char *)malloc(strlen(buffer)+1);
    119     strcpy(to, buffer);
    120     return i/2;
    121 }
    122 
    123 
    124 uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UErrorCode *status) {
    125     if(U_FAILURE(*status)) {
    126         return 0;
    127     }
    128     uint32_t value = 0;
    129     char primsave = '\0';
    130     char secsave = '\0';
    131     char tersave = '\0';
    132     char *primend = primary+4;
    133     if(strlen(primary) > 4) {
    134         primsave = *primend;
    135         *primend = '\0';
    136     }
    137     char *secend = secondary+2;
    138     if(strlen(secondary) > 2) {
    139         secsave = *secend;
    140         *secend = '\0';
    141     }
    142     char *terend = tertiary+2;
    143     if(strlen(tertiary) > 2) {
    144         tersave = *terend;
    145         *terend = '\0';
    146     }
    147     uint32_t primvalue = (uint32_t)((*primary!='\0')?strtoul(primary, &primend, 16):0);
    148     uint32_t secvalue = (uint32_t)((*secondary!='\0')?strtoul(secondary, &secend, 16):0);
    149     uint32_t tervalue = (uint32_t)((*tertiary!='\0')?strtoul(tertiary, &terend, 16):0);
    150     if(primvalue <= 0xFF) {
    151       primvalue <<= 8;
    152     }
    153 
    154     value = ((primvalue<<UCOL_PRIMARYORDERSHIFT)&UCOL_PRIMARYORDERMASK)|
    155         ((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)|
    156         (tervalue&UCOL_TERTIARYORDERMASK);
    157 
    158     if(primsave!='\0') {
    159         *primend = primsave;
    160     }
    161     if(secsave!='\0') {
    162         *secend = secsave;
    163     }
    164     if(tersave!='\0') {
    165         *terend = tersave;
    166     }
    167     return value;
    168 }
    169 
/*
 * Inverse UCA table under construction. Each row holds:
 *   [0] the first CE,
 *   [1] the continuation CE, or 0 if there is none,
 *   [2] either a single code point, or a size (shifted by
 *       UCOL_INV_SHIFTVALUE) combined with an offset into stringContinue.
 */
static uint32_t inverseTable[0xFFFF][3];
/* Index of the last row in use; row 0 is an all-zero placeholder set up by addToInverse(). */
static uint32_t inversePos = 0;
/* Storage for code point sequences referenced from inverseTable[][2]. */
static UChar stringContinue[0xFFFF];
/* Index of the next free UChar in stringContinue. */
static uint32_t sContPos = 0;
    174 
    175 static void addNewInverse(UCAElements *element, UErrorCode *status) {
    176   if(U_FAILURE(*status)) {
    177     return;
    178   }
    179   if(VERBOSE && isContinuation(element->CEs[1])) {
    180     //fprintf(stdout, "+");
    181   }
    182   inversePos++;
    183   inverseTable[inversePos][0] = element->CEs[0];
    184   if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
    185     inverseTable[inversePos][1] = element->CEs[1];
    186   } else {
    187     inverseTable[inversePos][1] = 0;
    188   }
    189   if(element->cSize < 2) {
    190     inverseTable[inversePos][2] = element->cPoints[0];
    191   } else { /* add a new store of cruft */
    192     inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
    193     memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
    194     sContPos += element->cSize+1;
    195   }
    196 }
    197 
    198 static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
    199   if(U_FAILURE(*status)) {
    200     return;
    201   }
    202 
    203   if(VERBOSE && isContinuation(element->CEs[1])) {
    204     //fprintf(stdout, "+");
    205   }
    206   if(position <= inversePos) {
    207     /*move stuff around */
    208     uint32_t amountToMove = (inversePos - position+1)*sizeof(inverseTable[0]);
    209     uprv_memmove(inverseTable[position+1], inverseTable[position], amountToMove);
    210   }
    211   inverseTable[position][0] = element->CEs[0];
    212   if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
    213     inverseTable[position][1] = element->CEs[1];
    214   } else {
    215     inverseTable[position][1] = 0;
    216   }
    217   if(element->cSize < 2) {
    218     inverseTable[position][2] = element->cPoints[0];
    219   } else { /* add a new store of cruft */
    220     inverseTable[position][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
    221     memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
    222     sContPos += element->cSize+1;
    223   }
    224   inversePos++;
    225 }
    226 
    227 static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
    228 
    229   if(U_FAILURE(*status)) {
    230     return;
    231   }
    232 
    233       if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */
    234         stringContinue[sContPos] = (UChar)inverseTable[position][2];
    235         inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos;
    236         sContPos++;
    237         stringContinue[sContPos++] = 0xFFFF;
    238         memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
    239         sContPos += element->cSize;
    240         stringContinue[sContPos++] = 0xFFFE;
    241       } else { /* adding to the already existing continuing table */
    242         uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK;
    243         uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
    244 
    245         if(contIndex+contSize < sContPos) {
    246           /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
    247           memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar));
    248         }
    249 
    250         stringContinue[contIndex+contSize-1] = 0xFFFF;
    251         memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar));
    252         sContPos += element->cSize+1;
    253         stringContinue[contIndex+contSize+element->cSize] = 0xFFFE;
    254 
    255         inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex;
    256       }
    257 }
    258 
    259 /*
    260  * Takes two CEs (lead and continuation) and
    261  * compares them as CEs should be compared:
    262  * primary vs. primary, secondary vs. secondary
    263  * tertiary vs. tertiary
    264  */
    265 static int32_t compareCEs(uint32_t *source, uint32_t *target) {
    266   uint32_t s1 = source[0], s2, t1 = target[0], t2;
    267   if(isContinuation(source[1])) {
    268     s2 = source[1];
    269   } else {
    270     s2 = 0;
    271   }
    272   if(isContinuation(target[1])) {
    273     t2 = target[1];
    274   } else {
    275     t2 = 0;
    276   }
    277 
    278   uint32_t s = 0, t = 0;
    279   if(s1 == t1 && s2 == t2) {
    280     return 0;
    281   }
    282   s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
    283   t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
    284   if(s < t) {
    285     return -1;
    286   } else if(s > t) {
    287     return 1;
    288   } else {
    289     s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
    290     t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
    291     if(s < t) {
    292       return -1;
    293     } else if(s > t) {
    294       return 1;
    295     } else {
    296       s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
    297       t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
    298       if(s < t) {
    299         return -1;
    300       } else {
    301         return 1;
    302       }
    303     }
    304   }
    305 }
    306 
    307 static uint32_t addToInverse(UCAElements *element, UErrorCode *status) {
    308   uint32_t position = inversePos;
    309   uint32_t saveElement = element->CEs[0];
    310   int32_t compResult = 0;
    311   element->CEs[0] &= 0xFFFFFF3F;
    312   if(element->noOfCEs == 1) {
    313     element->CEs[1] = 0;
    314   }
    315   if(inversePos == 0) {
    316     inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0;
    317     addNewInverse(element, status);
    318   } else if(compareCEs(inverseTable[inversePos], element->CEs) > 0) {
    319     while((compResult = compareCEs(inverseTable[--position], element->CEs)) > 0);
    320     if(VERBOSE) { fprintf(stdout, "p:%u ", (int)position); }
    321     if(compResult == 0) {
    322       addToExistingInverse(element, position, status);
    323     } else {
    324       insertInverse(element, position+1, status);
    325     }
    326   } else if(compareCEs(inverseTable[inversePos], element->CEs) == 0) {
    327     addToExistingInverse(element, inversePos, status);
    328   } else {
    329     addNewInverse(element, status);
    330   }
    331   element->CEs[0] = saveElement;
    332   if(VERBOSE) { fprintf(stdout, "+"); }
    333   return inversePos;
    334 }
    335 
    336 static InverseUCATableHeader *assembleInverseTable(UErrorCode *status)
    337 {
    338   InverseUCATableHeader *result = NULL;
    339   uint32_t headerByteSize = paddedsize(sizeof(InverseUCATableHeader));
    340   uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3;
    341   uint32_t contsByteSize = sContPos * sizeof(UChar);
    342   uint32_t i = 0;
    343 
    344   result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize);
    345   uprv_memset(result, 0, headerByteSize + inverseTableByteSize + contsByteSize);
    346   if(result != NULL) {
    347     result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize;
    348 
    349     inversePos++;
    350     inverseTable[inversePos][0] = 0xFFFFFFFF;
    351     inverseTable[inversePos][1] = 0xFFFFFFFF;
    352     inverseTable[inversePos][2] = 0x0000FFFF;
    353     inversePos++;
    354 
    355     for(i = 2; i<inversePos; i++) {
    356       if(compareCEs(inverseTable[i-1], inverseTable[i]) > 0) {
    357         fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i][0]);
    358       } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) {
    359         fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i-1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]);
    360       }
    361     }
    362 
    363     result->tableSize = inversePos;
    364     result->contsSize = sContPos;
    365 
    366     result->table = headerByteSize;
    367     result->conts = headerByteSize + inverseTableByteSize;
    368 
    369     memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize);
    370     memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize);
    371 
    372   } else {
    373     *status = U_MEMORY_ALLOCATION_ERROR;
    374     return NULL;
    375   }
    376 
    377   return result;
    378 }
    379 
    380 
    381 static void writeOutInverseData(InverseUCATableHeader *data,
    382                   const char *outputDir,
    383                   const char *copyright,
    384                   UErrorCode *status)
    385 {
    386     UNewDataMemory *pData;
    387 
    388     long dataLength;
    389 
    390     UDataInfo invUcaInfo;
    391     uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo));
    392     u_getUnicodeVersion(invUcaInfo.dataVersion);
    393 
    394     pData=udata_create(outputDir, INVC_DATA_TYPE, INVC_DATA_NAME, &invUcaInfo,
    395                        copyright, status);
    396 
    397     if(U_FAILURE(*status)) {
    398         fprintf(stderr, "Error: unable to create %s"INVC_DATA_NAME", error %s\n", outputDir, u_errorName(*status));
    399         return;
    400     }
    401 
    402     /* write the data to the file */
    403     if (VERBOSE) {
    404         fprintf(stdout, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR,
    405                                                                 INVC_DATA_NAME,
    406                                                                 INVC_DATA_TYPE);
    407     }
    408     udata_writeBlock(pData, data, data->byteSize);
    409 
    410     /* finish up */
    411     dataLength=udata_finish(pData, status);
    412     if(U_FAILURE(*status)) {
    413         fprintf(stderr, "Error: error %d writing the output file\n", *status);
    414         return;
    415     }
    416 }
    417 
    418 
    419 
    420 static int32_t hex2num(char hex) {
    421     if(hex>='0' && hex <='9') {
    422         return hex-'0';
    423     } else if(hex>='a' && hex<='f') {
    424         return hex-'a'+10;
    425     } else if(hex>='A' && hex<='F') {
    426         return hex-'A'+10;
    427     } else {
    428         return 0;
    429     }
    430 }
    431 
    432 UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UErrorCode *status) {
    433     char buffer[2048], primary[100], secondary[100], tertiary[100];
    434     UBool detectedContraction;
    435     int32_t i = 0;
    436     unsigned int theValue;
    437     char *pointer = NULL;
    438     char *commentStart = NULL;
    439     char *startCodePoint = NULL;
    440     char *endCodePoint = NULL;
    441     char *spacePointer = NULL;
    442     char *dashPointer = NULL;
    443     char *result = fgets(buffer, 2048, data);
    444     int32_t buflen = (int32_t)uprv_strlen(buffer);
    445     if(U_FAILURE(*status)) {
    446         return 0;
    447     }
    448     *primary = *secondary = *tertiary = '\0';
    449     if(result == NULL) {
    450         if(feof(data)) {
    451             return NULL;
    452         } else {
    453             fprintf(stderr, "empty line but no EOF!\n");
    454             *status = U_INVALID_FORMAT_ERROR;
    455             return NULL;
    456         }
    457     }
    458     while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
    459       buffer[--buflen] = 0;
    460     }
    461 
    462     if(buffer[0] == 0 || buffer[0] == '#') {
    463         return NULL; // just a comment, skip whole line
    464     }
    465 
    466     UCAElements *element = &le; //(UCAElements *)malloc(sizeof(UCAElements));
    467 
    468     enum ActionType {
    469       READCE,
    470       READHEX,
    471       READUCAVERSION
    472     };
    473 
    474     // Directives.
    475     if(buffer[0] == '[') {
    476       uint32_t cnt = 0;
    477       static const struct {
    478         char name[128];
    479         uint32_t *what;
    480         ActionType what_to_do;
    481       } vt[]  = { {"[first tertiary ignorable",  consts->UCA_FIRST_TERTIARY_IGNORABLE,  READCE},
    482                   {"[last tertiary ignorable",   consts->UCA_LAST_TERTIARY_IGNORABLE,   READCE},
    483                   {"[first secondary ignorable", consts->UCA_FIRST_SECONDARY_IGNORABLE, READCE},
    484                   {"[last secondary ignorable",  consts->UCA_LAST_SECONDARY_IGNORABLE,  READCE},
    485                   {"[first primary ignorable",   consts->UCA_FIRST_PRIMARY_IGNORABLE,   READCE},
    486                   {"[last primary ignorable",    consts->UCA_LAST_PRIMARY_IGNORABLE,    READCE},
    487                   {"[first variable",            consts->UCA_FIRST_VARIABLE,            READCE},
    488                   {"[last variable",             consts->UCA_LAST_VARIABLE,             READCE},
    489                   {"[first regular",             consts->UCA_FIRST_NON_VARIABLE,        READCE},
    490                   {"[last regular",              consts->UCA_LAST_NON_VARIABLE,         READCE},
    491                   {"[first implicit",            consts->UCA_FIRST_IMPLICIT,            READCE},
    492                   {"[last implicit",             consts->UCA_LAST_IMPLICIT,             READCE},
    493                   {"[first trailing",            consts->UCA_FIRST_TRAILING,            READCE},
    494                   {"[last trailing",             consts->UCA_LAST_TRAILING,             READCE},
    495 
    496                   {"[fixed top",                       &consts->UCA_PRIMARY_TOP_MIN,           READHEX},
    497                   {"[fixed first implicit byte",       &consts->UCA_PRIMARY_IMPLICIT_MIN,      READHEX},
    498                   {"[fixed last implicit byte",        &consts->UCA_PRIMARY_IMPLICIT_MAX,      READHEX},
    499                   {"[fixed first trail byte",          &consts->UCA_PRIMARY_TRAILING_MIN,      READHEX},
    500                   {"[fixed last trail byte",           &consts->UCA_PRIMARY_TRAILING_MAX,      READHEX},
    501                   {"[fixed first special byte",        &consts->UCA_PRIMARY_SPECIAL_MIN,       READHEX},
    502                   {"[fixed last special byte",         &consts->UCA_PRIMARY_SPECIAL_MAX,       READHEX},
    503                   {"[variable top = ",                &t->options->variableTopValue,          READHEX},
    504                   {"[UCA version = ",                 NULL,                          READUCAVERSION}
    505       };
    506       for (cnt = 0; cnt<sizeof(vt)/sizeof(vt[0]); cnt++) {
    507         uint32_t vtLen = (uint32_t)uprv_strlen(vt[cnt].name);
    508         if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
    509             element->variableTop = TRUE;
    510             if(vt[cnt].what_to_do == READHEX) {
    511               if(sscanf(buffer+vtLen, "%4x", &theValue) != 1) /* read first code point */
    512               {
    513                   fprintf(stderr, " scanf(hex) failed on !\n ");
    514               }
    515               *(vt[cnt].what) = (UChar)theValue;
    516               //if(cnt == 1) { // first implicit
    517                 // we need to set the value for top next
    518                 //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base
    519                 //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303;
    520               //}
    521             } else if (vt[cnt].what_to_do == READCE) { /* vt[cnt].what_to_do == READCE */
    522               // TODO: combine & clean up the two CE parsers
    523               pointer = strchr(buffer+vtLen, '[');
    524               if(pointer) {
    525                 pointer++;
    526                 element->sizePrim[0]=readElement(&pointer, primary, ',', status);
    527                 element->sizeSec[0]=readElement(&pointer, secondary, ',', status);
    528                 element->sizeTer[0]=readElement(&pointer, tertiary, ']', status);
    529 
    530                 vt[cnt].what[0] = getSingleCEValue(primary, secondary, tertiary, status);
    531                 if(element->sizePrim[0] > 2 || element->sizeSec[0] > 1 || element->sizeTer[0] > 1) {
    532                   uint32_t CEi = 1;
    533                   uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    534                     if(2*CEi<element->sizePrim[i]) {
    535                         value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
    536                         value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
    537                     }
    538 
    539                     if(2*CEi+1<element->sizePrim[i]) {
    540                         value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
    541                         value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
    542                     }
    543 
    544                     if(CEi<element->sizeSec[i]) {
    545                         value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
    546                         value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
    547                     }
    548 
    549                     if(CEi<element->sizeTer[i]) {
    550                         value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
    551                         value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
    552                     }
    553 
    554                     CEi++;
    555 
    556                     vt[cnt].what[1] = value;
    557                     //element->CEs[CEindex++] = value;
    558                 } else {
    559                   vt[cnt].what[1] = 0;
    560                 }
    561               } else {
    562                 fprintf(stderr, "Failed to read a CE from line %s\n", buffer);
    563               }
    564             } else { //vt[cnt].what_to_do == READUCAVERSION
    565               u_versionFromString(UCAVersion, buffer+vtLen);
    566               if(VERBOSE) {
    567                 fprintf(stdout, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion[0], UCAVersion[1], UCAVersion[2], UCAVersion[3]);
    568               }
    569             }
    570             //element->cPoints[0] = (UChar)theValue;
    571             //return element;
    572             return NULL;
    573         }
    574       }
    575       fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
    576       //*status = U_INVALID_FORMAT_ERROR;
    577       return NULL;
    578     }
    579     element->variableTop = FALSE;
    580 
    581     startCodePoint = buffer;
    582     endCodePoint = strchr(startCodePoint, ';');
    583 
    584     if(endCodePoint == 0) {
    585         fprintf(stderr, "error - line with no code point!\n");
    586         *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
    587         return NULL;
    588     } else {
    589         *(endCodePoint) = 0;
    590     }
    591 
    592     memset(element, 0, sizeof(*element));
    593 
    594     element->cPoints = element->uchars;
    595 
    596     spacePointer = strchr(buffer, ' ');
    597     if(sscanf(buffer, "%4x", &theValue) != 1) /* read first code point */
    598     {
    599       fprintf(stderr, " scanf(hex) failed!\n ");
    600     }
    601     element->cPoints[0] = (UChar)theValue;
    602 
    603     if(spacePointer == 0) {
    604         detectedContraction = FALSE;
    605         element->cSize = 1;
    606     } else {
    607         dashPointer = strchr(buffer, '|');
    608         if (dashPointer != NULL) {
    609             // prefix characters
    610             element->prefixChars[0] = (UChar)theValue;
    611             element->prefixSize = 1;
    612             element->prefix = element->prefixChars;
    613             sscanf(dashPointer+1, "%4x", &theValue);
    614             element->cPoints[0] = (UChar)theValue;
    615             element->cSize = 1;
    616         }
    617         else {
    618           // Contractions or surrogate characters.
    619             i = 1;
    620             detectedContraction = TRUE;
    621             while(spacePointer != NULL) {
    622                 sscanf(spacePointer+1, "%4x", &theValue);
    623                 element->cPoints[i++] = (UChar)theValue;
    624                 spacePointer = strchr(spacePointer+1, ' ');
    625             }
    626             element->cSize = i;
    627         }
    628 
    629 
    630         //fprintf(stderr, "Number of codepoints in contraction: %i\n", i);
    631     }
    632 
    633     startCodePoint = endCodePoint+1;
    634 
    635     commentStart = strchr(startCodePoint, '#');
    636     if(commentStart == NULL) {
    637         commentStart = strlen(startCodePoint) + startCodePoint;
    638     }
    639 
    640     i = 0;
    641     uint32_t CEindex = 0;
    642     element->noOfCEs = 0;
    643     for(;;) {
    644         endCodePoint = strchr(startCodePoint, ']');
    645         if(endCodePoint == NULL || endCodePoint >= commentStart) {
    646             break;
    647         }
    648         pointer = strchr(startCodePoint, '[');
    649         pointer++;
    650 
    651         element->sizePrim[i]=readElement(&pointer, primary, ',', status);
    652         element->sizeSec[i]=readElement(&pointer, secondary, ',', status);
    653         element->sizeTer[i]=readElement(&pointer, tertiary, ']', status);
    654 
    655 
    656         /* I want to get the CEs entered right here, including continuation */
    657         element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, status);
    658 
    659         uint32_t CEi = 1;
    660         while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) {
    661           uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    662             if(2*CEi<element->sizePrim[i]) {
    663                 value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
    664                 value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
    665             }
    666 
    667             if(2*CEi+1<element->sizePrim[i]) {
    668                 value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
    669                 value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
    670             }
    671 
    672             if(CEi<element->sizeSec[i]) {
    673                 value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
    674                 value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
    675             }
    676 
    677             if(CEi<element->sizeTer[i]) {
    678                 value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
    679                 value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
    680             }
    681 
    682             CEi++;
    683 
    684             element->CEs[CEindex++] = value;
    685         }
    686 
    687       startCodePoint = endCodePoint+1;
    688       i++;
    689     }
    690     element->noOfCEs = CEindex;
    691 #if 0
    692     element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]);
    693 #endif
    694     // we don't want any strange stuff after useful data!
    695     if (pointer == NULL) {
    696         /* huh? Did we get ']' without the '['? Pair your brackets! */
    697         *status=U_INVALID_FORMAT_ERROR;
    698     }
    699     else {
    700         while(pointer < commentStart)  {
    701             if(*pointer != ' ' && *pointer != '\t')
    702             {
    703                 *status=U_INVALID_FORMAT_ERROR;
    704                 break;
    705             }
    706             pointer++;
    707         }
    708     }
    709     // Check for valid bytes in CE weights.
    710     // TODO: Tighten this so that it allows 03 & 04 in intermediate bytes
    711     // but not in final bytes.
    712     // See http://bugs.icu-project.org/trac/ticket/7167
    713     for (i = 0; i < (int32_t)CEindex; ++i) {
    714         uint32_t value = element->CEs[i];
    715         uint8_t bytes[4] = {
    716             (uint8_t)(value >> 24),
    717             (uint8_t)(value >> 16),
    718             (uint8_t)(value >> 8),
    719             (uint8_t)(value & UCOL_NEW_TERTIARYORDERMASK)
    720         };
    721         for (int j = 0; j < 4; ++j) {
    722             uint8_t maxByte =
    723                 (isContinuation(value) || j == 1) ?
    724                     UCOL_BYTE_FIRST_TAILORED :
    725                     UCOL_BYTE_COMMON;
    726             if (0 != bytes[j] && bytes[j] < maxByte) {
    727                 fprintf(stderr, "Warning: invalid UCA weight byte %02X for %s\n", bytes[j], buffer);
    728                 // TODO: return NULL;
    729             }
    730         }
    731     }
    732 
    733     if(U_FAILURE(*status)) {
    734         fprintf(stderr, "problem putting stuff in hash table %s\n", u_errorName(*status));
    735         *status = U_INTERNAL_PROGRAM_ERROR;
    736         return NULL;
    737     }
    738 
    739     return element;
    740 }
    741 
    742 
    743 void writeOutData(UCATableHeader *data,
    744                   UCAConstants *consts,
    745                   UChar contractions[][3],
    746                   uint32_t noOfcontractions,
    747                   const char *outputDir,
    748                   const char *copyright,
    749                   UErrorCode *status)
    750 {
    751     if(U_FAILURE(*status)) {
    752         return;
    753     }
    754 
    755     uint32_t size = data->size;
    756 
    757     data->UCAConsts = data->size;
    758     data->size += paddedsize(sizeof(UCAConstants));
    759 
    760     if(noOfcontractions != 0) {
    761       contractions[noOfcontractions][0] = 0;
    762       contractions[noOfcontractions][1] = 0;
    763       contractions[noOfcontractions][2] = 0;
    764       noOfcontractions++;
    765 
    766 
    767       data->contractionUCACombos = data->size;
    768       data->contractionUCACombosWidth = 3;
    769       data->contractionUCACombosSize = noOfcontractions;
    770       data->size += paddedsize((noOfcontractions*3*sizeof(UChar)));
    771     }
    772 
    773     UNewDataMemory *pData;
    774 
    775     long dataLength;
    776     UDataInfo ucaInfo;
    777     uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo));
    778     u_getUnicodeVersion(ucaInfo.dataVersion);
    779 
    780     pData=udata_create(outputDir, UCA_DATA_TYPE, UCA_DATA_NAME, &ucaInfo,
    781                        copyright, status);
    782 
    783     if(U_FAILURE(*status)) {
    784         fprintf(stderr, "Error: unable to create %s"UCA_DATA_NAME", error %s\n", outputDir, u_errorName(*status));
    785         return;
    786     }
    787 
    788     /* write the data to the file */
    789     if (VERBOSE) {
    790         fprintf(stdout, "Writing out UCA table: %s%c%s.%s\n", outputDir,
    791                                                         U_FILE_SEP_CHAR,
    792                                                         U_ICUDATA_NAME "_" UCA_DATA_NAME,
    793                                                         UCA_DATA_TYPE);
    794     }
    795     udata_writeBlock(pData, data, size);
    796 
    797     // output the constants here
    798     udata_writeBlock(pData, consts, sizeof(UCAConstants));
    799 
    800     if(noOfcontractions != 0) {
    801       udata_writeBlock(pData, contractions, noOfcontractions*3*sizeof(UChar));
    802       udata_writePadding(pData, paddedsize((noOfcontractions*3*sizeof(UChar))) - noOfcontractions*3*sizeof(uint16_t));
    803     }
    804 
    805     /* finish up */
    806     dataLength=udata_finish(pData, status);
    807     if(U_FAILURE(*status)) {
    808         fprintf(stderr, "Error: error %d writing the output file\n", *status);
    809         return;
    810     }
    811 }
    812 
    813 enum {
    814     /*
    815      * Maximum number of UCA contractions we can store.
    816      * May need to be increased for a new Unicode version.
    817      */
    818     MAX_UCA_CONTRACTION_CES=2048
    819 };
    820 
    821 static int32_t
    822 write_uca_table(const char *filename,
    823                 const char *outputDir,
    824                 const char *copyright,
    825                 UErrorCode *status)
    826 {
    827     FILE *data = fopen(filename, "r");
    828     if(data == NULL) {
    829         fprintf(stderr, "Couldn't open file: %s\n", filename);
    830         return -1;
    831     }
    832     uint32_t line = 0;
    833     UCAElements *element = NULL;
    834     UChar variableTopValue = 0;
    835     UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
    836     /* test for NULL */
    837     if(myD == NULL) {
    838         *status = U_MEMORY_ALLOCATION_ERROR;
    839         fclose(data);
    840         return 0;
    841     }
    842     uprv_memset(myD, 0, sizeof(UCATableHeader));
    843     UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
    844     /* test for NULL */
    845     if(opts == NULL) {
    846         *status = U_MEMORY_ALLOCATION_ERROR;
    847         uprv_free(myD);
    848         fclose(data);
    849         return 0;
    850     }
    851     uprv_memset(opts, 0, sizeof(UColOptionSet));
    852     UChar contractionCEs[MAX_UCA_CONTRACTION_CES][3];
    853     uprv_memset(contractionCEs, 0, sizeof(contractionCEs));
    854     uint32_t noOfContractions = 0;
    855     UCAConstants consts;
    856     uprv_memset(&consts, 0, sizeof(consts));
    857 #if 0
    858     UCAConstants consts = {
    859       UCOL_RESET_TOP_VALUE,
    860       UCOL_FIRST_PRIMARY_IGNORABLE,
    861       UCOL_LAST_PRIMARY_IGNORABLE,
    862       UCOL_LAST_PRIMARY_IGNORABLE_CONT,
    863       UCOL_FIRST_SECONDARY_IGNORABLE,
    864       UCOL_LAST_SECONDARY_IGNORABLE,
    865       UCOL_FIRST_TERTIARY_IGNORABLE,
    866       UCOL_LAST_TERTIARY_IGNORABLE,
    867       UCOL_FIRST_VARIABLE,
    868       UCOL_LAST_VARIABLE,
    869       UCOL_FIRST_NON_VARIABLE,
    870       UCOL_LAST_NON_VARIABLE,
    871 
    872       UCOL_NEXT_TOP_VALUE,
    873 /*
    874       UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
    875       UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
    876       UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
    877       UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
    878       UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
    879       UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
    880       UCOL_NEXT_FIRST_VARIABLE,
    881       UCOL_NEXT_LAST_VARIABLE,
    882 */
    883 
    884       PRIMARY_IMPLICIT_MIN,
    885       PRIMARY_IMPLICIT_MAX
    886     };
    887 #endif
    888 
    889 
    890     uprv_memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF);
    891 
    892     opts->variableTopValue = variableTopValue;
    893     opts->strength = UCOL_TERTIARY;
    894     opts->frenchCollation = UCOL_OFF;
    895     opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/
    896     opts->caseFirst = UCOL_OFF;         /* who goes first, lower case or uppercase */
    897     opts->caseLevel = UCOL_OFF;         /* do we have an extra case level */
    898     opts->normalizationMode = UCOL_OFF; /* attribute for normalization */
    899     opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */
    900     opts->numericCollation = UCOL_OFF;
    901     myD->jamoSpecial = FALSE;
    902 
    903     tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, LEAD_SURROGATE_TAG, status);
    904     if(U_FAILURE(*status))
    905     {
    906         fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status));
    907         uprv_free(opts);
    908         uprv_free(myD);
    909         fclose(data);
    910         return -1;
    911     }
    912 
    913 #if 0
    914     IMPLICIT_TAG = 9,
    915 /*
    916  *****************************************************************************************
    917  * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
    918  ******************************************************************************************
    919  */
    920 #endif
    921 
    922 // * set to zero
    923 struct {
    924       UChar32 start;
    925       UChar32 end;
    926       int32_t value;
    927     } ranges[] =
    928     {
    929 #if 0
    930       {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) },  //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
    931       {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24)  },  //1 LEAD_SURROGATE_TAG,  /* D800-DBFF*/
    932       {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) },  //2 TRAIL_SURROGATE DC00-DFFF
    933       {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //3 CJK_IMPLICIT_TAG,   /* 0x3400-0x4DB5*/
    934       {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //4 CJK_IMPLICIT_TAG,   /* 0x4E00-0x9FA5*/
    935       {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //5 CJK_IMPLICIT_TAG,   /* 0xF900-0xFA2D*/
    936       {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //6 CJK_IMPLICIT_TAG,   /* 0x20000-0x2A6D6*/
    937       {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //7 CJK_IMPLICIT_TAG,   /* 0x2F800-0x2FA1D*/
    938 #endif
    939       {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) },  //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
    940       //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24)  },  //1 LEAD_SURROGATE_TAG,  /* D800-DBFF*/
    941       {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) },  //2 TRAIL_SURROGATE DC00-DFFF
    942       // Now directly handled in the collation code by the swapCJK function.
    943       //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //3 CJK_IMPLICIT_TAG,   /* 0x3400-0x4DB5*/
    944       //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //4 CJK_IMPLICIT_TAG,   /* 0x4E00-0x9FA5*/
    945       //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //5 CJK_IMPLICIT_TAG,   /* 0xF900-0xFA2D*/
    946       //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //6 CJK_IMPLICIT_TAG,   /* 0x20000-0x2A6D6*/
    947       //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //7 CJK_IMPLICIT_TAG,   /* 0x2F800-0x2FA1D*/
    948     };
    949     uint32_t i = 0;
    950 
    951     for(i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++) {
    952       /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
    953       utrie_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value, TRUE);
    954     }
    955 
    956 
    957     int32_t surrogateCount = 0;
    958     while(!feof(data)) {
    959         if(U_FAILURE(*status)) {
    960             fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
    961                 *status, u_errorName(*status), (int)line, filename);
    962             exit(*status);
    963         }
    964 
    965         element = readAnElement(data, t, &consts, status);
    966         line++;
    967         if(VERBOSE) {
    968           fprintf(stdout, "%u ", (int)line);
    969         }
    970         if(element != NULL) {
    971             // we have read the line, now do something sensible with the read data!
    972 
    973             // Below stuff was taken care of in readAnElement
    974             //if(element->variableTop == TRUE && variableTopValue == 0) {
    975             //    t->options->variableTopValue = element->cPoints[0];
    976             //}
    977 
    978             // if element is a contraction, we want to add it to contractions
    979             if(element->cSize > 1 && element->cPoints[0] != 0xFDD0) { // this is a contraction
    980               if(UTF_IS_LEAD(element->cPoints[0]) && UTF_IS_TRAIL(element->cPoints[1]) && element->cSize == 2) {
    981                 surrogateCount++;
    982               } else {
    983                 if(noOfContractions>=MAX_UCA_CONTRACTION_CES) {
    984                   fprintf(stderr,
    985                           "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTION_CES in genuca.cpp. "
    986                           "Exiting...\n",
    987                           (int)MAX_UCA_CONTRACTION_CES);
    988                   exit(*status);
    989                 }
    990                 contractionCEs[noOfContractions][0] = element->cPoints[0];
    991                 contractionCEs[noOfContractions][1] = element->cPoints[1];
    992                 if(element->cSize > 2) { // the third one
    993                   contractionCEs[noOfContractions][2] = element->cPoints[2];
    994                 } else {
    995                   contractionCEs[noOfContractions][2] = 0;
    996                 }
    997                 noOfContractions++;
    998               }
    999             }
   1000             else {
   1001                 // TODO (claireho): does this work? Need more tests
   1002                 // The following code is to handle the UCA pre-context rules
   1003                 // for L/l with middle dot. We share the structures for contractionCombos.
   1004                 // The format for pre-context character is
   1005                 // contractionCEs[0]: codepoint in element->cPoints[0]
   1006                 // contractionCEs[1]: '\0' to differentiate with contractions.
   1007                 // contractionCEs[2]: prefix char
   1008                 if (element->prefixSize>0) {
   1009                     if(noOfContractions>=MAX_UCA_CONTRACTION_CES) {
   1010                       fprintf(stderr,
   1011                               "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTION_CES in genuca.cpp. "
   1012                               "Exiting...\n",
   1013                               (int)MAX_UCA_CONTRACTION_CES);
   1014                       exit(*status);
   1015                     }
   1016                     contractionCEs[noOfContractions][0]=element->cPoints[0];
   1017                     contractionCEs[noOfContractions][1]='\0';
   1018                     contractionCEs[noOfContractions][2]=element->prefixChars[0];
   1019                     noOfContractions++;
   1020                 }
   1021 
   1022             }
   1023 
   1024             /* we're first adding to inverse, because addAnElement will reverse the order */
   1025             /* of code points and stuff... we don't want that to happen */
   1026             addToInverse(element, status);
   1027             if(!(element->cSize > 1 && element->cPoints[0] == 0xFDD0)) {
   1028               uprv_uca_addAnElement(t, element, status);
   1029             }
   1030         }
   1031     }
   1032 
   1033     if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
   1034         fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
   1035         uprv_uca_closeTempTable(t);
   1036         uprv_free(opts);
   1037         uprv_free(myD);
   1038         fclose(data);
   1039         return -1;
   1040     }
   1041 /*    {
   1042         uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL);
   1043     }*/
   1044 
   1045     if (VERBOSE) {
   1046         fprintf(stdout, "\nLines read: %u\n", (int)line);
   1047         fprintf(stdout, "Surrogate count: %i\n", (int)surrogateCount);
   1048         fprintf(stdout, "Raw data breakdown:\n");
   1049         /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
   1050         fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions);
   1051         fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize);
   1052         fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position);
   1053     }
   1054 
   1055 
   1056     /* produce canonical closure for table */
   1057     /* first set up constants for implicit calculation */
   1058     uprv_uca_initImplicitConstants(status);
   1059     /* do the closure */
   1060     int32_t noOfClosures = uprv_uca_canonicalClosure(t, NULL, status);
   1061     if(noOfClosures != 0) {
   1062       fprintf(stderr, "Warning: %i canonical closures occured!\n", (int)noOfClosures);
   1063     }
   1064 
   1065     /* test */
   1066     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
   1067 
   1068     if (VERBOSE) {
   1069         fprintf(stdout, "Compacted data breakdown:\n");
   1070         /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
   1071         fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions);
   1072         fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize);
   1073         fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position);
   1074     }
   1075 
   1076     if(U_FAILURE(*status)) {
   1077         fprintf(stderr, "Error creating table: %s\n", u_errorName(*status));
   1078         uprv_uca_closeTempTable(t);
   1079         uprv_free(opts);
   1080         uprv_free(myD);
   1081         fclose(data);
   1082         return -1;
   1083     }
   1084 
   1085     /* populate the version info struct with version info*/
   1086     myData->version[0] = UCOL_BUILDER_VERSION;
   1087     myData->version[1] = UCAVersion[0];
   1088     myData->version[2] = UCAVersion[1];
   1089     myData->version[3] = UCAVersion[2];
   1090     /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
   1091     // Removed this macro. Instead, we use the fields below
   1092     //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
   1093     //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
   1094     uprv_memcpy(myData->UCAVersion, UCAVersion, sizeof(UVersionInfo));
   1095     u_getUnicodeVersion(myData->UCDVersion);
   1096 
   1097     writeOutData(myData, &consts, contractionCEs, noOfContractions, outputDir, copyright, status);
   1098 
   1099     InverseUCATableHeader *inverse = assembleInverseTable(status);
   1100     uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo));
   1101     writeOutInverseData(inverse, outputDir, copyright, status);
   1102 
   1103     uprv_uca_closeTempTable(t);
   1104     uprv_free(myD);
   1105     uprv_free(opts);
   1106 
   1107 
   1108     uprv_free(myData);
   1109     uprv_free(inverse);
   1110     fclose(data);
   1111 
   1112     return 0;
   1113 }
   1114 
   1115 #endif /* #if !UCONFIG_NO_COLLATION */
   1116 
   1117 static UOption options[]={
   1118     UOPTION_HELP_H,              /* 0  Numbers for those who*/
   1119     UOPTION_HELP_QUESTION_MARK,  /* 1   can't count. */
   1120     UOPTION_COPYRIGHT,           /* 2 */
   1121     UOPTION_VERSION,             /* 3 */
   1122     UOPTION_DESTDIR,             /* 4 */
   1123     UOPTION_SOURCEDIR,           /* 5 */
   1124     UOPTION_VERBOSE,             /* 6 */
   1125     UOPTION_ICUDATADIR           /* 7 */
   1126     /* weiv can't count :))))) */
   1127 };
   1128 
   1129 int main(int argc, char* argv[]) {
   1130     UErrorCode status = U_ZERO_ERROR;
   1131     const char* destdir = NULL;
   1132     const char* srcDir = NULL;
   1133     char filename[300];
   1134     char *basename = NULL;
   1135     const char *copyright = NULL;
   1136     uprv_memset(&UCAVersion, 0, 4);
   1137 
   1138     U_MAIN_INIT_ARGS(argc, argv);
   1139 
   1140     /* preset then read command line options */
   1141     options[4].value=u_getDataDirectory();
   1142     options[5].value="";
   1143     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
   1144 
   1145     /* error handling, printing usage message */
   1146     if(argc<0) {
   1147         fprintf(stderr,
   1148             "error in command line argument \"%s\"\n",
   1149             argv[-argc]);
   1150     } else if(argc<2) {
   1151         argc=-1;
   1152     }
   1153     if(options[0].doesOccur || options[1].doesOccur) {
   1154         fprintf(stderr,
   1155             "usage: %s [-options] file\n"
   1156             "\tRead in UCA collation text data and write out the binary collation data\n"
   1157             "options:\n"
   1158             "\t-h or -? or --help  this usage text\n"
   1159             "\t-V or --version     show a version message\n"
   1160             "\t-c or --copyright   include a copyright notice\n"
   1161             "\t-d or --destdir     destination directory, followed by the path\n"
   1162             "\t-s or --sourcedir   source directory, followed by the path\n"
   1163             "\t-v or --verbose     turn on verbose output\n"
   1164             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
   1165             "\t                    followed by path, defaults to %s\n",
   1166             argv[0], u_getDataDirectory());
   1167         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
   1168     }
   1169     if(options[3].doesOccur) {
   1170         fprintf(stdout, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
   1171 #if UCONFIG_NO_COLLATION
   1172             0, 0
   1173 #else
   1174             UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1
   1175 #endif
   1176             );
   1177         fprintf(stdout, U_COPYRIGHT_STRING"\n");
   1178         exit(0);
   1179     }
   1180 
   1181     /* get the options values */
   1182     destdir = options[4].value;
   1183     srcDir = options[5].value;
   1184     VERBOSE = options[6].doesOccur;
   1185 
   1186     if (options[2].doesOccur) {
   1187         copyright = U_COPYRIGHT_STRING;
   1188     }
   1189 
   1190     if (options[7].doesOccur) {
   1191         u_setDataDirectory(options[7].value);
   1192     }
   1193     /* Initialize ICU */
   1194     u_init(&status);
   1195     if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
   1196         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
   1197             argv[0], u_errorName(status));
   1198         exit(1);
   1199     }
   1200     status = U_ZERO_ERROR;
   1201 
   1202 
   1203     /* prepare the filename beginning with the source dir */
   1204     uprv_strcpy(filename, srcDir);
   1205     basename=filename+uprv_strlen(filename);
   1206 
   1207     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
   1208         *basename++ = U_FILE_SEP_CHAR;
   1209     }
   1210 
   1211     if(argc < 0) {
   1212       uprv_strcpy(basename, "FractionalUCA.txt");
   1213     } else {
   1214       argv++;
   1215       uprv_strcpy(basename, getLongPathname(*argv));
   1216     }
   1217 
   1218 #if 0
   1219     if(u_getCombiningClass(0x0053) == 0)
   1220     {
   1221         fprintf(stderr, "SEVERE ERROR: Normalization data is not functioning! Bailing out.  Was not able to load unorm.dat.\n");
   1222         exit(1);
   1223     }
   1224 #endif
   1225 
   1226 #if UCONFIG_NO_COLLATION
   1227 
   1228     UNewDataMemory *pData;
   1229     const char *msg;
   1230 
   1231     msg = "genuca writes dummy " UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
   1232     fprintf(stderr, "%s\n", msg);
   1233     pData = udata_create(destdir, UCA_DATA_TYPE, UCA_DATA_NAME, &dummyDataInfo,
   1234                          NULL, &status);
   1235     udata_writeBlock(pData, msg, strlen(msg));
   1236     udata_finish(pData, &status);
   1237 
   1238     msg = "genuca writes dummy " INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
   1239     fprintf(stderr, "%s\n", msg);
   1240     pData = udata_create(destdir, INVC_DATA_TYPE, INVC_DATA_NAME, &dummyDataInfo,
   1241                          NULL, &status);
   1242     udata_writeBlock(pData, msg, strlen(msg));
   1243     udata_finish(pData, &status);
   1244 
   1245     return (int)status;
   1246 
   1247 #else
   1248 
   1249     return write_uca_table(filename, destdir, copyright, &status);
   1250 
   1251 #endif
   1252 }
   1253 
   1254 /*
   1255  * Hey, Emacs, please set the following:
   1256  *
   1257  * Local Variables:
   1258  * indent-tabs-mode: nil
   1259  * End:
   1260  *
   1261  */
   1262