Home | History | Annotate | Download | only in gennorm2
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  n2builder.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov25
     14 *   created by: Markus W. Scherer
     15 *
     16 * Builds Normalizer2 data and writes a binary .nrm file.
     17 * For the file format see source/common/normalizer2impl.h.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/std_string.h"  // U_HAVE_STD_STRING, #include <string>
     22 #include "n2builder.h"  // UCONFIG_NO_NORMALIZATION=1 if !U_HAVE_STD_STRING
     23 
     24 #include <stdio.h>
     25 #include <stdlib.h>
     26 #include <string.h>
     27 #if U_HAVE_STD_STRING
     28 #include <vector>
     29 #endif
     30 #include "unicode/errorcode.h"
     31 #include "unicode/localpointer.h"
     32 #include "unicode/putil.h"
     33 #include "unicode/udata.h"
     34 #include "unicode/uniset.h"
     35 #include "unicode/unistr.h"
     36 #include "unicode/ustring.h"
     37 #include "hash.h"
     38 #include "normalizer2impl.h"
     39 #include "toolutil.h"
     40 #include "unewdata.h"
     41 #include "utrie2.h"
     42 
     43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     44 
     45 #if !UCONFIG_NO_NORMALIZATION
     46 
     47 /* UDataInfo cf. udata.h */
     48 static UDataInfo dataInfo={
     49     sizeof(UDataInfo),
     50     0,
     51 
     52     U_IS_BIG_ENDIAN,
     53     U_CHARSET_FAMILY,
     54     U_SIZEOF_UCHAR,
     55     0,
     56 
     57     { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
     58     { 1, 0, 0, 0 },             /* formatVersion */
     59     { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
     60 };
     61 
     62 U_NAMESPACE_BEGIN
     63 
     64 class HangulIterator {
     65 public:
     66     struct Range {
     67         UChar32 start, limit;
     68         uint16_t norm16;
     69     };
     70 
     71     HangulIterator() : rangeIndex(0) {}
     72     const Range *nextRange() {
     73         if(rangeIndex<LENGTHOF(ranges)) {
     74             return ranges+rangeIndex++;
     75         } else {
     76             return NULL;
     77         }
     78     }
     79     void reset() { rangeIndex=0; }
     80 private:
     81     static const Range ranges[4];
     82     int32_t rangeIndex;
     83 };
     84 
     85 const HangulIterator::Range HangulIterator::ranges[4]={
     86     { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
     87     { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
     88     // JAMO_T_BASE+1: not U+11A7
     89     { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
     90     { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
     91 };
     92 
     93 struct CompositionPair {
     94     CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
     95     UChar32 trail, composite;
     96 };
     97 
     98 struct Norm {
     99     enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
    100 
    101     UBool hasMapping() const { return mappingType>REMOVED; }
    102 
    103     // Requires hasMapping() and well-formed mapping.
    104     void setMappingCP() {
    105         UChar32 c;
    106         if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
    107             mappingCP=c;
    108         } else {
    109             mappingCP=U_SENTINEL;
    110         }
    111     }
    112 
    113     UnicodeString *mapping;
    114     UChar32 mappingCP;  // >=0 if mapping to 1 code point
    115     int32_t mappingPhase;
    116     MappingType mappingType;
    117 
    118     U_STD_NSQ vector<CompositionPair> *compositions;
    119     uint8_t cc;
    120     UBool combinesBack;
    121     UBool hasNoCompBoundaryAfter;
    122 
    123     enum OffsetType {
    124         OFFSET_NONE, OFFSET_MAYBE_YES,
    125         OFFSET_YES_YES, OFFSET_YES_NO, OFFSET_NO_NO,
    126         OFFSET_DELTA
    127     };
    128     enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
    129     int32_t offset;
    130 };
    131 
    132 class Normalizer2DBEnumerator {
    133 public:
    134     Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
    135     virtual ~Normalizer2DBEnumerator() {}
    136     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
    137     Normalizer2DBEnumerator *ptr() { return this; }
    138 protected:
    139     Normalizer2DataBuilder &builder;
    140 };
    141 
    142 U_CDECL_BEGIN
    143 
    144 static UBool U_CALLCONV
    145 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    146     return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
    147 }
    148 
    149 U_CDECL_END
    150 
    151 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
    152         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) {
    153     memset(unicodeVersion, 0, sizeof(unicodeVersion));
    154     normTrie=utrie2_open(0, 0, &errorCode);
    155     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
    156     norms=allocNorm();  // unused Norm struct at index 0
    157     memset(indexes, 0, sizeof(indexes));
    158 }
    159 
    160 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
    161     utrie2_close(normTrie);
    162     int32_t normsLength=utm_countItems(normMem);
    163     for(int32_t i=1; i<normsLength; ++i) {
    164         delete norms[i].mapping;
    165         delete norms[i].compositions;
    166     }
    167     utm_close(normMem);
    168     utrie2_close(norm16Trie);
    169 }
    170 
    171 void
    172 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
    173     u_versionFromString(unicodeVersion, v);
    174 }
    175 
    176 Norm *Normalizer2DataBuilder::allocNorm() {
    177     Norm *p=(Norm *)utm_alloc(normMem);
    178     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
    179     return p;
    180 }
    181 
    182 /* get an existing Norm unit */
    183 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
    184     uint32_t i=utrie2_get32(normTrie, c);
    185     if(i==0) {
    186         return NULL;
    187     }
    188     return norms+i;
    189 }
    190 
    191 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
    192     return norms[utrie2_get32(normTrie, c)];
    193 }
    194 
    195 /*
    196  * get or create a Norm unit;
    197  * get or create the intermediate trie entries for it as well
    198  */
    199 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
    200     uint32_t i=utrie2_get32(normTrie, c);
    201     if(i!=0) {
    202         return norms+i;
    203     } else {
    204         /* allocate Norm */
    205         Norm *p=allocNorm();
    206         IcuToolErrorCode errorCode("gennorm2/createNorm()");
    207         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
    208         return p;
    209     }
    210 }
    211 
    212 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
    213     if(p!=NULL) {
    214         if(p->mappingType!=Norm::NONE) {
    215             if( overrideHandling==OVERRIDE_NONE ||
    216                 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
    217             ) {
    218                 fprintf(stderr,
    219                         "error in gennorm2 phase %d: "
    220                         "not permitted to override mapping for U+%04lX from phase %d\n",
    221                         (int)phase, (long)c, (int)p->mappingPhase);
    222                 exit(U_INVALID_FORMAT_ERROR);
    223             }
    224             delete p->mapping;
    225             p->mapping=NULL;
    226         }
    227         p->mappingPhase=phase;
    228     }
    229     return p;
    230 }
    231 
    232 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
    233     overrideHandling=oh;
    234     ++phase;
    235 }
    236 
    237 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
    238     createNorm(c)->cc=cc;
    239 }
    240 
    241 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
    242     return getNormRef(c).cc;
    243 }
    244 
    245 static UBool isWellFormed(const UnicodeString &s) {
    246     UErrorCode errorCode=U_ZERO_ERROR;
    247     u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
    248     return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
    249 }
    250 
    251 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
    252     if(!isWellFormed(m)) {
    253         fprintf(stderr,
    254                 "error in gennorm2 phase %d: "
    255                 "illegal one-way mapping from U+%04lX to malformed string\n",
    256                 (int)phase, (long)c);
    257         exit(U_INVALID_FORMAT_ERROR);
    258     }
    259     Norm *p=checkNormForMapping(createNorm(c), c);
    260     p->mapping=new UnicodeString(m);
    261     p->mappingType=Norm::ONE_WAY;
    262     p->setMappingCP();
    263 }
    264 
    265 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
    266     if(U_IS_SURROGATE(c)) {
    267         fprintf(stderr,
    268                 "error in gennorm2 phase %d: "
    269                 "illegal round-trip mapping from surrogate code point U+%04lX\n",
    270                 (int)phase, (long)c);
    271         exit(U_INVALID_FORMAT_ERROR);
    272     }
    273     if(!isWellFormed(m)) {
    274         fprintf(stderr,
    275                 "error in gennorm2 phase %d: "
    276                 "illegal round-trip mapping from U+%04lX to malformed string\n",
    277                 (int)phase, (long)c);
    278         exit(U_INVALID_FORMAT_ERROR);
    279     }
    280     int32_t numCP=u_countChar32(m.getBuffer(), m.length());
    281     if(numCP!=2) {
    282         fprintf(stderr,
    283                 "error in gennorm2 phase %d: "
    284                 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
    285                 (int)phase, (long)c, (int)numCP);
    286         exit(U_INVALID_FORMAT_ERROR);
    287     }
    288     Norm *p=checkNormForMapping(createNorm(c), c);
    289     p->mapping=new UnicodeString(m);
    290     p->mappingType=Norm::ROUND_TRIP;
    291     p->mappingCP=U_SENTINEL;
    292 }
    293 
    294 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    295     Norm *p=checkNormForMapping(getNorm(c), c);
    296     if(p!=NULL) {
    297         p->mappingType=Norm::REMOVED;
    298     }
    299 }
    300 
    301 class CompositionBuilder : public Normalizer2DBEnumerator {
    302 public:
    303     CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
    304     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    305         builder.addComposition(start, end, value);
    306         return TRUE;
    307     }
    308 };
    309 
    310 void
    311 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
    312     if(norms[value].mappingType==Norm::ROUND_TRIP) {
    313         if(start!=end) {
    314             fprintf(stderr,
    315                     "gennorm2 error: same round-trip mapping for "
    316                     "more than 1 code point U+%04lX..U+%04lX\n",
    317                     (long)start, (long)end);
    318             exit(U_INVALID_FORMAT_ERROR);
    319         }
    320         if(norms[value].cc!=0) {
    321             fprintf(stderr,
    322                     "gennorm2 error: "
    323                     "U+%04lX has a round-trip mapping and ccc!=0, "
    324                     "not possible in Unicode normalization\n",
    325                     (long)start);
    326             exit(U_INVALID_FORMAT_ERROR);
    327         }
    328         // setRoundTripMapping() ensured that there are exactly two code points.
    329         const UnicodeString &m=*norms[value].mapping;
    330         UChar32 lead=m.char32At(0);
    331         UChar32 trail=m.char32At(m.length()-1);
    332         if(getCC(lead)!=0) {
    333             fprintf(stderr,
    334                     "gennorm2 error: "
    335                     "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
    336                     "not possible in Unicode normalization\n",
    337                     (long)start, (long)lead);
    338             exit(U_INVALID_FORMAT_ERROR);
    339         }
    340         // Flag for trailing character.
    341         createNorm(trail)->combinesBack=TRUE;
    342         // Insert (trail, composite) pair into compositions list for the lead character.
    343         CompositionPair pair(trail, start);
    344         Norm *leadNorm=createNorm(lead);
    345         U_STD_NSQ vector<CompositionPair> *compositions=leadNorm->compositions;
    346         if(compositions==NULL) {
    347             compositions=leadNorm->compositions=new U_STD_NSQ vector<CompositionPair>;
    348             compositions->push_back(pair);
    349         } else {
    350             // Insertion sort, and check for duplicate trail characters.
    351             U_STD_NSQ vector<CompositionPair>::iterator it;
    352             for(it=compositions->begin(); it!=compositions->end(); ++it) {
    353                 if(trail==it->trail) {
    354                     fprintf(stderr,
    355                             "gennorm2 error: same round-trip mapping for "
    356                             "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
    357                             (long)start, (long)lead, (long)trail);
    358                     exit(U_INVALID_FORMAT_ERROR);
    359                 }
    360                 if(trail<it->trail) {
    361                     break;
    362                 }
    363             }
    364             compositions->insert(it, pair);
    365         }
    366     }
    367 }
    368 
    369 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
    370                                                     uint8_t lowCC, uint8_t highCC) const {
    371     const U_STD_NSQ vector<CompositionPair> *compositions=norm.compositions;
    372     if(compositions!=NULL && (highCC-lowCC)>=2) {
    373         U_STD_NSQ vector<CompositionPair>::const_iterator it;
    374         for(it=compositions->begin(); it!=compositions->end(); ++it) {
    375             uint8_t trailCC=getCC(it->trail);
    376             if(lowCC<trailCC && trailCC<highCC) {
    377                 return TRUE;
    378             }
    379         }
    380     }
    381     return FALSE;
    382 }
    383 
    384 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
    385     const U_STD_NSQ vector<CompositionPair> *compositions=norm.compositions;
    386     if(compositions!=NULL) {
    387         U_STD_NSQ vector<CompositionPair>::const_iterator it;
    388         for(it=compositions->begin(); it!=compositions->end(); ++it) {
    389             if(trail==it->trail) {
    390                 return it->composite;
    391             }
    392             if(trail<it->trail) {
    393                 break;
    394             }
    395         }
    396     }
    397     return U_SENTINEL;
    398 }
    399 
    400 class Decomposer : public Normalizer2DBEnumerator {
    401 public:
    402     Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
    403     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    404         didDecompose|=builder.decompose(start, end, value);
    405         return TRUE;
    406     }
    407     UBool didDecompose;
    408 };
    409 
    410 UBool
    411 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    412     if(norms[value].hasMapping()) {
    413         const UnicodeString &m=*norms[value].mapping;
    414         UnicodeString *decomposed=NULL;
    415         const UChar *s=m.getBuffer();
    416         int32_t length=m.length();
    417         int32_t prev, i=0;
    418         UChar32 c;
    419         while(i<length) {
    420             prev=i;
    421             U16_NEXT(s, i, length, c);
    422             if(start<=c && c<=end) {
    423                 fprintf(stderr,
    424                         "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
    425                         (long)c);
    426                 exit(U_INVALID_FORMAT_ERROR);
    427             }
    428             const Norm &cNorm=getNormRef(c);
    429             if(cNorm.hasMapping()) {
    430                 if(norms[value].mappingType==Norm::ROUND_TRIP) {
    431                     if(prev==0) {
    432                         if(cNorm.mappingType!=Norm::ROUND_TRIP) {
    433                             fprintf(stderr,
    434                                     "gennorm2 error: "
    435                                     "U+%04lX's round-trip mapping's starter "
    436                                     "U+%04lX one-way-decomposes, "
    437                                     "not possible in Unicode normalization\n",
    438                                     (long)start, (long)c);
    439                             exit(U_INVALID_FORMAT_ERROR);
    440                         }
    441                         uint8_t myTrailCC=getCC(m.char32At(i));
    442                         UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
    443                         uint8_t cTrailCC=getCC(cTrailChar);
    444                         if(cTrailCC>myTrailCC) {
    445                             fprintf(stderr,
    446                                     "gennorm2 error: "
    447                                     "U+%04lX's round-trip mapping's starter "
    448                                     "U+%04lX decomposes and the "
    449                                     "inner/earlier tccc=%hu > outer/following tccc=%hu, "
    450                                     "not possible in Unicode normalization\n",
    451                                     (long)start, (long)c,
    452                                     (short)cTrailCC, (short)myTrailCC);
    453                             exit(U_INVALID_FORMAT_ERROR);
    454                         }
    455                     } else {
    456                         fprintf(stderr,
    457                                 "gennorm2 error: "
    458                                 "U+%04lX's round-trip mapping's non-starter "
    459                                 "U+%04lX decomposes, "
    460                                 "not possible in Unicode normalization\n",
    461                                 (long)start, (long)c);
    462                         exit(U_INVALID_FORMAT_ERROR);
    463                     }
    464                 }
    465                 if(decomposed==NULL) {
    466                     decomposed=new UnicodeString(m, 0, prev);
    467                 }
    468                 decomposed->append(*cNorm.mapping);
    469             } else if(Hangul::isHangul(c)) {
    470                 UChar buffer[3];
    471                 int32_t hangulLength=Hangul::decompose(c, buffer);
    472                 if(norms[value].mappingType==Norm::ROUND_TRIP && prev!=0) {
    473                     fprintf(stderr,
    474                             "gennorm2 error: "
    475                             "U+%04lX's round-trip mapping's non-starter "
    476                             "U+%04lX decomposes, "
    477                             "not possible in Unicode normalization\n",
    478                             (long)start, (long)c);
    479                     exit(U_INVALID_FORMAT_ERROR);
    480                 }
    481                 if(decomposed==NULL) {
    482                     decomposed=new UnicodeString(m, 0, prev);
    483                 }
    484                 decomposed->append(buffer, hangulLength);
    485             } else if(decomposed!=NULL) {
    486                 decomposed->append(m, prev, i-prev);
    487             }
    488         }
    489         if(decomposed!=NULL) {
    490             delete norms[value].mapping;
    491             norms[value].mapping=decomposed;
    492             // Not  norms[value].setMappingCP();  because the original mapping
    493             // is most likely to be encodable as a delta.
    494             return TRUE;
    495         }
    496     }
    497     return FALSE;
    498 }
    499 
    500 class BuilderReorderingBuffer {
    501 public:
    502     BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
    503     void reset() {
    504         fLength=0;
    505         fLastStarterIndex=-1;
    506         fDidReorder=FALSE;
    507     }
    508     int32_t length() const { return fLength; }
    509     UBool isEmpty() const { return fLength==0; }
    510     int32_t lastStarterIndex() const { return fLastStarterIndex; }
    511     UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
    512     uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
    513     UBool didReorder() const { return fDidReorder; }
    514     void append(UChar32 c, uint8_t cc) {
    515         if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
    516             if(cc==0) {
    517                 fLastStarterIndex=fLength;
    518             }
    519             fArray[fLength++]=(c<<8)|cc;
    520             return;
    521         }
    522         // Let this character bubble back to its canonical order.
    523         int32_t i=fLength-1;
    524         while(i>fLastStarterIndex && ccAt(i)>cc) {
    525             --i;
    526         }
    527         ++i;  // after the last starter or prevCC<=cc
    528         // Move this and the following characters forward one to make space.
    529         for(int32_t j=fLength; i<j; --j) {
    530             fArray[j]=fArray[j-1];
    531         }
    532         fArray[i]=(c<<8)|cc;
    533         ++fLength;
    534         fDidReorder=TRUE;
    535     }
    536     void toString(UnicodeString &dest) {
    537         dest.remove();
    538         for(int32_t i=0; i<fLength; ++i) {
    539             dest.append(charAt(i));
    540         }
    541     }
    542     void setComposite(UChar32 composite, int32_t combMarkIndex) {
    543         fArray[fLastStarterIndex]=composite<<8;
    544         // Remove the combining mark that contributed to the composite.
    545         --fLength;
    546         while(combMarkIndex<fLength) {
    547             fArray[combMarkIndex]=fArray[combMarkIndex+1];
    548             ++combMarkIndex;
    549         }
    550     }
    551 private:
    552     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
    553     int32_t fLength;
    554     int32_t fLastStarterIndex;
    555     UBool fDidReorder;
    556 };
    557 
    558 void
    559 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
    560     UnicodeString &m=*p->mapping;
    561     int32_t length=m.length();
    562     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
    563         return;  // writeMapping() will complain about it and print the code point.
    564     }
    565     const UChar *s=m.getBuffer();
    566     int32_t i=0;
    567     UChar32 c;
    568     while(i<length) {
    569         U16_NEXT(s, i, length, c);
    570         buffer.append(c, getCC(c));
    571     }
    572     if(buffer.didReorder()) {
    573         buffer.toString(m);
    574     }
    575 }
    576 
    577 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
    578     if(buffer.isEmpty()) {
    579         return TRUE;  // maps-to-empty string is no boundary of any kind
    580     }
    581     int32_t lastStarterIndex=buffer.lastStarterIndex();
    582     if(lastStarterIndex<0) {
    583         return TRUE;  // no starter
    584     }
    585     UChar32 starter=buffer.charAt(lastStarterIndex);
    586     if( Hangul::isJamoL(starter) ||
    587         (Hangul::isJamoV(starter) &&
    588          0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
    589     ) {
    590         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
    591         // otherwise it is blocked.
    592         return lastStarterIndex==buffer.length()-1;
    593     }
    594     // no Hangul in fully decomposed mapping
    595     const Norm *starterNorm=&getNormRef(starter);
    596     if(starterNorm->compositions==NULL) {
    597         return FALSE;  // the last starter does not combine forward
    598     }
    599     // Compose as far as possible, and see if further compositions are possible.
    600     uint8_t prevCC=0;
    601     for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
    602         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
    603         if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
    604             return TRUE;
    605         }
    606         if( prevCC<cc &&
    607             (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
    608         ) {
    609             buffer.setComposite(starter, combMarkIndex);
    610             starterNorm=&getNormRef(starter);
    611             if(starterNorm->compositions==NULL) {
    612                 return FALSE;  // the composite does not combine further
    613             }
    614         } else {
    615             prevCC=cc;
    616             ++combMarkIndex;
    617         }
    618     }
    619     // TRUE if the final, forward-combining starter is at the end.
    620     return prevCC==0;
    621 }
    622 
    623 // Requires p->hasMapping().
    624 void Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
    625     UnicodeString &m=*p->mapping;
    626     int32_t length=m.length();
    627     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
    628         fprintf(stderr,
    629                 "gennorm2 error: "
    630                 "mapping for U+%04lX longer than maximum of %d\n",
    631                 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
    632         exit(U_INVALID_FORMAT_ERROR);
    633     }
    634     int32_t leadCC, trailCC;
    635     if(length==0) {
    636         leadCC=trailCC=0;
    637     } else {
    638         leadCC=getCC(m.char32At(0));
    639         trailCC=getCC(m.char32At(length-1));
    640     }
    641     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
    642         fprintf(stderr,
    643                 "gennorm2 error: "
    644                 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
    645                 (long)c);
    646         exit(U_INVALID_FORMAT_ERROR);
    647     }
    648     int32_t firstUnit=length|(trailCC<<8);
    649     int32_t secondUnit=p->cc|(leadCC<<8);
    650     if(secondUnit!=0) {
    651         firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
    652     }
    653     if(p->compositions!=NULL) {
    654         firstUnit|=Normalizer2Impl::MAPPING_PLUS_COMPOSITION_LIST;
    655     }
    656     if(p->hasNoCompBoundaryAfter) {
    657         firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
    658     }
    659     dataString.append((UChar)firstUnit);
    660     if(secondUnit!=0) {
    661         dataString.append((UChar)secondUnit);
    662     }
    663     dataString.append(m);
    664 }
    665 
    666 // Requires p->compositions!=NULL.
    667 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
    668     if(p->cc!=0) {
    669         fprintf(stderr,
    670                 "gennorm2 error: "
    671                 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
    672                 (long)c);
    673         exit(U_INVALID_FORMAT_ERROR);
    674     }
    675     int32_t length=p->compositions->size();
    676     for(int32_t i=0; i<length; ++i) {
    677         CompositionPair &pair=p->compositions->at(i);
    678         // 22 bits for the composite character and whether it combines forward.
    679         UChar32 compositeAndFwd=pair.composite<<1;
    680         if(getNormRef(pair.composite).compositions!=NULL) {
    681             compositeAndFwd|=1;  // The composite character also combines-forward.
    682         }
    683         // Encode most pairs in two units and some in three.
    684         int32_t firstUnit, secondUnit, thirdUnit;
    685         if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
    686             if(compositeAndFwd<=0xffff) {
    687                 firstUnit=pair.trail<<1;
    688                 secondUnit=compositeAndFwd;
    689                 thirdUnit=-1;
    690             } else {
    691                 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
    692                 secondUnit=compositeAndFwd>>16;
    693                 thirdUnit=compositeAndFwd;
    694             }
    695         } else {
    696             firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
    697                        (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
    698                       Normalizer2Impl::COMP_1_TRIPLE;
    699             secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
    700                        (compositeAndFwd>>16);
    701             thirdUnit=compositeAndFwd;
    702         }
    703         // Set the high bit of the first unit if this is the last composition pair.
    704         if(i==(length-1)) {
    705             firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
    706         }
    707         dataString.append((UChar)firstUnit).append((UChar)secondUnit);
    708         if(thirdUnit>=0) {
    709             dataString.append((UChar)thirdUnit);
    710         }
    711     }
    712 }
    713 
    714 class ExtraDataWriter : public Normalizer2DBEnumerator {
    715 public:
    716     ExtraDataWriter(Normalizer2DataBuilder &b) :
    717         Normalizer2DBEnumerator(b),
    718         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
    719         yesNoData(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
    720     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    721         if(value!=0) {
    722             if(start!=end) {
    723                 fprintf(stderr,
    724                         "gennorm2 error: unexpected shared data for "
    725                         "multiple code points U+%04lX..U+%04lX\n",
    726                         (long)start, (long)end);
    727                 exit(U_INTERNAL_PROGRAM_ERROR);
    728             }
    729             builder.writeExtraData(start, value, *this);
    730         }
    731         return TRUE;
    732     }
    733     UnicodeString maybeYesCompositions;
    734     UnicodeString yesYesCompositions;
    735     UnicodeString yesNoData;
    736     UnicodeString noNoMappings;
    737     Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
    738 };
    739 
    740 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
    741     Norm *p=norms+value;
    742     if(p->combinesBack) {
    743         if(p->hasMapping()) {
    744             fprintf(stderr,
    745                     "gennorm2 error: "
    746                     "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
    747                     (long)c);
    748             exit(U_INVALID_FORMAT_ERROR);
    749         }
    750         if(p->compositions!=NULL) {
    751             p->offset=
    752                 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
    753                 Norm::OFFSET_MAYBE_YES;
    754             writeCompositions(c, p, writer.maybeYesCompositions);
    755         }
    756     } else if(!p->hasMapping()) {
    757         if(p->compositions!=NULL) {
    758             p->offset=
    759                 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
    760                 Norm::OFFSET_YES_YES;
    761             writeCompositions(c, p, writer.yesYesCompositions);
    762         }
    763     } else if(p->mappingType==Norm::ROUND_TRIP) {
    764         p->offset=
    765             (writer.yesNoData.length()<<Norm::OFFSET_SHIFT)|
    766             Norm::OFFSET_YES_NO;
    767         writeMapping(c, p, writer.yesNoData);
    768         if(p->compositions!=NULL) {
    769             writeCompositions(c, p, writer.yesNoData);
    770         }
    771     } else /* one-way */ {
    772         if(p->compositions!=NULL) {
    773             fprintf(stderr,
    774                     "gennorm2 error: "
    775                     "U+%04lX combines-forward and has a one-way mapping, "
    776                     "not possible in Unicode normalization\n",
    777                     (long)c);
    778             exit(U_INVALID_FORMAT_ERROR);
    779         }
    780         if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
    781             // Try a compact, algorithmic encoding.
    782             // Only for ccc=0, because we can't store additional information.
    783             if(p->mappingCP>=0) {
    784                 int32_t delta=p->mappingCP-c;
    785                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
    786                     p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
    787                 }
    788             }
    789         }
    790         if(p->offset==0) {
    791             int32_t oldNoNoLength=writer.noNoMappings.length();
    792             writeMapping(c, p, writer.noNoMappings);
    793             UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
    794             int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
    795             if(previousOffset!=0) {
    796                 // Duplicate, remove the new units and point to the old ones.
    797                 writer.noNoMappings.truncate(oldNoNoLength);
    798                 p->offset=
    799                     ((previousOffset-1)<<Norm::OFFSET_SHIFT)|
    800                     Norm::OFFSET_NO_NO;
    801             } else {
    802                 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
    803                 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
    804                 writer.previousNoNoMappings.puti(newMapping, oldNoNoLength+1, errorCode);
    805                 p->offset=
    806                     (oldNoNoLength<<Norm::OFFSET_SHIFT)|
    807                     Norm::OFFSET_NO_NO;
    808             }
    809         }
    810     }
    811 }
    812 
    813 class Norm16Writer : public Normalizer2DBEnumerator {
    814 public:
    815     Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
    816     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    817         builder.writeNorm16(start, end, value);
    818         return TRUE;
    819     }
    820 };
    821 
    822 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
    823     if(value!=0) {
    824         const Norm *p=norms+value;
    825         int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
    826         int32_t norm16=0;
    827         UBool isDecompNo=FALSE;
    828         UBool isCompNoMaybe=FALSE;
    829         switch(p->offset&Norm::OFFSET_MASK) {
    830         case Norm::OFFSET_NONE:
    831             // No mapping, no compositions list.
    832             if(p->combinesBack) {
    833                 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
    834                 isDecompNo=(UBool)(p->cc!=0);
    835                 isCompNoMaybe=TRUE;
    836             } else if(p->cc!=0) {
    837                 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
    838                 isDecompNo=isCompNoMaybe=TRUE;
    839             }
    840             break;
    841         case Norm::OFFSET_MAYBE_YES:
    842             norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
    843             isCompNoMaybe=TRUE;
    844             break;
    845         case Norm::OFFSET_YES_YES:
    846             norm16=offset;
    847             break;
    848         case Norm::OFFSET_YES_NO:
    849             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
    850             isDecompNo=TRUE;
    851             break;
    852         case Norm::OFFSET_NO_NO:
    853             norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
    854             isDecompNo=isCompNoMaybe=TRUE;
    855             break;
    856         case Norm::OFFSET_DELTA:
    857             norm16=getCenterNoNoDelta()+offset;
    858             isDecompNo=isCompNoMaybe=TRUE;
    859             break;
    860         default:  // Should not occur.
    861             exit(U_INTERNAL_PROGRAM_ERROR);
    862         }
    863         IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
    864         utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
    865         if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
    866             indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
    867         }
    868         if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
    869             indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
    870         }
    871     }
    872 }
    873 
    874 void Normalizer2DataBuilder::setHangulData() {
    875     HangulIterator hi;
    876     const HangulIterator::Range *range;
    877     // Check that none of the Hangul/Jamo code points have data.
    878     while((range=hi.nextRange())!=NULL) {
    879         for(UChar32 c=range->start; c<range->limit; ++c) {
    880             if(utrie2_get32(norm16Trie, c)!=0) {
    881                 fprintf(stderr,
    882                         "gennorm2 error: "
    883                         "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
    884                         (long)c);
    885                 exit(U_INVALID_FORMAT_ERROR);
    886             }
    887         }
    888     }
    889     // Set data for algorithmic runtime handling.
    890     IcuToolErrorCode errorCode("gennorm2/setHangulData()");
    891     hi.reset();
    892     while((range=hi.nextRange())!=NULL) {
    893         uint16_t norm16=range->norm16;
    894         if(norm16==0) {
    895             norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
    896             if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
    897                 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
    898             }
    899         } else {
    900             if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
    901                 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
    902             }
    903         }
    904         utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
    905         errorCode.assertSuccess();
    906     }
    907 }
    908 
    909 U_CDECL_BEGIN
    910 
    911 static UBool U_CALLCONV
    912 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
    913     uint32_t *pMaxValue=(uint32_t *)context;
    914     if(value>*pMaxValue) {
    915         *pMaxValue=value;
    916     }
    917     return TRUE;
    918 }
    919 
    920 U_CDECL_END
    921 
    922 void Normalizer2DataBuilder::processData() {
    923     IcuToolErrorCode errorCode("gennorm2/processData()");
    924     norm16Trie=utrie2_open(0, 0, errorCode);
    925     errorCode.assertSuccess();
    926 
    927     utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
    928 
    929     Decomposer decomposer(*this);
    930     do {
    931         decomposer.didDecompose=FALSE;
    932         utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
    933     } while(decomposer.didDecompose);
    934 
    935     BuilderReorderingBuffer buffer;
    936     int32_t normsLength=utm_countItems(normMem);
    937     for(int32_t i=1; i<normsLength; ++i) {
    938         if(norms[i].hasMapping()) {
    939             buffer.reset();
    940             reorder(norms+i, buffer);
    941             norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
    942         }
    943     }
    944 
    945     indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
    946     indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
    947 
    948     ExtraDataWriter extraDataWriter(*this);
    949     utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
    950 
    951     extraData=extraDataWriter.maybeYesCompositions;
    952     extraData.append(extraDataWriter.yesYesCompositions).
    953               append(extraDataWriter.yesNoData).
    954               append(extraDataWriter.noNoMappings);
    955     // Pad to even length for 4-byte alignment of following data.
    956     if(extraData.length()&1) {
    957         extraData.append((UChar)0);
    958     }
    959 
    960     indexes[Normalizer2Impl::IX_MIN_YES_NO]=
    961         extraDataWriter.yesYesCompositions.length();
    962     indexes[Normalizer2Impl::IX_MIN_NO_NO]=
    963         indexes[Normalizer2Impl::IX_MIN_YES_NO]+
    964         extraDataWriter.yesNoData.length();
    965     indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
    966         indexes[Normalizer2Impl::IX_MIN_NO_NO]+
    967         extraDataWriter.noNoMappings.length();
    968     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
    969         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
    970         extraDataWriter.maybeYesCompositions.length();
    971 
    972     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
    973     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
    974         fprintf(stderr,
    975                 "gennorm2 error: "
    976                 "data structure overflow, too much mapping composition data\n");
    977         exit(U_BUFFER_OVERFLOW_ERROR);
    978     }
    979 
    980     utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
    981 
    982     setHangulData();
    983 
    984     // Look for the "worst" norm16 value of any supplementary code point
    985     // corresponding to a lead surrogate, and set it as that surrogate's value.
    986     // Enables quick check inner loops to look at only code units.
    987     //
    988     // We could be more sophisticated:
    989     // We could collect a bit set for whether there are values in the different
    990     // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
    991     // and select the best value that only breaks the composition and/or decomposition
    992     // inner loops if necessary.
    993     // However, that seems like overkill for an optimization for supplementary characters.
    994     for(UChar lead=0xd800; lead<0xdc00; ++lead) {
    995         uint32_t maxValue=utrie2_get32(norm16Trie, lead);
    996         utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
    997         if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
    998             maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
    999         ) {
   1000             // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
   1001             // Otherwise it might end up at something like JAMO_VT which stays in
   1002             // the inner decomposition quick check loop.
   1003             maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
   1004         }
   1005         utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
   1006     }
   1007 
   1008     // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
   1009     // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
   1010     // which is harmless.
   1011     // As a result, the minimum code points are always BMP code points.
   1012     int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
   1013     if(minCP>=0x10000) {
   1014         indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
   1015     }
   1016     minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
   1017     if(minCP>=0x10000) {
   1018         indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
   1019     }
   1020 }
   1021 
   1022 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
   1023     processData();
   1024 
   1025     IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
   1026     utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
   1027     int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
   1028     if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
   1029         fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
   1030                 errorCode.errorName());
   1031         exit(errorCode.reset());
   1032     }
   1033     errorCode.reset();
   1034     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
   1035     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
   1036     errorCode.assertSuccess();
   1037 
   1038     int32_t offset=(int32_t)sizeof(indexes);
   1039     indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
   1040     offset+=norm16TrieLength;
   1041     indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
   1042     int32_t totalSize=offset+=extraData.length()*2;
   1043     for(int32_t i=Normalizer2Impl::IX_RESERVED2_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
   1044         indexes[i]=totalSize;
   1045     }
   1046 
   1047     if(beVerbose) {
   1048         printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
   1049         printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
   1050         printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
   1051         printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
   1052         printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
   1053         printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
   1054         printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
   1055         printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
   1056         printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
   1057     }
   1058 
   1059     memcpy(dataInfo.dataVersion, unicodeVersion, 4);
   1060     UNewDataMemory *pData=
   1061         udata_create(NULL, NULL, filename, &dataInfo,
   1062                      haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
   1063     if(errorCode.isFailure()) {
   1064         fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
   1065                 filename, errorCode.errorName());
   1066         exit(errorCode.reset());
   1067     }
   1068     udata_writeBlock(pData, indexes, sizeof(indexes));
   1069     udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
   1070     udata_writeUString(pData, extraData.getBuffer(), extraData.length());
   1071 
   1072     int32_t writtenSize=udata_finish(pData, errorCode);
   1073     if(errorCode.isFailure()) {
   1074         fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
   1075         exit(errorCode.reset());
   1076     }
   1077     if(writtenSize!=totalSize) {
   1078         fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
   1079             (long)writtenSize, (long)totalSize);
   1080         exit(U_INTERNAL_PROGRAM_ERROR);
   1081     }
   1082 }
   1083 
   1084 U_NAMESPACE_END
   1085 
   1086 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   1087 
   1088 /*
   1089  * Hey, Emacs, please set the following:
   1090  *
   1091  * Local Variables:
   1092  * indent-tabs-mode: nil
   1093  * End:
   1094  */
   1095