Home | History | Annotate | Download | only in gennorm2
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  n2builder.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov25
     14 *   created by: Markus W. Scherer
     15 *
     16 * Builds Normalizer2 data and writes a binary .nrm file.
     17 * For the file format see source/common/normalizer2impl.h.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "n2builder.h"
     22 
     23 #include <stdio.h>
     24 #include <stdlib.h>
     25 #include <string.h>
     26 #if U_HAVE_STD_STRING
     27 #include <vector>
     28 #endif
     29 #include "unicode/errorcode.h"
     30 #include "unicode/localpointer.h"
     31 #include "unicode/putil.h"
     32 #include "unicode/udata.h"
     33 #include "unicode/uniset.h"
     34 #include "unicode/unistr.h"
     35 #include "unicode/ustring.h"
     36 #include "hash.h"
     37 #include "normalizer2impl.h"
     38 #include "toolutil.h"
     39 #include "unewdata.h"
     40 #include "utrie2.h"
     41 #include "uvectr32.h"
     42 
     43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     44 
     45 #if !UCONFIG_NO_NORMALIZATION
     46 
/*
 * UDataInfo cf. udata.h
 *
 * Describes the data format "Nrm2"; presumably written as the header of the
 * binary .nrm file that this tool produces (the use is not visible here).
 * The notes on the unnamed fields below follow the usual UDataInfo layout
 * in udata.h -- confirm against that header.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),  /* size of this structure */
    0,                  /* presumably a reserved field */

    U_IS_BIG_ENDIAN,    /* endianness of the platform that builds the data */
    U_CHARSET_FAMILY,   /* charset family of the platform that builds the data */
    U_SIZEOF_UCHAR,     /* size of a UChar */
    0,                  /* presumably a reserved field */

    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
    { 2, 0, 0, 0 },             /* formatVersion */
    { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
};
     61 
     62 U_NAMESPACE_BEGIN
     63 
     64 class HangulIterator {
     65 public:
     66     struct Range {
     67         UChar32 start, limit;
     68         uint16_t norm16;
     69     };
     70 
     71     HangulIterator() : rangeIndex(0) {}
     72     const Range *nextRange() {
     73         if(rangeIndex<LENGTHOF(ranges)) {
     74             return ranges+rangeIndex++;
     75         } else {
     76             return NULL;
     77         }
     78     }
     79     void reset() { rangeIndex=0; }
     80 private:
     81     static const Range ranges[4];
     82     int32_t rangeIndex;
     83 };
     84 
// The table that HangulIterator::nextRange() walks through.
// Each entry is { start, limit, norm16 }, in this order:
// Jamo L, Jamo V, Jamo T (starting one after JAMO_T_BASE), Hangul syllables.
const HangulIterator::Range HangulIterator::ranges[4]={
    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
    // JAMO_T_BASE+1: not U+11A7
    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
    { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
};
     92 
     93 struct CompositionPair {
     94     CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
     95     UChar32 trail, composite;
     96 };
     97 
     98 struct Norm {
     99     enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
    100 
    101     UBool hasMapping() const { return mappingType>REMOVED; }
    102 
    103     // Requires hasMapping() and well-formed mapping.
    104     void setMappingCP() {
    105         UChar32 c;
    106         if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
    107             mappingCP=c;
    108         } else {
    109             mappingCP=U_SENTINEL;
    110         }
    111     }
    112 
    113     const CompositionPair *getCompositionPairs(int32_t &length) const {
    114         if(compositions==NULL) {
    115             length=0;
    116             return NULL;
    117         } else {
    118             length=compositions->size()/2;
    119             return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
    120         }
    121     }
    122 
    123     UnicodeString *mapping;
    124     UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
    125     UChar32 mappingCP;  // >=0 if mapping to 1 code point
    126     int32_t mappingPhase;
    127     MappingType mappingType;
    128 
    129     UVector32 *compositions;  // (trail, composite) pairs
    130     uint8_t cc;
    131     UBool combinesBack;
    132     UBool hasNoCompBoundaryAfter;
    133 
    134     enum OffsetType {
    135         OFFSET_NONE,
    136         // Composition for back-combining character. Allowed, but not normally used.
    137         OFFSET_MAYBE_YES,
    138         // Composition for a starter that does not have a decomposition mapping.
    139         OFFSET_YES_YES,
    140         // Round-trip mapping & composition for a starter.
    141         OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
    142         // Round-trip mapping for a starter that itself does not combine-forward.
    143         OFFSET_YES_NO_MAPPING_ONLY,
    144         // One-way mapping.
    145         OFFSET_NO_NO,
    146         // Delta for an algorithmic one-way mapping.
    147         OFFSET_DELTA
    148     };
    149     enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
    150     int32_t offset;
    151 };
    152 
// Abstract base class for objects that receive ranges of code points
// together with a value and act on a Normalizer2DataBuilder.
// enumRangeHandler() casts its context pointer to this type
// and forwards each range to rangeHandler().
class Normalizer2DBEnumerator {
public:
    Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
    virtual ~Normalizer2DBEnumerator() {}
    // Handles the code points start..end which share the same value.
    // CompositionBuilder and Decomposer always return TRUE.
    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
    // Returns the address of this object;
    // presumably for passing it as the context of an enumeration -- TODO confirm at call sites.
    Normalizer2DBEnumerator *ptr() { return this; }
protected:
    // The builder that the subclasses operate on.
    Normalizer2DataBuilder &builder;
};
    162 
    163 U_CDECL_BEGIN
    164 
    165 static UBool U_CALLCONV
    166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    167     return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
    168 }
    169 
    170 U_CDECL_END
    171 
    172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
    173         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) {
    174     memset(unicodeVersion, 0, sizeof(unicodeVersion));
    175     normTrie=utrie2_open(0, 0, &errorCode);
    176     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
    177     norms=allocNorm();  // unused Norm struct at index 0
    178     memset(indexes, 0, sizeof(indexes));
    179     memset(smallFCD, 0, sizeof(smallFCD));
    180 }
    181 
    182 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
    183     utrie2_close(normTrie);
    184     int32_t normsLength=utm_countItems(normMem);
    185     for(int32_t i=1; i<normsLength; ++i) {
    186         delete norms[i].mapping;
    187         delete norms[i].rawMapping;
    188         delete norms[i].compositions;
    189     }
    190     utm_close(normMem);
    191     utrie2_close(norm16Trie);
    192 }
    193 
    194 void
    195 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
    196     UVersionInfo nullVersion={ 0, 0, 0, 0 };
    197     UVersionInfo version;
    198     u_versionFromString(version, v);
    199     if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
    200         0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
    201     ) {
    202         char buffer[U_MAX_VERSION_STRING_LENGTH];
    203         u_versionToString(unicodeVersion, buffer);
    204         fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
    205                 buffer, v);
    206         exit(U_ILLEGAL_ARGUMENT_ERROR);
    207     }
    208     memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
    209 }
    210 
    211 Norm *Normalizer2DataBuilder::allocNorm() {
    212     Norm *p=(Norm *)utm_alloc(normMem);
    213     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
    214     return p;
    215 }
    216 
    217 /* get an existing Norm unit */
    218 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
    219     uint32_t i=utrie2_get32(normTrie, c);
    220     if(i==0) {
    221         return NULL;
    222     }
    223     return norms+i;
    224 }
    225 
    226 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
    227     return norms[utrie2_get32(normTrie, c)];
    228 }
    229 
    230 /*
    231  * get or create a Norm unit;
    232  * get or create the intermediate trie entries for it as well
    233  */
    234 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
    235     uint32_t i=utrie2_get32(normTrie, c);
    236     if(i!=0) {
    237         return norms+i;
    238     } else {
    239         /* allocate Norm */
    240         Norm *p=allocNorm();
    241         IcuToolErrorCode errorCode("gennorm2/createNorm()");
    242         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
    243         return p;
    244     }
    245 }
    246 
    247 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
    248     if(p!=NULL) {
    249         if(p->mappingType!=Norm::NONE) {
    250             if( overrideHandling==OVERRIDE_NONE ||
    251                 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
    252             ) {
    253                 fprintf(stderr,
    254                         "error in gennorm2 phase %d: "
    255                         "not permitted to override mapping for U+%04lX from phase %d\n",
    256                         (int)phase, (long)c, (int)p->mappingPhase);
    257                 exit(U_INVALID_FORMAT_ERROR);
    258             }
    259             delete p->mapping;
    260             p->mapping=NULL;
    261         }
    262         p->mappingPhase=phase;
    263     }
    264     return p;
    265 }
    266 
    267 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
    268     overrideHandling=oh;
    269     ++phase;
    270 }
    271 
    272 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
    273     createNorm(c)->cc=cc;
    274 }
    275 
    276 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
    277     return getNormRef(c).cc;
    278 }
    279 
    280 static UBool isWellFormed(const UnicodeString &s) {
    281     UErrorCode errorCode=U_ZERO_ERROR;
    282     u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
    283     return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
    284 }
    285 
    286 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
    287     if(!isWellFormed(m)) {
    288         fprintf(stderr,
    289                 "error in gennorm2 phase %d: "
    290                 "illegal one-way mapping from U+%04lX to malformed string\n",
    291                 (int)phase, (long)c);
    292         exit(U_INVALID_FORMAT_ERROR);
    293     }
    294     Norm *p=checkNormForMapping(createNorm(c), c);
    295     p->mapping=new UnicodeString(m);
    296     p->mappingType=Norm::ONE_WAY;
    297     p->setMappingCP();
    298 }
    299 
    300 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
    301     if(U_IS_SURROGATE(c)) {
    302         fprintf(stderr,
    303                 "error in gennorm2 phase %d: "
    304                 "illegal round-trip mapping from surrogate code point U+%04lX\n",
    305                 (int)phase, (long)c);
    306         exit(U_INVALID_FORMAT_ERROR);
    307     }
    308     if(!isWellFormed(m)) {
    309         fprintf(stderr,
    310                 "error in gennorm2 phase %d: "
    311                 "illegal round-trip mapping from U+%04lX to malformed string\n",
    312                 (int)phase, (long)c);
    313         exit(U_INVALID_FORMAT_ERROR);
    314     }
    315     int32_t numCP=u_countChar32(m.getBuffer(), m.length());
    316     if(numCP!=2) {
    317         fprintf(stderr,
    318                 "error in gennorm2 phase %d: "
    319                 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
    320                 (int)phase, (long)c, (int)numCP);
    321         exit(U_INVALID_FORMAT_ERROR);
    322     }
    323     Norm *p=checkNormForMapping(createNorm(c), c);
    324     p->mapping=new UnicodeString(m);
    325     p->mappingType=Norm::ROUND_TRIP;
    326     p->mappingCP=U_SENTINEL;
    327 }
    328 
    329 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    330     Norm *p=checkNormForMapping(getNorm(c), c);
    331     if(p!=NULL) {
    332         p->mappingType=Norm::REMOVED;
    333     }
    334 }
    335 
    336 class CompositionBuilder : public Normalizer2DBEnumerator {
    337 public:
    338     CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
    339     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    340         builder.addComposition(start, end, value);
    341         return TRUE;
    342     }
    343 };
    344 
    345 void
    346 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
    347     if(norms[value].mappingType==Norm::ROUND_TRIP) {
    348         if(start!=end) {
    349             fprintf(stderr,
    350                     "gennorm2 error: same round-trip mapping for "
    351                     "more than 1 code point U+%04lX..U+%04lX\n",
    352                     (long)start, (long)end);
    353             exit(U_INVALID_FORMAT_ERROR);
    354         }
    355         if(norms[value].cc!=0) {
    356             fprintf(stderr,
    357                     "gennorm2 error: "
    358                     "U+%04lX has a round-trip mapping and ccc!=0, "
    359                     "not possible in Unicode normalization\n",
    360                     (long)start);
    361             exit(U_INVALID_FORMAT_ERROR);
    362         }
    363         // setRoundTripMapping() ensured that there are exactly two code points.
    364         const UnicodeString &m=*norms[value].mapping;
    365         UChar32 lead=m.char32At(0);
    366         UChar32 trail=m.char32At(m.length()-1);
    367         if(getCC(lead)!=0) {
    368             fprintf(stderr,
    369                     "gennorm2 error: "
    370                     "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
    371                     "not possible in Unicode normalization\n",
    372                     (long)start, (long)lead);
    373             exit(U_INVALID_FORMAT_ERROR);
    374         }
    375         // Flag for trailing character.
    376         createNorm(trail)->combinesBack=TRUE;
    377         // Insert (trail, composite) pair into compositions list for the lead character.
    378         IcuToolErrorCode errorCode("gennorm2/addComposition()");
    379         Norm *leadNorm=createNorm(lead);
    380         UVector32 *compositions=leadNorm->compositions;
    381         int32_t i;
    382         if(compositions==NULL) {
    383             compositions=leadNorm->compositions=new UVector32(errorCode);
    384             i=0;  // "insert" the first pair at index 0
    385         } else {
    386             // Insertion sort, and check for duplicate trail characters.
    387             int32_t length;
    388             const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
    389             for(i=0; i<length; ++i) {
    390                 if(trail==pairs[i].trail) {
    391                     fprintf(stderr,
    392                             "gennorm2 error: same round-trip mapping for "
    393                             "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
    394                             (long)start, (long)lead, (long)trail);
    395                     exit(U_INVALID_FORMAT_ERROR);
    396                 }
    397                 if(trail<pairs[i].trail) {
    398                     break;
    399                 }
    400             }
    401         }
    402         compositions->insertElementAt(trail, 2*i, errorCode);
    403         compositions->insertElementAt(start, 2*i+1, errorCode);
    404     }
    405 }
    406 
    407 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
    408                                                     uint8_t lowCC, uint8_t highCC) const {
    409     if((highCC-lowCC)>=2) {
    410         int32_t length;
    411         const CompositionPair *pairs=norm.getCompositionPairs(length);
    412         for(int32_t i=0; i<length; ++i) {
    413             uint8_t trailCC=getCC(pairs[i].trail);
    414             if(lowCC<trailCC && trailCC<highCC) {
    415                 return TRUE;
    416             }
    417         }
    418     }
    419     return FALSE;
    420 }
    421 
    422 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
    423     int32_t length;
    424     const CompositionPair *pairs=norm.getCompositionPairs(length);
    425     for(int32_t i=0; i<length; ++i) {
    426         if(trail==pairs[i].trail) {
    427             return pairs[i].composite;
    428         }
    429         if(trail<pairs[i].trail) {
    430             break;
    431         }
    432     }
    433     return U_SENTINEL;
    434 }
    435 
    436 class Decomposer : public Normalizer2DBEnumerator {
    437 public:
    438     Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
    439     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    440         didDecompose|=builder.decompose(start, end, value);
    441         return TRUE;
    442     }
    443     UBool didDecompose;
    444 };
    445 
// Performs one decomposition step for the mapping of norms[value],
// which applies to the code points start..end:
// Each code point in the mapping that itself has a mapping is replaced by
// that mapping, and each Hangul syllable by the result of Hangul::decompose().
// If anything was replaced, then the new string becomes norm.mapping,
// the first original mapping is kept in norm.rawMapping, and TRUE is returned.
// Returns FALSE if norms[value] has no mapping or nothing was replaced.
// Exits with an error if the mapping contains a code point from start..end,
// or if a round-trip mapping violates the constraints checked below.
UBool
Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    if(norms[value].hasMapping()) {
        Norm &norm=norms[value];
        const UnicodeString &m=*norm.mapping;
        // Allocated lazily, when the first replaceable code point is found.
        UnicodeString *decomposed=NULL;
        const UChar *s=m.getBuffer();
        int32_t length=m.length();
        // prev..i are the code unit indexes around the current code point c.
        int32_t prev, i=0;
        UChar32 c;
        while(i<length) {
            prev=i;
            U16_NEXT(s, i, length, c);
            if(start<=c && c<=end) {
                fprintf(stderr,
                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                        (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
            const Norm &cNorm=getNormRef(c);
            if(cNorm.hasMapping()) {
                if(norm.mappingType==Norm::ROUND_TRIP) {
                    if(prev==0) {
                        // c is the first code point of the round-trip mapping.
                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX one-way-decomposes, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                        // i is now the index of the code point that follows c in this mapping.
                        uint8_t myTrailCC=getCC(m.char32At(i));
                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
                        uint8_t cTrailCC=getCC(cTrailChar);
                        if(cTrailCC>myTrailCC) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX decomposes and the "
                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c,
                                    (short)cTrailCC, (short)myTrailCC);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                    } else {
                        // c is not the first code point of the round-trip mapping.
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's non-starter "
                                "U+%04lX decomposes, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                }
                if(decomposed==NULL) {
                    // Start the new string with the unchanged part of the mapping before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(*cNorm.mapping);
            } else if(Hangul::isHangul(c)) {
                UChar buffer[3];
                int32_t hangulLength=Hangul::decompose(c, buffer);
                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)c);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                if(decomposed==NULL) {
                    // Start the new string with the unchanged part of the mapping before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(buffer, hangulLength);
            } else if(decomposed!=NULL) {
                // c stays as is; copy it once a new string has been started.
                decomposed->append(m, prev, i-prev);
            }
        }
        if(decomposed!=NULL) {
            if(norm.rawMapping==NULL) {
                // Remember the original mapping when decomposing recursively.
                norm.rawMapping=norm.mapping;
            } else {
                // An intermediate mapping from an earlier step is not needed any more.
                delete norm.mapping;
            }
            norm.mapping=decomposed;
            // Not  norm.setMappingCP();  because the original mapping
            // is most likely to be encodable as a delta.
            return TRUE;
        }
    }
    return FALSE;
}
    541 
    542 class BuilderReorderingBuffer {
    543 public:
    544     BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
    545     void reset() {
    546         fLength=0;
    547         fLastStarterIndex=-1;
    548         fDidReorder=FALSE;
    549     }
    550     int32_t length() const { return fLength; }
    551     UBool isEmpty() const { return fLength==0; }
    552     int32_t lastStarterIndex() const { return fLastStarterIndex; }
    553     UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
    554     uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
    555     UBool didReorder() const { return fDidReorder; }
    556     void append(UChar32 c, uint8_t cc) {
    557         if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
    558             if(cc==0) {
    559                 fLastStarterIndex=fLength;
    560             }
    561             fArray[fLength++]=(c<<8)|cc;
    562             return;
    563         }
    564         // Let this character bubble back to its canonical order.
    565         int32_t i=fLength-1;
    566         while(i>fLastStarterIndex && ccAt(i)>cc) {
    567             --i;
    568         }
    569         ++i;  // after the last starter or prevCC<=cc
    570         // Move this and the following characters forward one to make space.
    571         for(int32_t j=fLength; i<j; --j) {
    572             fArray[j]=fArray[j-1];
    573         }
    574         fArray[i]=(c<<8)|cc;
    575         ++fLength;
    576         fDidReorder=TRUE;
    577     }
    578     void toString(UnicodeString &dest) {
    579         dest.remove();
    580         for(int32_t i=0; i<fLength; ++i) {
    581             dest.append(charAt(i));
    582         }
    583     }
    584     void setComposite(UChar32 composite, int32_t combMarkIndex) {
    585         fArray[fLastStarterIndex]=composite<<8;
    586         // Remove the combining mark that contributed to the composite.
    587         --fLength;
    588         while(combMarkIndex<fLength) {
    589             fArray[combMarkIndex]=fArray[combMarkIndex+1];
    590             ++combMarkIndex;
    591         }
    592     }
    593 private:
    594     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
    595     int32_t fLength;
    596     int32_t fLastStarterIndex;
    597     UBool fDidReorder;
    598 };
    599 
    600 void
    601 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
    602     UnicodeString &m=*p->mapping;
    603     int32_t length=m.length();
    604     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
    605         return;  // writeMapping() will complain about it and print the code point.
    606     }
    607     const UChar *s=m.getBuffer();
    608     int32_t i=0;
    609     UChar32 c;
    610     while(i<length) {
    611         U16_NEXT(s, i, length, c);
    612         buffer.append(c, getCC(c));
    613     }
    614     if(buffer.didReorder()) {
    615         buffer.toString(m);
    616     }
    617 }
    618 
    619 /*
    620  * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
    621  * A starter character with a mapping does not have a composition boundary after it
    622  * if the character itself combines-forward (which is tested by the caller of this function),
    623  * or it is deleted (mapped to the empty string),
    624  * or its mapping contains no starter,
    625  * or the last starter combines-forward.
    626  */
    627 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
    628     if(buffer.isEmpty()) {
    629         return TRUE;  // maps-to-empty-string is no boundary of any kind
    630     }
    631     int32_t lastStarterIndex=buffer.lastStarterIndex();
    632     if(lastStarterIndex<0) {
    633         return TRUE;  // no starter
    634     }
    635     UChar32 starter=buffer.charAt(lastStarterIndex);
    636     if( Hangul::isJamoL(starter) ||
    637         (Hangul::isJamoV(starter) &&
    638          0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
    639     ) {
    640         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
    641         // otherwise it is blocked.
    642         return lastStarterIndex==buffer.length()-1;
    643     }
    644     // Note: There can be no Hangul syllable in the fully decomposed mapping.
    645     const Norm *starterNorm=&getNormRef(starter);
    646     if(starterNorm->compositions==NULL) {
    647         return FALSE;  // the last starter does not combine forward
    648     }
    649     // Compose as far as possible, and see if further compositions are possible.
    650     uint8_t prevCC=0;
    651     for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
    652         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
    653         if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
    654             return TRUE;
    655         }
    656         if( prevCC<cc &&
    657             (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
    658         ) {
    659             buffer.setComposite(starter, combMarkIndex);
    660             starterNorm=&getNormRef(starter);
    661             if(starterNorm->compositions==NULL) {
    662                 return FALSE;  // the composite does not combine further
    663             }
    664         } else {
    665             prevCC=cc;
    666             ++combMarkIndex;
    667         }
    668     }
    669     // TRUE if the final, forward-combining starter is at the end.
    670     return prevCC==0;
    671 }
    672 
// Requires p->hasMapping().
// Returns the offset of the "first unit" from the beginning of the extraData for c.
// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
//
// Appends to dataString, in this order: the optional raw mapping data,
// the optional ccc/lccc word, the first unit, and the mapping itself.
// Also sets the smallFCD bit for c if the mapping's first or last code point has cc!=0.
// Exits with an error if the mapping or the raw mapping is too long,
// or if c is below MIN_CCC_LCCC_CP and has ccc!=0 or lccc!=0.
int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
    UnicodeString &m=*p->mapping;
    int32_t length=m.length();
    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
        fprintf(stderr,
                "gennorm2 error: "
                "mapping for U+%04lX longer than maximum of %d\n",
                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // cc values of the first and last code points of the mapping; 0 for an empty mapping.
    int32_t leadCC, trailCC;
    if(length==0) {
        leadCC=trailCC=0;
    } else {
        leadCC=getCC(m.char32At(0));
        trailCC=getCC(m.char32At(length-1));
    }
    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Write small-FCD data.
    if((leadCC|trailCC)!=0) {
        // One bit per 32 code units: byte index lead>>8, bit index (lead>>5)&7.
        // A supplementary code point is represented by its lead surrogate.
        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
    }
    // Write the mapping & raw mapping extraData.
    int32_t firstUnit=length|(trailCC<<8);
    int32_t preMappingLength=0;
    if(p->rawMapping!=NULL) {
        UnicodeString &rm=*p->rawMapping;
        int32_t rmLength=rm.length();
        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "raw mapping for U+%04lX longer than maximum of %d\n",
                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
            exit(U_INVALID_FORMAT_ERROR);
        }
        UChar rm0=rm.charAt(0);
        if( rmLength==length-1 &&
            // 99: overlong substring lengths get pinned to remainder lengths anyway
            0==rm.compare(1, 99, m, 2, 99) &&
            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
        ) {
            // Compression:
            // rawMapping=rm0+mapping.substring(2) -> store only rm0
            //
            // The raw mapping is the same as the final mapping after replacing
            // the final mapping's first two code units with the raw mapping's first one.
            // In this case, we store only that first unit, rm0.
            // This helps with a few hundred mappings.
            dataString.append(rm0);
            preMappingLength=1;
        } else {
            // Store the raw mapping with its length.
            dataString.append(rm);
            dataString.append((UChar)rmLength);
            preMappingLength=rmLength+1;
        }
        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
    }
    // One word with the cc of c in the low byte and the mapping's lead cc in the high byte;
    // written only if it is not zero.
    int32_t cccLccc=p->cc|(leadCC<<8);
    if(cccLccc!=0) {
        dataString.append((UChar)cccLccc);
        ++preMappingLength;
        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
    }
    if(p->hasNoCompBoundaryAfter) {
        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
    }
    dataString.append((UChar)firstUnit);
    dataString.append(m);
    return preMappingLength;
}
    754 
    755 // Requires p->compositions!=NULL.
    756 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
    757     if(p->cc!=0) {
    758         fprintf(stderr,
    759                 "gennorm2 error: "
    760                 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
    761                 (long)c);
    762         exit(U_INVALID_FORMAT_ERROR);
    763     }
    764     int32_t length;
    765     const CompositionPair *pairs=p->getCompositionPairs(length);
    766     for(int32_t i=0; i<length; ++i) {
    767         const CompositionPair &pair=pairs[i];
    768         // 22 bits for the composite character and whether it combines forward.
    769         UChar32 compositeAndFwd=pair.composite<<1;
    770         if(getNormRef(pair.composite).compositions!=NULL) {
    771             compositeAndFwd|=1;  // The composite character also combines-forward.
    772         }
    773         // Encode most pairs in two units and some in three.
    774         int32_t firstUnit, secondUnit, thirdUnit;
    775         if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
    776             if(compositeAndFwd<=0xffff) {
    777                 firstUnit=pair.trail<<1;
    778                 secondUnit=compositeAndFwd;
    779                 thirdUnit=-1;
    780             } else {
    781                 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
    782                 secondUnit=compositeAndFwd>>16;
    783                 thirdUnit=compositeAndFwd;
    784             }
    785         } else {
    786             firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
    787                        (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
    788                       Normalizer2Impl::COMP_1_TRIPLE;
    789             secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
    790                        (compositeAndFwd>>16);
    791             thirdUnit=compositeAndFwd;
    792         }
    793         // Set the high bit of the first unit if this is the last composition pair.
    794         if(i==(length-1)) {
    795             firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
    796         }
    797         dataString.append((UChar)firstUnit).append((UChar)secondUnit);
    798         if(thirdUnit>=0) {
    799             dataString.append((UChar)thirdUnit);
    800         }
    801     }
    802 }
    803 
    804 class ExtraDataWriter : public Normalizer2DBEnumerator {
    805 public:
    806     ExtraDataWriter(Normalizer2DataBuilder &b) :
    807         Normalizer2DBEnumerator(b),
    808         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
    809         yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
    810     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    811         if(value!=0) {
    812             if(start!=end) {
    813                 fprintf(stderr,
    814                         "gennorm2 error: unexpected shared data for "
    815                         "multiple code points U+%04lX..U+%04lX\n",
    816                         (long)start, (long)end);
    817                 exit(U_INTERNAL_PROGRAM_ERROR);
    818             }
    819             builder.writeExtraData(start, value, *this);
    820         }
    821         return TRUE;
    822     }
    823     UnicodeString maybeYesCompositions;
    824     UnicodeString yesYesCompositions;
    825     UnicodeString yesNoMappingsAndCompositions;
    826     UnicodeString yesNoMappingsOnly;
    827     UnicodeString noNoMappings;
    828     Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
    829 };
    830 
    831 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
    832     Norm *p=norms+value;
    833     if(!p->hasMapping()) {
    834         // Write small-FCD data.
    835         // There is similar code in writeMapping() for characters that do have a mapping.
    836         if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
    837             fprintf(stderr,
    838                     "gennorm2 error: "
    839                     "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
    840                     (long)c);
    841             exit(U_INVALID_FORMAT_ERROR);
    842         }
    843         if(p->cc!=0) {
    844             UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
    845             smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
    846         }
    847     }
    848     if(p->combinesBack) {
    849         if(p->hasMapping()) {
    850             fprintf(stderr,
    851                     "gennorm2 error: "
    852                     "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
    853                     (long)c);
    854             exit(U_INVALID_FORMAT_ERROR);
    855         }
    856         if(p->compositions!=NULL) {
    857             p->offset=
    858                 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
    859                 Norm::OFFSET_MAYBE_YES;
    860             writeCompositions(c, p, writer.maybeYesCompositions);
    861         }
    862     } else if(!p->hasMapping()) {
    863         if(p->compositions!=NULL) {
    864             p->offset=
    865                 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
    866                 Norm::OFFSET_YES_YES;
    867             writeCompositions(c, p, writer.yesYesCompositions);
    868         }
    869     } else if(p->mappingType==Norm::ROUND_TRIP) {
    870         if(p->compositions!=NULL) {
    871             int32_t offset=writer.yesNoMappingsAndCompositions.length()+
    872                            writeMapping(c, p, writer.yesNoMappingsAndCompositions);
    873             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
    874             writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
    875         } else {
    876             int32_t offset=writer.yesNoMappingsOnly.length()+
    877                            writeMapping(c, p, writer.yesNoMappingsOnly);
    878             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
    879         }
    880     } else /* one-way */ {
    881         if(p->compositions!=NULL) {
    882             fprintf(stderr,
    883                     "gennorm2 error: "
    884                     "U+%04lX combines-forward and has a one-way mapping, "
    885                     "not possible in Unicode normalization\n",
    886                     (long)c);
    887             exit(U_INVALID_FORMAT_ERROR);
    888         }
    889         if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
    890             // Try a compact, algorithmic encoding.
    891             // Only for ccc=0, because we can't store additional information
    892             // and we do not recursively follow an algorithmic encoding for access to the ccc.
    893             //
    894             // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
    895             // if the mappingCP decomposes further, to ensure that there is a place to store it.
    896             // We want to see that the final mapping does not have exactly 1 code point,
    897             // or else we would have to recursively ensure that the final mapping is stored
    898             // in normal extraData.
    899             if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
    900                 int32_t delta=p->mappingCP-c;
    901                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
    902                     p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
    903                 }
    904             }
    905         }
    906         if(p->offset==0) {
    907             int32_t oldNoNoLength=writer.noNoMappings.length();
    908             int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
    909             UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
    910             int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
    911             if(previousOffset!=0) {
    912                 // Duplicate, remove the new units and point to the old ones.
    913                 writer.noNoMappings.truncate(oldNoNoLength);
    914                 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
    915             } else {
    916                 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
    917                 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
    918                 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
    919                 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
    920             }
    921         }
    922     }
    923 }
    924 
    925 class Norm16Writer : public Normalizer2DBEnumerator {
    926 public:
    927     Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
    928     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    929         builder.writeNorm16(start, end, value);
    930         return TRUE;
    931     }
    932 };
    933 
    934 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
    935     if(value!=0) {
    936         const Norm *p=norms+value;
    937         int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
    938         int32_t norm16=0;
    939         UBool isDecompNo=FALSE;
    940         UBool isCompNoMaybe=FALSE;
    941         switch(p->offset&Norm::OFFSET_MASK) {
    942         case Norm::OFFSET_NONE:
    943             // No mapping, no compositions list.
    944             if(p->combinesBack) {
    945                 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
    946                 isDecompNo=(UBool)(p->cc!=0);
    947                 isCompNoMaybe=TRUE;
    948             } else if(p->cc!=0) {
    949                 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
    950                 isDecompNo=isCompNoMaybe=TRUE;
    951             }
    952             break;
    953         case Norm::OFFSET_MAYBE_YES:
    954             norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
    955             isCompNoMaybe=TRUE;
    956             break;
    957         case Norm::OFFSET_YES_YES:
    958             norm16=offset;
    959             break;
    960         case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
    961             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
    962             isDecompNo=TRUE;
    963             break;
    964         case Norm::OFFSET_YES_NO_MAPPING_ONLY:
    965             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
    966             isDecompNo=TRUE;
    967             break;
    968         case Norm::OFFSET_NO_NO:
    969             norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
    970             isDecompNo=isCompNoMaybe=TRUE;
    971             break;
    972         case Norm::OFFSET_DELTA:
    973             norm16=getCenterNoNoDelta()+offset;
    974             isDecompNo=isCompNoMaybe=TRUE;
    975             break;
    976         default:  // Should not occur.
    977             exit(U_INTERNAL_PROGRAM_ERROR);
    978         }
    979         IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
    980         utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
    981         if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
    982             indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
    983         }
    984         if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
    985             indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
    986         }
    987     }
    988 }
    989 
    990 void Normalizer2DataBuilder::setHangulData() {
    991     HangulIterator hi;
    992     const HangulIterator::Range *range;
    993     // Check that none of the Hangul/Jamo code points have data.
    994     while((range=hi.nextRange())!=NULL) {
    995         for(UChar32 c=range->start; c<range->limit; ++c) {
    996             if(utrie2_get32(norm16Trie, c)!=0) {
    997                 fprintf(stderr,
    998                         "gennorm2 error: "
    999                         "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
   1000                         (long)c);
   1001                 exit(U_INVALID_FORMAT_ERROR);
   1002             }
   1003         }
   1004     }
   1005     // Set data for algorithmic runtime handling.
   1006     IcuToolErrorCode errorCode("gennorm2/setHangulData()");
   1007     hi.reset();
   1008     while((range=hi.nextRange())!=NULL) {
   1009         uint16_t norm16=range->norm16;
   1010         if(norm16==0) {
   1011             norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
   1012             if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
   1013                 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
   1014             }
   1015         } else {
   1016             if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
   1017                 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
   1018             }
   1019         }
   1020         utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
   1021         errorCode.assertSuccess();
   1022     }
   1023 }
   1024 
   1025 U_CDECL_BEGIN
   1026 
   1027 static UBool U_CALLCONV
   1028 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
   1029     uint32_t *pMaxValue=(uint32_t *)context;
   1030     if(value>*pMaxValue) {
   1031         *pMaxValue=value;
   1032     }
   1033     return TRUE;
   1034 }
   1035 
   1036 U_CDECL_END
   1037 
   1038 void Normalizer2DataBuilder::processData() {
   1039     IcuToolErrorCode errorCode("gennorm2/processData()");
   1040     norm16Trie=utrie2_open(0, 0, errorCode);
   1041     errorCode.assertSuccess();
   1042 
   1043     utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
   1044 
   1045     Decomposer decomposer(*this);
   1046     do {
   1047         decomposer.didDecompose=FALSE;
   1048         utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
   1049     } while(decomposer.didDecompose);
   1050 
   1051     BuilderReorderingBuffer buffer;
   1052     int32_t normsLength=utm_countItems(normMem);
   1053     for(int32_t i=1; i<normsLength; ++i) {
   1054         // Set the hasNoCompBoundaryAfter flag for use by the last code branch
   1055         // in Normalizer2Impl::hasCompBoundaryAfter().
   1056         // For details see the comments on hasNoCompBoundaryAfter(buffer).
   1057         const Norm &norm=norms[i];
   1058         if(norm.hasMapping()) {
   1059             if(norm.compositions!=NULL) {
   1060                 norms[i].hasNoCompBoundaryAfter=TRUE;
   1061             } else {
   1062                 buffer.reset();
   1063                 reorder(norms+i, buffer);
   1064                 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
   1065             }
   1066         }
   1067     }
   1068 
   1069     indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
   1070     indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
   1071 
   1072     ExtraDataWriter extraDataWriter(*this);
   1073     utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
   1074 
   1075     extraData=extraDataWriter.maybeYesCompositions;
   1076     extraData.append(extraDataWriter.yesYesCompositions).
   1077               append(extraDataWriter.yesNoMappingsAndCompositions).
   1078               append(extraDataWriter.yesNoMappingsOnly).
   1079               append(extraDataWriter.noNoMappings);
   1080     // Pad to even length for 4-byte alignment of following data.
   1081     if(extraData.length()&1) {
   1082         extraData.append((UChar)0);
   1083     }
   1084 
   1085     indexes[Normalizer2Impl::IX_MIN_YES_NO]=
   1086         extraDataWriter.yesYesCompositions.length();
   1087     indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
   1088         indexes[Normalizer2Impl::IX_MIN_YES_NO]+
   1089         extraDataWriter.yesNoMappingsAndCompositions.length();
   1090     indexes[Normalizer2Impl::IX_MIN_NO_NO]=
   1091         indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
   1092         extraDataWriter.yesNoMappingsOnly.length();
   1093     indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
   1094         indexes[Normalizer2Impl::IX_MIN_NO_NO]+
   1095         extraDataWriter.noNoMappings.length();
   1096     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
   1097         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
   1098         extraDataWriter.maybeYesCompositions.length();
   1099 
   1100     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
   1101     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
   1102         fprintf(stderr,
   1103                 "gennorm2 error: "
   1104                 "data structure overflow, too much mapping composition data\n");
   1105         exit(U_BUFFER_OVERFLOW_ERROR);
   1106     }
   1107 
   1108     utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
   1109 
   1110     setHangulData();
   1111 
   1112     // Look for the "worst" norm16 value of any supplementary code point
   1113     // corresponding to a lead surrogate, and set it as that surrogate's value.
   1114     // Enables quick check inner loops to look at only code units.
   1115     //
   1116     // We could be more sophisticated:
   1117     // We could collect a bit set for whether there are values in the different
   1118     // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
   1119     // and select the best value that only breaks the composition and/or decomposition
   1120     // inner loops if necessary.
   1121     // However, that seems like overkill for an optimization for supplementary characters.
   1122     for(UChar lead=0xd800; lead<0xdc00; ++lead) {
   1123         uint32_t maxValue=utrie2_get32(norm16Trie, lead);
   1124         utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
   1125         if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
   1126             maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
   1127         ) {
   1128             // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
   1129             // Otherwise it might end up at something like JAMO_VT which stays in
   1130             // the inner decomposition quick check loop.
   1131             maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
   1132         }
   1133         utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
   1134     }
   1135 
   1136     // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
   1137     // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
   1138     // which is harmless.
   1139     // As a result, the minimum code points are always BMP code points.
   1140     int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
   1141     if(minCP>=0x10000) {
   1142         indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
   1143     }
   1144     minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
   1145     if(minCP>=0x10000) {
   1146         indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
   1147     }
   1148 }
   1149 
   1150 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
   1151     processData();
   1152 
   1153     IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
   1154     utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
   1155     int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
   1156     if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
   1157         fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
   1158                 errorCode.errorName());
   1159         exit(errorCode.reset());
   1160     }
   1161     errorCode.reset();
   1162     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
   1163     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
   1164     errorCode.assertSuccess();
   1165 
   1166     int32_t offset=(int32_t)sizeof(indexes);
   1167     indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
   1168     offset+=norm16TrieLength;
   1169     indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
   1170     offset+=extraData.length()*2;
   1171     indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
   1172     offset+=sizeof(smallFCD);
   1173     int32_t totalSize=offset;
   1174     for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
   1175         indexes[i]=totalSize;
   1176     }
   1177 
   1178     if(beVerbose) {
   1179         printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
   1180         printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
   1181         printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
   1182         printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
   1183         printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
   1184         printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
   1185         printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
   1186         printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
   1187         printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
   1188         printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
   1189         printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
   1190     }
   1191 
   1192     UVersionInfo nullVersion={ 0, 0, 0, 0 };
   1193     if(0==memcmp(nullVersion, unicodeVersion, 4)) {
   1194         u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
   1195     }
   1196     memcpy(dataInfo.dataVersion, unicodeVersion, 4);
   1197     UNewDataMemory *pData=
   1198         udata_create(NULL, NULL, filename, &dataInfo,
   1199                      haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
   1200     if(errorCode.isFailure()) {
   1201         fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
   1202                 filename, errorCode.errorName());
   1203         exit(errorCode.reset());
   1204     }
   1205     udata_writeBlock(pData, indexes, sizeof(indexes));
   1206     udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
   1207     udata_writeUString(pData, extraData.getBuffer(), extraData.length());
   1208     udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
   1209     int32_t writtenSize=udata_finish(pData, errorCode);
   1210     if(errorCode.isFailure()) {
   1211         fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
   1212         exit(errorCode.reset());
   1213     }
   1214     if(writtenSize!=totalSize) {
   1215         fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
   1216             (long)writtenSize, (long)totalSize);
   1217         exit(U_INTERNAL_PROGRAM_ERROR);
   1218     }
   1219 }
   1220 
   1221 U_NAMESPACE_END
   1222 
   1223 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   1224 
   1225 /*
   1226  * Hey, Emacs, please set the following:
   1227  *
   1228  * Local Variables:
   1229  * indent-tabs-mode: nil
   1230  * End:
   1231  */
   1232