Home | History | Annotate | Download | only in gennorm2
      1 // © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // norms.cpp
      5 // created: 2017jun04 Markus W. Scherer
      6 // (pulled out of n2builder.cpp)
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_NORMALIZATION
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include "unicode/errorcode.h"
     15 #include "unicode/unistr.h"
     16 #include "unicode/utf16.h"
     17 #include "normalizer2impl.h"
     18 #include "norms.h"
     19 #include "toolutil.h"
     20 #include "utrie2.h"
     21 #include "uvectr32.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
     26     if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
     27         if(cc==0) {
     28             fLastStarterIndex=fLength;
     29         }
     30         fArray[fLength++]=(c<<8)|cc;
     31         return;
     32     }
     33     // Let this character bubble back to its canonical order.
     34     int32_t i=fLength-1;
     35     while(i>fLastStarterIndex && ccAt(i)>cc) {
     36         --i;
     37     }
     38     ++i;  // after the last starter or prevCC<=cc
     39     // Move this and the following characters forward one to make space.
     40     for(int32_t j=fLength; i<j; --j) {
     41         fArray[j]=fArray[j-1];
     42     }
     43     fArray[i]=(c<<8)|cc;
     44     ++fLength;
     45     fDidReorder=TRUE;
     46 }
     47 
     48 void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
     49     dest.remove();
     50     for(int32_t i=0; i<fLength; ++i) {
     51         dest.append(charAt(i));
     52     }
     53 }
     54 
     55 UChar32 Norm::combine(UChar32 trail) const {
     56     int32_t length;
     57     const CompositionPair *pairs=getCompositionPairs(length);
     58     for(int32_t i=0; i<length; ++i) {
     59         if(trail==pairs[i].trail) {
     60             return pairs[i].composite;
     61         }
     62         if(trail<pairs[i].trail) {
     63             break;
     64         }
     65     }
     66     return U_SENTINEL;
     67 }
     68 
     69 Norms::Norms(UErrorCode &errorCode) {
     70     normTrie=utrie2_open(0, 0, &errorCode);
     71     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
     72     // Default "inert" Norm struct at index 0. Practically immutable.
     73     norms=allocNorm();
     74     norms->type=Norm::INERT;
     75 }
     76 
     77 Norms::~Norms() {
     78     utrie2_close(normTrie);
     79     int32_t normsLength=utm_countItems(normMem);
     80     for(int32_t i=1; i<normsLength; ++i) {
     81         delete norms[i].mapping;
     82         delete norms[i].rawMapping;
     83         delete norms[i].compositions;
     84     }
     85     utm_close(normMem);
     86 }
     87 
     88 Norm *Norms::allocNorm() {
     89     Norm *p=(Norm *)utm_alloc(normMem);
     90     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
     91     return p;
     92 }
     93 
     94 Norm *Norms::getNorm(UChar32 c) {
     95     uint32_t i=utrie2_get32(normTrie, c);
     96     if(i==0) {
     97         return nullptr;
     98     }
     99     return norms+i;
    100 }
    101 
    102 const Norm *Norms::getNorm(UChar32 c) const {
    103     uint32_t i=utrie2_get32(normTrie, c);
    104     if(i==0) {
    105         return nullptr;
    106     }
    107     return norms+i;
    108 }
    109 
    110 const Norm &Norms::getNormRef(UChar32 c) const {
    111     return norms[utrie2_get32(normTrie, c)];
    112 }
    113 
    114 Norm *Norms::createNorm(UChar32 c) {
    115     uint32_t i=utrie2_get32(normTrie, c);
    116     if(i!=0) {
    117         return norms+i;
    118     } else {
    119         /* allocate Norm */
    120         Norm *p=allocNorm();
    121         IcuToolErrorCode errorCode("gennorm2/createNorm()");
    122         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
    123         return p;
    124     }
    125 }
    126 
    127 void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
    128     int32_t length=mapping.length();
    129     U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
    130     const char16_t *s=mapping.getBuffer();
    131     int32_t i=0;
    132     UChar32 c;
    133     while(i<length) {
    134         U16_NEXT(s, i, length, c);
    135         buffer.append(c, getCC(c));
    136     }
    137     if(buffer.didReorder()) {
    138         buffer.toString(mapping);
    139     }
    140 }
    141 
    142 UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
    143     if((highCC-lowCC)>=2) {
    144         int32_t length;
    145         const CompositionPair *pairs=norm.getCompositionPairs(length);
    146         for(int32_t i=0; i<length; ++i) {
    147             uint8_t trailCC=getCC(pairs[i].trail);
    148             if(lowCC<trailCC && trailCC<highCC) {
    149                 return TRUE;
    150             }
    151         }
    152     }
    153     return FALSE;
    154 }
    155 
    156 U_CDECL_BEGIN
    157 
    158 static UBool U_CALLCONV
    159 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    160     return ((Norms::Enumerator *)context)->rangeHandler(start, end, value);
    161 }
    162 
    163 U_CDECL_END
    164 
    165 void Norms::enumRanges(Enumerator &e) {
    166     utrie2_enum(normTrie, nullptr, enumRangeHandler, &e);
    167 }
    168 
    169 Norms::Enumerator::~Enumerator() {}
    170 
    171 UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    172     if(value!=0) {
    173         rangeHandler(start, end, norms.getNormRefByIndex(value));
    174     }
    175     return TRUE;
    176 }
    177 
    178 void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    179     if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
    180     if(start!=end) {
    181         fprintf(stderr,
    182                 "gennorm2 error: same round-trip mapping for "
    183                 "more than 1 code point U+%04lX..U+%04lX\n",
    184                 (long)start, (long)end);
    185         exit(U_INVALID_FORMAT_ERROR);
    186     }
    187     if(norm.cc!=0) {
    188         fprintf(stderr,
    189                 "gennorm2 error: "
    190                 "U+%04lX has a round-trip mapping and ccc!=0, "
    191                 "not possible in Unicode normalization\n",
    192                 (long)start);
    193         exit(U_INVALID_FORMAT_ERROR);
    194     }
    195     // setRoundTripMapping() ensured that there are exactly two code points.
    196     const UnicodeString &m=*norm.mapping;
    197     UChar32 lead=m.char32At(0);
    198     UChar32 trail=m.char32At(m.length()-1);
    199     if(norms.getCC(lead)!=0) {
    200         fprintf(stderr,
    201                 "gennorm2 error: "
    202                 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
    203                 "not possible in Unicode normalization\n",
    204                 (long)start, (long)lead);
    205         exit(U_INVALID_FORMAT_ERROR);
    206     }
    207     // Flag for trailing character.
    208     norms.createNorm(trail)->combinesBack=TRUE;
    209     // Insert (trail, composite) pair into compositions list for the lead character.
    210     IcuToolErrorCode errorCode("gennorm2/addComposition()");
    211     Norm *leadNorm=norms.createNorm(lead);
    212     UVector32 *compositions=leadNorm->compositions;
    213     int32_t i;
    214     if(compositions==nullptr) {
    215         compositions=leadNorm->compositions=new UVector32(errorCode);
    216         i=0;  // "insert" the first pair at index 0
    217     } else {
    218         // Insertion sort, and check for duplicate trail characters.
    219         int32_t length;
    220         const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
    221         for(i=0; i<length; ++i) {
    222             if(trail==pairs[i].trail) {
    223                 fprintf(stderr,
    224                         "gennorm2 error: same round-trip mapping for "
    225                         "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
    226                         (long)start, (long)lead, (long)trail);
    227                 exit(U_INVALID_FORMAT_ERROR);
    228             }
    229             if(trail<pairs[i].trail) {
    230                 break;
    231             }
    232         }
    233     }
    234     compositions->insertElementAt(trail, 2*i, errorCode);
    235     compositions->insertElementAt(start, 2*i+1, errorCode);
    236 }
    237 
    238 void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    239     if(!norm.hasMapping()) { return; }
    240     const UnicodeString &m=*norm.mapping;
    241     UnicodeString *decomposed=nullptr;
    242     const UChar *s=toUCharPtr(m.getBuffer());
    243     int32_t length=m.length();
    244     int32_t prev, i=0;
    245     UChar32 c;
    246     while(i<length) {
    247         prev=i;
    248         U16_NEXT(s, i, length, c);
    249         if(start<=c && c<=end) {
    250             fprintf(stderr,
    251                     "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
    252                     (long)c);
    253             exit(U_INVALID_FORMAT_ERROR);
    254         }
    255         const Norm &cNorm=norms.getNormRef(c);
    256         if(cNorm.hasMapping()) {
    257             if(norm.mappingType==Norm::ROUND_TRIP) {
    258                 if(prev==0) {
    259                     if(cNorm.mappingType!=Norm::ROUND_TRIP) {
    260                         fprintf(stderr,
    261                                 "gennorm2 error: "
    262                                 "U+%04lX's round-trip mapping's starter "
    263                                 "U+%04lX one-way-decomposes, "
    264                                 "not possible in Unicode normalization\n",
    265                                 (long)start, (long)c);
    266                         exit(U_INVALID_FORMAT_ERROR);
    267                     }
    268                     uint8_t myTrailCC=norms.getCC(m.char32At(i));
    269                     UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
    270                     uint8_t cTrailCC=norms.getCC(cTrailChar);
    271                     if(cTrailCC>myTrailCC) {
    272                         fprintf(stderr,
    273                                 "gennorm2 error: "
    274                                 "U+%04lX's round-trip mapping's starter "
    275                                 "U+%04lX decomposes and the "
    276                                 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
    277                                 "not possible in Unicode normalization\n",
    278                                 (long)start, (long)c,
    279                                 (short)cTrailCC, (short)myTrailCC);
    280                         exit(U_INVALID_FORMAT_ERROR);
    281                     }
    282                 } else {
    283                     fprintf(stderr,
    284                             "gennorm2 error: "
    285                             "U+%04lX's round-trip mapping's non-starter "
    286                             "U+%04lX decomposes, "
    287                             "not possible in Unicode normalization\n",
    288                             (long)start, (long)c);
    289                     exit(U_INVALID_FORMAT_ERROR);
    290                 }
    291             }
    292             if(decomposed==nullptr) {
    293                 decomposed=new UnicodeString(m, 0, prev);
    294             }
    295             decomposed->append(*cNorm.mapping);
    296         } else if(Hangul::isHangul(c)) {
    297             UChar buffer[3];
    298             int32_t hangulLength=Hangul::decompose(c, buffer);
    299             if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
    300                 fprintf(stderr,
    301                         "gennorm2 error: "
    302                         "U+%04lX's round-trip mapping's non-starter "
    303                         "U+%04lX decomposes, "
    304                         "not possible in Unicode normalization\n",
    305                         (long)start, (long)c);
    306                 exit(U_INVALID_FORMAT_ERROR);
    307             }
    308             if(decomposed==nullptr) {
    309                 decomposed=new UnicodeString(m, 0, prev);
    310             }
    311             decomposed->append(buffer, hangulLength);
    312         } else if(decomposed!=nullptr) {
    313             decomposed->append(m, prev, i-prev);
    314         }
    315     }
    316     if(decomposed!=nullptr) {
    317         if(norm.rawMapping==nullptr) {
    318             // Remember the original mapping when decomposing recursively.
    319             norm.rawMapping=norm.mapping;
    320         } else {
    321             delete norm.mapping;
    322         }
    323         norm.mapping=decomposed;
    324         // Not  norm.setMappingCP();  because the original mapping
    325         // is most likely to be encodable as a delta.
    326         didDecompose|=TRUE;
    327     }
    328 }
    329 
    330 U_NAMESPACE_END
    331 
    332 #endif // #if !UCONFIG_NO_NORMALIZATION
    333