Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  ucharstriebuilder.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010nov14
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/ucharstrie.h"
     19 #include "unicode/ucharstriebuilder.h"
     20 #include "unicode/unistr.h"
     21 #include "unicode/ustring.h"
     22 #include "cmemory.h"
     23 #include "uarrsort.h"
     24 #include "uassert.h"
     25 #include "uhash.h"
     26 #include "ustr_imp.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 /*
     31  * Note: This builder implementation stores (string, value) pairs with full copies
     32  * of the 16-bit-unit sequences, until the UCharsTrie is built.
     33  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     34  */
     35 
     36 class UCharsTrieElement : public UMemory {
     37 public:
     38     // Use compiler's default constructor, initializes nothing.
     39 
     40     void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
     41 
     42     UnicodeString getString(const UnicodeString &strings) const {
     43         int32_t length=strings[stringOffset];
     44         return strings.tempSubString(stringOffset+1, length);
     45     }
     46     int32_t getStringLength(const UnicodeString &strings) const {
     47         return strings[stringOffset];
     48     }
     49 
     50     UChar charAt(int32_t index, const UnicodeString &strings) const {
     51         return strings[stringOffset+1+index];
     52     }
     53 
     54     int32_t getValue() const { return value; }
     55 
     56     int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
     57 
     58 private:
     59     // The first strings unit contains the string length.
     60     // (Compared with a stringLength field here, this saves 2 bytes per string.)
     61     int32_t stringOffset;
     62     int32_t value;
     63 };
     64 
     65 void
     66 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
     67                          UnicodeString &strings, UErrorCode &errorCode) {
     68     if(U_FAILURE(errorCode)) {
     69         return;
     70     }
     71     int32_t length=s.length();
     72     if(length>0xffff) {
     73         // Too long: We store the length in 1 unit.
     74         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     75         return;
     76     }
     77     stringOffset=strings.length();
     78     strings.append((UChar)length);
     79     value=val;
     80     strings.append(s);
     81 }
     82 
     83 int32_t
     84 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
     85     return getString(strings).compare(other.getString(strings));
     86 }
     87 
     88 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
     89         : elements(NULL), elementsCapacity(0), elementsLength(0),
     90           uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
     91 
     92 UCharsTrieBuilder::~UCharsTrieBuilder() {
     93     delete[] elements;
     94     uprv_free(uchars);
     95 }
     96 
     97 UCharsTrieBuilder &
     98 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
     99     if(U_FAILURE(errorCode)) {
    100         return *this;
    101     }
    102     if(ucharsLength>0) {
    103         // Cannot add elements after building.
    104         errorCode=U_NO_WRITE_PERMISSION;
    105         return *this;
    106     }
    107     if(elementsLength==elementsCapacity) {
    108         int32_t newCapacity;
    109         if(elementsCapacity==0) {
    110             newCapacity=1024;
    111         } else {
    112             newCapacity=4*elementsCapacity;
    113         }
    114         UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
    115         if(newElements==NULL) {
    116             errorCode=U_MEMORY_ALLOCATION_ERROR;
    117             return *this;
    118         }
    119         if(elementsLength>0) {
    120             uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
    121         }
    122         delete[] elements;
    123         elements=newElements;
    124         elementsCapacity=newCapacity;
    125     }
    126     elements[elementsLength++].setTo(s, value, strings, errorCode);
    127     if(U_SUCCESS(errorCode) && strings.isBogus()) {
    128         errorCode=U_MEMORY_ALLOCATION_ERROR;
    129     }
    130     return *this;
    131 }
    132 
    133 U_CDECL_BEGIN
    134 
    135 static int32_t U_CALLCONV
    136 compareElementStrings(const void *context, const void *left, const void *right) {
    137     const UnicodeString *strings=static_cast<const UnicodeString *>(context);
    138     const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
    139     const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
    140     return leftElement->compareStringTo(*rightElement, *strings);
    141 }
    142 
    143 U_CDECL_END
    144 
    145 UCharsTrie *
    146 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    147     buildUChars(buildOption, errorCode);
    148     UCharsTrie *newTrie=NULL;
    149     if(U_SUCCESS(errorCode)) {
    150         newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
    151         if(newTrie==NULL) {
    152             errorCode=U_MEMORY_ALLOCATION_ERROR;
    153         } else {
    154             uchars=NULL;  // The new trie now owns the array.
    155             ucharsCapacity=0;
    156         }
    157     }
    158     return newTrie;
    159 }
    160 
    161 UnicodeString &
    162 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
    163                                       UErrorCode &errorCode) {
    164     buildUChars(buildOption, errorCode);
    165     if(U_SUCCESS(errorCode)) {
    166         result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    167     }
    168     return result;
    169 }
    170 
    171 void
    172 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    173     if(U_FAILURE(errorCode)) {
    174         return;
    175     }
    176     if(uchars!=NULL && ucharsLength>0) {
    177         // Already built.
    178         return;
    179     }
    180     if(ucharsLength==0) {
    181         if(elementsLength==0) {
    182             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    183             return;
    184         }
    185         if(strings.isBogus()) {
    186             errorCode=U_MEMORY_ALLOCATION_ERROR;
    187             return;
    188         }
    189         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
    190                       compareElementStrings, &strings,
    191                       FALSE,  // need not be a stable sort
    192                       &errorCode);
    193         if(U_FAILURE(errorCode)) {
    194             return;
    195         }
    196         // Duplicate strings are not allowed.
    197         UnicodeString prev=elements[0].getString(strings);
    198         for(int32_t i=1; i<elementsLength; ++i) {
    199             UnicodeString current=elements[i].getString(strings);
    200             if(prev==current) {
    201                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    202                 return;
    203             }
    204             prev.fastCopyFrom(current);
    205         }
    206     }
    207     // Create and UChar-serialize the trie for the elements.
    208     ucharsLength=0;
    209     int32_t capacity=strings.length();
    210     if(capacity<1024) {
    211         capacity=1024;
    212     }
    213     if(ucharsCapacity<capacity) {
    214         uprv_free(uchars);
    215         uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
    216         if(uchars==NULL) {
    217             errorCode=U_MEMORY_ALLOCATION_ERROR;
    218             ucharsCapacity=0;
    219             return;
    220         }
    221         ucharsCapacity=capacity;
    222     }
    223     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    224     if(uchars==NULL) {
    225         errorCode=U_MEMORY_ALLOCATION_ERROR;
    226     }
    227 }
    228 
    229 int32_t
    230 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
    231     return elements[i].getStringLength(strings);
    232 }
    233 
    234 UChar
    235 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
    236     return elements[i].charAt(unitIndex, strings);
    237 }
    238 
    239 int32_t
    240 UCharsTrieBuilder::getElementValue(int32_t i) const {
    241     return elements[i].getValue();
    242 }
    243 
    244 int32_t
    245 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
    246     const UCharsTrieElement &firstElement=elements[first];
    247     const UCharsTrieElement &lastElement=elements[last];
    248     int32_t minStringLength=firstElement.getStringLength(strings);
    249     while(++unitIndex<minStringLength &&
    250             firstElement.charAt(unitIndex, strings)==
    251             lastElement.charAt(unitIndex, strings)) {}
    252     return unitIndex;
    253 }
    254 
    255 int32_t
    256 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
    257     int32_t length=0;  // Number of different units at unitIndex.
    258     int32_t i=start;
    259     do {
    260         UChar unit=elements[i++].charAt(unitIndex, strings);
    261         while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
    262             ++i;
    263         }
    264         ++length;
    265     } while(i<limit);
    266     return length;
    267 }
    268 
    269 int32_t
    270 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
    271     do {
    272         UChar unit=elements[i++].charAt(unitIndex, strings);
    273         while(unit==elements[i].charAt(unitIndex, strings)) {
    274             ++i;
    275         }
    276     } while(--count>0);
    277     return i;
    278 }
    279 
    280 int32_t
    281 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
    282     while(unit==elements[i].charAt(unitIndex, strings)) {
    283         ++i;
    284     }
    285     return i;
    286 }
    287 
    288 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
    289         : LinearMatchNode(len, nextNode), s(units) {
    290     hash=hash*37u+ustr_hashUCharsN(units, len);
    291 }
    292 
    293 UBool
    294 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
    295     if(this==&other) {
    296         return TRUE;
    297     }
    298     if(!LinearMatchNode::operator==(other)) {
    299         return FALSE;
    300     }
    301     const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
    302     return 0==u_memcmp(s, o.s, length);
    303 }
    304 
    305 void
    306 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
    307     UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
    308     next->write(builder);
    309     b.write(s, length);
    310     offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
    311 }
    312 
    313 StringTrieBuilder::Node *
    314 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    315                                          Node *nextNode) const {
    316     return new UCTLinearMatchNode(
    317             elements[i].getString(strings).getBuffer()+unitIndex,
    318             length,
    319             nextNode);
    320 }
    321 
    322 UBool
    323 UCharsTrieBuilder::ensureCapacity(int32_t length) {
    324     if(uchars==NULL) {
    325         return FALSE;  // previous memory allocation had failed
    326     }
    327     if(length>ucharsCapacity) {
    328         int32_t newCapacity=ucharsCapacity;
    329         do {
    330             newCapacity*=2;
    331         } while(newCapacity<=length);
    332         UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
    333         if(newUChars==NULL) {
    334             // unable to allocate memory
    335             uprv_free(uchars);
    336             uchars=NULL;
    337             ucharsCapacity=0;
    338             return FALSE;
    339         }
    340         u_memcpy(newUChars+(newCapacity-ucharsLength),
    341                  uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    342         uprv_free(uchars);
    343         uchars=newUChars;
    344         ucharsCapacity=newCapacity;
    345     }
    346     return TRUE;
    347 }
    348 
    349 int32_t
    350 UCharsTrieBuilder::write(int32_t unit) {
    351     int32_t newLength=ucharsLength+1;
    352     if(ensureCapacity(newLength)) {
    353         ucharsLength=newLength;
    354         uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
    355     }
    356     return ucharsLength;
    357 }
    358 
    359 int32_t
    360 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
    361     int32_t newLength=ucharsLength+length;
    362     if(ensureCapacity(newLength)) {
    363         ucharsLength=newLength;
    364         u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
    365     }
    366     return ucharsLength;
    367 }
    368 
    369 int32_t
    370 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
    371     return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
    372 }
    373 
    374 int32_t
    375 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    376     if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
    377         return write(i|(isFinal<<15));
    378     }
    379     UChar intUnits[3];
    380     int32_t length;
    381     if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
    382         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
    383         intUnits[1]=(UChar)((uint32_t)i>>16);
    384         intUnits[2]=(UChar)i;
    385         length=3;
    386     // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
    387     //     intUnits[0]=(UChar)(i);
    388     //     length=1;
    389     } else {
    390         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
    391         intUnits[1]=(UChar)i;
    392         length=2;
    393     }
    394     intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
    395     return write(intUnits, length);
    396 }
    397 
    398 int32_t
    399 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    400     if(!hasValue) {
    401         return write(node);
    402     }
    403     UChar intUnits[3];
    404     int32_t length;
    405     if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
    406         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
    407         intUnits[1]=(UChar)((uint32_t)value>>16);
    408         intUnits[2]=(UChar)value;
    409         length=3;
    410     } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
    411         intUnits[0]=(UChar)((value+1)<<6);
    412         length=1;
    413     } else {
    414         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
    415         intUnits[1]=(UChar)value;
    416         length=2;
    417     }
    418     intUnits[0]|=(UChar)node;
    419     return write(intUnits, length);
    420 }
    421 
    422 int32_t
    423 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    424     int32_t i=ucharsLength-jumpTarget;
    425     U_ASSERT(i>=0);
    426     if(i<=UCharsTrie::kMaxOneUnitDelta) {
    427         return write(i);
    428     }
    429     UChar intUnits[3];
    430     int32_t length;
    431     if(i<=UCharsTrie::kMaxTwoUnitDelta) {
    432         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
    433         length=1;
    434     } else {
    435         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
    436         intUnits[1]=(UChar)(i>>16);
    437         length=2;
    438     }
    439     intUnits[length++]=(UChar)i;
    440     return write(intUnits, length);
    441 }
    442 
    443 U_NAMESPACE_END
    444