Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucharstriebuilder.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010nov14
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/ucharstrie.h"
     17 #include "unicode/ucharstriebuilder.h"
     18 #include "unicode/unistr.h"
     19 #include "unicode/ustring.h"
     20 #include "cmemory.h"
     21 #include "uarrsort.h"
     22 #include "uassert.h"
     23 #include "uhash.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 /*
     28  * Note: This builder implementation stores (string, value) pairs with full copies
     29  * of the 16-bit-unit sequences, until the UCharsTrie is built.
     30  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     31  */
     32 
     33 class UCharsTrieElement : public UMemory {
     34 public:
     35     // Use compiler's default constructor, initializes nothing.
     36 
     37     void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
     38 
     39     UnicodeString getString(const UnicodeString &strings) const {
     40         int32_t length=strings[stringOffset];
     41         return strings.tempSubString(stringOffset+1, length);
     42     }
     43     int32_t getStringLength(const UnicodeString &strings) const {
     44         return strings[stringOffset];
     45     }
     46 
     47     UChar charAt(int32_t index, const UnicodeString &strings) const {
     48         return strings[stringOffset+1+index];
     49     }
     50 
     51     int32_t getValue() const { return value; }
     52 
     53     int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
     54 
     55 private:
     56     // The first strings unit contains the string length.
     57     // (Compared with a stringLength field here, this saves 2 bytes per string.)
     58     int32_t stringOffset;
     59     int32_t value;
     60 };
     61 
     62 void
     63 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
     64                          UnicodeString &strings, UErrorCode &errorCode) {
     65     if(U_FAILURE(errorCode)) {
     66         return;
     67     }
     68     int32_t length=s.length();
     69     if(length>0xffff) {
     70         // Too long: We store the length in 1 unit.
     71         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     72         return;
     73     }
     74     stringOffset=strings.length();
     75     strings.append((UChar)length);
     76     value=val;
     77     strings.append(s);
     78 }
     79 
     80 int32_t
     81 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
     82     return getString(strings).compare(other.getString(strings));
     83 }
     84 
     85 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
     86         : elements(NULL), elementsCapacity(0), elementsLength(0),
     87           uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
     88 
     89 UCharsTrieBuilder::~UCharsTrieBuilder() {
     90     delete[] elements;
     91     uprv_free(uchars);
     92 }
     93 
     94 UCharsTrieBuilder &
     95 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
     96     if(U_FAILURE(errorCode)) {
     97         return *this;
     98     }
     99     if(ucharsLength>0) {
    100         // Cannot add elements after building.
    101         errorCode=U_NO_WRITE_PERMISSION;
    102         return *this;
    103     }
    104     if(elementsLength==elementsCapacity) {
    105         int32_t newCapacity;
    106         if(elementsCapacity==0) {
    107             newCapacity=1024;
    108         } else {
    109             newCapacity=4*elementsCapacity;
    110         }
    111         UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
    112         if(newElements==NULL) {
    113             errorCode=U_MEMORY_ALLOCATION_ERROR;
    114         }
    115         if(elementsLength>0) {
    116             uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
    117         }
    118         delete[] elements;
    119         elements=newElements;
    120         elementsCapacity=newCapacity;
    121     }
    122     elements[elementsLength++].setTo(s, value, strings, errorCode);
    123     if(U_SUCCESS(errorCode) && strings.isBogus()) {
    124         errorCode=U_MEMORY_ALLOCATION_ERROR;
    125     }
    126     return *this;
    127 }
    128 
    129 U_CDECL_BEGIN
    130 
    131 static int32_t U_CALLCONV
    132 compareElementStrings(const void *context, const void *left, const void *right) {
    133     const UnicodeString *strings=reinterpret_cast<const UnicodeString *>(context);
    134     const UCharsTrieElement *leftElement=reinterpret_cast<const UCharsTrieElement *>(left);
    135     const UCharsTrieElement *rightElement=reinterpret_cast<const UCharsTrieElement *>(right);
    136     return leftElement->compareStringTo(*rightElement, *strings);
    137 }
    138 
    139 U_CDECL_END
    140 
    141 UCharsTrie *
    142 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    143     buildUChars(buildOption, errorCode);
    144     UCharsTrie *newTrie=NULL;
    145     if(U_SUCCESS(errorCode)) {
    146         newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
    147         if(newTrie==NULL) {
    148             errorCode=U_MEMORY_ALLOCATION_ERROR;
    149         } else {
    150             uchars=NULL;  // The new trie now owns the array.
    151             ucharsCapacity=0;
    152         }
    153     }
    154     return newTrie;
    155 }
    156 
    157 UnicodeString &
    158 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
    159                                       UErrorCode &errorCode) {
    160     buildUChars(buildOption, errorCode);
    161     if(U_SUCCESS(errorCode)) {
    162         result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    163     }
    164     return result;
    165 }
    166 
    167 void
    168 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    169     if(U_FAILURE(errorCode)) {
    170         return;
    171     }
    172     if(uchars!=NULL && ucharsLength>0) {
    173         // Already built.
    174         return;
    175     }
    176     if(ucharsLength==0) {
    177         if(elementsLength==0) {
    178             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    179             return;
    180         }
    181         if(strings.isBogus()) {
    182             errorCode=U_MEMORY_ALLOCATION_ERROR;
    183             return;
    184         }
    185         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
    186                       compareElementStrings, &strings,
    187                       FALSE,  // need not be a stable sort
    188                       &errorCode);
    189         if(U_FAILURE(errorCode)) {
    190             return;
    191         }
    192         // Duplicate strings are not allowed.
    193         UnicodeString prev=elements[0].getString(strings);
    194         for(int32_t i=1; i<elementsLength; ++i) {
    195             UnicodeString current=elements[i].getString(strings);
    196             if(prev==current) {
    197                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    198                 return;
    199             }
    200             prev.fastCopyFrom(current);
    201         }
    202     }
    203     // Create and UChar-serialize the trie for the elements.
    204     ucharsLength=0;
    205     int32_t capacity=strings.length();
    206     if(capacity<1024) {
    207         capacity=1024;
    208     }
    209     if(ucharsCapacity<capacity) {
    210         uprv_free(uchars);
    211         uchars=reinterpret_cast<UChar *>(uprv_malloc(capacity*2));
    212         if(uchars==NULL) {
    213             errorCode=U_MEMORY_ALLOCATION_ERROR;
    214             ucharsCapacity=0;
    215             return;
    216         }
    217         ucharsCapacity=capacity;
    218     }
    219     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    220     if(uchars==NULL) {
    221         errorCode=U_MEMORY_ALLOCATION_ERROR;
    222     }
    223 }
    224 
    225 int32_t
    226 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
    227     return elements[i].getStringLength(strings);
    228 }
    229 
    230 UChar
    231 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
    232     return elements[i].charAt(unitIndex, strings);
    233 }
    234 
    235 int32_t
    236 UCharsTrieBuilder::getElementValue(int32_t i) const {
    237     return elements[i].getValue();
    238 }
    239 
    240 int32_t
    241 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
    242     const UCharsTrieElement &firstElement=elements[first];
    243     const UCharsTrieElement &lastElement=elements[last];
    244     int32_t minStringLength=firstElement.getStringLength(strings);
    245     while(++unitIndex<minStringLength &&
    246             firstElement.charAt(unitIndex, strings)==
    247             lastElement.charAt(unitIndex, strings)) {}
    248     return unitIndex;
    249 }
    250 
    251 int32_t
    252 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
    253     int32_t length=0;  // Number of different units at unitIndex.
    254     int32_t i=start;
    255     do {
    256         UChar unit=elements[i++].charAt(unitIndex, strings);
    257         while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
    258             ++i;
    259         }
    260         ++length;
    261     } while(i<limit);
    262     return length;
    263 }
    264 
    265 int32_t
    266 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
    267     do {
    268         UChar unit=elements[i++].charAt(unitIndex, strings);
    269         while(unit==elements[i].charAt(unitIndex, strings)) {
    270             ++i;
    271         }
    272     } while(--count>0);
    273     return i;
    274 }
    275 
    276 int32_t
    277 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
    278     while(unit==elements[i].charAt(unitIndex, strings)) {
    279         ++i;
    280     }
    281     return i;
    282 }
    283 
    284 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
    285         : LinearMatchNode(len, nextNode), s(units) {
    286     hash=hash*37+uhash_hashUCharsN(units, len);
    287 }
    288 
    289 UBool
    290 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
    291     if(this==&other) {
    292         return TRUE;
    293     }
    294     if(!LinearMatchNode::operator==(other)) {
    295         return FALSE;
    296     }
    297     const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
    298     return 0==u_memcmp(s, o.s, length);
    299 }
    300 
    301 void
    302 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
    303     UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
    304     next->write(builder);
    305     b.write(s, length);
    306     offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
    307 }
    308 
    309 StringTrieBuilder::Node *
    310 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    311                                          Node *nextNode) const {
    312     return new UCTLinearMatchNode(
    313             elements[i].getString(strings).getBuffer()+unitIndex,
    314             length,
    315             nextNode);
    316 }
    317 
    318 UBool
    319 UCharsTrieBuilder::ensureCapacity(int32_t length) {
    320     if(uchars==NULL) {
    321         return FALSE;  // previous memory allocation had failed
    322     }
    323     if(length>ucharsCapacity) {
    324         int32_t newCapacity=ucharsCapacity;
    325         do {
    326             newCapacity*=2;
    327         } while(newCapacity<=length);
    328         UChar *newUChars=reinterpret_cast<UChar *>(uprv_malloc(newCapacity*2));
    329         if(newUChars==NULL) {
    330             // unable to allocate memory
    331             uprv_free(uchars);
    332             uchars=NULL;
    333             ucharsCapacity=0;
    334             return FALSE;
    335         }
    336         u_memcpy(newUChars+(newCapacity-ucharsLength),
    337                  uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    338         uprv_free(uchars);
    339         uchars=newUChars;
    340         ucharsCapacity=newCapacity;
    341     }
    342     return TRUE;
    343 }
    344 
    345 int32_t
    346 UCharsTrieBuilder::write(int32_t unit) {
    347     int32_t newLength=ucharsLength+1;
    348     if(ensureCapacity(newLength)) {
    349         ucharsLength=newLength;
    350         uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
    351     }
    352     return ucharsLength;
    353 }
    354 
    355 int32_t
    356 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
    357     int32_t newLength=ucharsLength+length;
    358     if(ensureCapacity(newLength)) {
    359         ucharsLength=newLength;
    360         u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
    361     }
    362     return ucharsLength;
    363 }
    364 
    365 int32_t
    366 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
    367     return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
    368 }
    369 
    370 int32_t
    371 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    372     if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
    373         return write(i|(isFinal<<15));
    374     }
    375     UChar intUnits[3];
    376     int32_t length;
    377     if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
    378         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
    379         intUnits[1]=(UChar)(i>>16);
    380         intUnits[2]=(UChar)i;
    381         length=3;
    382     // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
    383     //     intUnits[0]=(UChar)(i);
    384     //     length=1;
    385     } else {
    386         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
    387         intUnits[1]=(UChar)i;
    388         length=2;
    389     }
    390     intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
    391     return write(intUnits, length);
    392 }
    393 
    394 int32_t
    395 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    396     if(!hasValue) {
    397         return write(node);
    398     }
    399     UChar intUnits[3];
    400     int32_t length;
    401     if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
    402         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
    403         intUnits[1]=(UChar)(value>>16);
    404         intUnits[2]=(UChar)value;
    405         length=3;
    406     } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
    407         intUnits[0]=(UChar)((value+1)<<6);
    408         length=1;
    409     } else {
    410         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
    411         intUnits[1]=(UChar)value;
    412         length=2;
    413     }
    414     intUnits[0]|=(UChar)node;
    415     return write(intUnits, length);
    416 }
    417 
    418 int32_t
    419 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    420     int32_t i=ucharsLength-jumpTarget;
    421     U_ASSERT(i>=0);
    422     if(i<=UCharsTrie::kMaxOneUnitDelta) {
    423         return write(i);
    424     }
    425     UChar intUnits[3];
    426     int32_t length;
    427     if(i<=UCharsTrie::kMaxTwoUnitDelta) {
    428         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
    429         length=1;
    430     } else {
    431         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
    432         intUnits[1]=(UChar)(i>>16);
    433         length=2;
    434     }
    435     intUnits[length++]=(UChar)i;
    436     return write(intUnits, length);
    437 }
    438 
    439 U_NAMESPACE_END
    440