Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucharstriebuilder.h
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010nov14
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/ucharstrie.h"
     17 #include "unicode/ucharstriebuilder.h"
     18 #include "unicode/unistr.h"
     19 #include "unicode/ustring.h"
     20 #include "cmemory.h"
     21 #include "uarrsort.h"
     22 #include "uassert.h"
     23 #include "uhash.h"
     24 #include "ustr_imp.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 /*
     29  * Note: This builder implementation stores (string, value) pairs with full copies
     30  * of the 16-bit-unit sequences, until the UCharsTrie is built.
     31  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     32  */
     33 
     34 class UCharsTrieElement : public UMemory {
     35 public:
     36     // Use compiler's default constructor, initializes nothing.
     37 
     38     void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
     39 
     40     UnicodeString getString(const UnicodeString &strings) const {
     41         int32_t length=strings[stringOffset];
     42         return strings.tempSubString(stringOffset+1, length);
     43     }
     44     int32_t getStringLength(const UnicodeString &strings) const {
     45         return strings[stringOffset];
     46     }
     47 
     48     UChar charAt(int32_t index, const UnicodeString &strings) const {
     49         return strings[stringOffset+1+index];
     50     }
     51 
     52     int32_t getValue() const { return value; }
     53 
     54     int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
     55 
     56 private:
     57     // The first strings unit contains the string length.
     58     // (Compared with a stringLength field here, this saves 2 bytes per string.)
     59     int32_t stringOffset;
     60     int32_t value;
     61 };
     62 
     63 void
     64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
     65                          UnicodeString &strings, UErrorCode &errorCode) {
     66     if(U_FAILURE(errorCode)) {
     67         return;
     68     }
     69     int32_t length=s.length();
     70     if(length>0xffff) {
     71         // Too long: We store the length in 1 unit.
     72         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     73         return;
     74     }
     75     stringOffset=strings.length();
     76     strings.append((UChar)length);
     77     value=val;
     78     strings.append(s);
     79 }
     80 
     81 int32_t
     82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
     83     return getString(strings).compare(other.getString(strings));
     84 }
     85 
     86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
     87         : elements(NULL), elementsCapacity(0), elementsLength(0),
     88           uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
     89 
     90 UCharsTrieBuilder::~UCharsTrieBuilder() {
     91     delete[] elements;
     92     uprv_free(uchars);
     93 }
     94 
     95 UCharsTrieBuilder &
     96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
     97     if(U_FAILURE(errorCode)) {
     98         return *this;
     99     }
    100     if(ucharsLength>0) {
    101         // Cannot add elements after building.
    102         errorCode=U_NO_WRITE_PERMISSION;
    103         return *this;
    104     }
    105     if(elementsLength==elementsCapacity) {
    106         int32_t newCapacity;
    107         if(elementsCapacity==0) {
    108             newCapacity=1024;
    109         } else {
    110             newCapacity=4*elementsCapacity;
    111         }
    112         UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
    113         if(newElements==NULL) {
    114             errorCode=U_MEMORY_ALLOCATION_ERROR;
    115             return *this;
    116         }
    117         if(elementsLength>0) {
    118             uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
    119         }
    120         delete[] elements;
    121         elements=newElements;
    122         elementsCapacity=newCapacity;
    123     }
    124     elements[elementsLength++].setTo(s, value, strings, errorCode);
    125     if(U_SUCCESS(errorCode) && strings.isBogus()) {
    126         errorCode=U_MEMORY_ALLOCATION_ERROR;
    127     }
    128     return *this;
    129 }
    130 
    131 U_CDECL_BEGIN
    132 
    133 static int32_t U_CALLCONV
    134 compareElementStrings(const void *context, const void *left, const void *right) {
    135     const UnicodeString *strings=static_cast<const UnicodeString *>(context);
    136     const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
    137     const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
    138     return leftElement->compareStringTo(*rightElement, *strings);
    139 }
    140 
    141 U_CDECL_END
    142 
    143 UCharsTrie *
    144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    145     buildUChars(buildOption, errorCode);
    146     UCharsTrie *newTrie=NULL;
    147     if(U_SUCCESS(errorCode)) {
    148         newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
    149         if(newTrie==NULL) {
    150             errorCode=U_MEMORY_ALLOCATION_ERROR;
    151         } else {
    152             uchars=NULL;  // The new trie now owns the array.
    153             ucharsCapacity=0;
    154         }
    155     }
    156     return newTrie;
    157 }
    158 
    159 UnicodeString &
    160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
    161                                       UErrorCode &errorCode) {
    162     buildUChars(buildOption, errorCode);
    163     if(U_SUCCESS(errorCode)) {
    164         result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    165     }
    166     return result;
    167 }
    168 
    169 void
    170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    171     if(U_FAILURE(errorCode)) {
    172         return;
    173     }
    174     if(uchars!=NULL && ucharsLength>0) {
    175         // Already built.
    176         return;
    177     }
    178     if(ucharsLength==0) {
    179         if(elementsLength==0) {
    180             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    181             return;
    182         }
    183         if(strings.isBogus()) {
    184             errorCode=U_MEMORY_ALLOCATION_ERROR;
    185             return;
    186         }
    187         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
    188                       compareElementStrings, &strings,
    189                       FALSE,  // need not be a stable sort
    190                       &errorCode);
    191         if(U_FAILURE(errorCode)) {
    192             return;
    193         }
    194         // Duplicate strings are not allowed.
    195         UnicodeString prev=elements[0].getString(strings);
    196         for(int32_t i=1; i<elementsLength; ++i) {
    197             UnicodeString current=elements[i].getString(strings);
    198             if(prev==current) {
    199                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    200                 return;
    201             }
    202             prev.fastCopyFrom(current);
    203         }
    204     }
    205     // Create and UChar-serialize the trie for the elements.
    206     ucharsLength=0;
    207     int32_t capacity=strings.length();
    208     if(capacity<1024) {
    209         capacity=1024;
    210     }
    211     if(ucharsCapacity<capacity) {
    212         uprv_free(uchars);
    213         uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
    214         if(uchars==NULL) {
    215             errorCode=U_MEMORY_ALLOCATION_ERROR;
    216             ucharsCapacity=0;
    217             return;
    218         }
    219         ucharsCapacity=capacity;
    220     }
    221     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    222     if(uchars==NULL) {
    223         errorCode=U_MEMORY_ALLOCATION_ERROR;
    224     }
    225 }
    226 
    227 int32_t
    228 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
    229     return elements[i].getStringLength(strings);
    230 }
    231 
    232 UChar
    233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
    234     return elements[i].charAt(unitIndex, strings);
    235 }
    236 
    237 int32_t
    238 UCharsTrieBuilder::getElementValue(int32_t i) const {
    239     return elements[i].getValue();
    240 }
    241 
    242 int32_t
    243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
    244     const UCharsTrieElement &firstElement=elements[first];
    245     const UCharsTrieElement &lastElement=elements[last];
    246     int32_t minStringLength=firstElement.getStringLength(strings);
    247     while(++unitIndex<minStringLength &&
    248             firstElement.charAt(unitIndex, strings)==
    249             lastElement.charAt(unitIndex, strings)) {}
    250     return unitIndex;
    251 }
    252 
    253 int32_t
    254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
    255     int32_t length=0;  // Number of different units at unitIndex.
    256     int32_t i=start;
    257     do {
    258         UChar unit=elements[i++].charAt(unitIndex, strings);
    259         while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
    260             ++i;
    261         }
    262         ++length;
    263     } while(i<limit);
    264     return length;
    265 }
    266 
    267 int32_t
    268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
    269     do {
    270         UChar unit=elements[i++].charAt(unitIndex, strings);
    271         while(unit==elements[i].charAt(unitIndex, strings)) {
    272             ++i;
    273         }
    274     } while(--count>0);
    275     return i;
    276 }
    277 
    278 int32_t
    279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
    280     while(unit==elements[i].charAt(unitIndex, strings)) {
    281         ++i;
    282     }
    283     return i;
    284 }
    285 
    286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
    287         : LinearMatchNode(len, nextNode), s(units) {
    288     hash=hash*37+ustr_hashUCharsN(units, len);
    289 }
    290 
    291 UBool
    292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
    293     if(this==&other) {
    294         return TRUE;
    295     }
    296     if(!LinearMatchNode::operator==(other)) {
    297         return FALSE;
    298     }
    299     const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
    300     return 0==u_memcmp(s, o.s, length);
    301 }
    302 
    303 void
    304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
    305     UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
    306     next->write(builder);
    307     b.write(s, length);
    308     offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
    309 }
    310 
    311 StringTrieBuilder::Node *
    312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    313                                          Node *nextNode) const {
    314     return new UCTLinearMatchNode(
    315             elements[i].getString(strings).getBuffer()+unitIndex,
    316             length,
    317             nextNode);
    318 }
    319 
    320 UBool
    321 UCharsTrieBuilder::ensureCapacity(int32_t length) {
    322     if(uchars==NULL) {
    323         return FALSE;  // previous memory allocation had failed
    324     }
    325     if(length>ucharsCapacity) {
    326         int32_t newCapacity=ucharsCapacity;
    327         do {
    328             newCapacity*=2;
    329         } while(newCapacity<=length);
    330         UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
    331         if(newUChars==NULL) {
    332             // unable to allocate memory
    333             uprv_free(uchars);
    334             uchars=NULL;
    335             ucharsCapacity=0;
    336             return FALSE;
    337         }
    338         u_memcpy(newUChars+(newCapacity-ucharsLength),
    339                  uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    340         uprv_free(uchars);
    341         uchars=newUChars;
    342         ucharsCapacity=newCapacity;
    343     }
    344     return TRUE;
    345 }
    346 
    347 int32_t
    348 UCharsTrieBuilder::write(int32_t unit) {
    349     int32_t newLength=ucharsLength+1;
    350     if(ensureCapacity(newLength)) {
    351         ucharsLength=newLength;
    352         uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
    353     }
    354     return ucharsLength;
    355 }
    356 
    357 int32_t
    358 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
    359     int32_t newLength=ucharsLength+length;
    360     if(ensureCapacity(newLength)) {
    361         ucharsLength=newLength;
    362         u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
    363     }
    364     return ucharsLength;
    365 }
    366 
    367 int32_t
    368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
    369     return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
    370 }
    371 
    372 int32_t
    373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    374     if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
    375         return write(i|(isFinal<<15));
    376     }
    377     UChar intUnits[3];
    378     int32_t length;
    379     if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
    380         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
    381         intUnits[1]=(UChar)((uint32_t)i>>16);
    382         intUnits[2]=(UChar)i;
    383         length=3;
    384     // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
    385     //     intUnits[0]=(UChar)(i);
    386     //     length=1;
    387     } else {
    388         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
    389         intUnits[1]=(UChar)i;
    390         length=2;
    391     }
    392     intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
    393     return write(intUnits, length);
    394 }
    395 
    396 int32_t
    397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    398     if(!hasValue) {
    399         return write(node);
    400     }
    401     UChar intUnits[3];
    402     int32_t length;
    403     if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
    404         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
    405         intUnits[1]=(UChar)((uint32_t)value>>16);
    406         intUnits[2]=(UChar)value;
    407         length=3;
    408     } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
    409         intUnits[0]=(UChar)((value+1)<<6);
    410         length=1;
    411     } else {
    412         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
    413         intUnits[1]=(UChar)value;
    414         length=2;
    415     }
    416     intUnits[0]|=(UChar)node;
    417     return write(intUnits, length);
    418 }
    419 
    420 int32_t
    421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    422     int32_t i=ucharsLength-jumpTarget;
    423     U_ASSERT(i>=0);
    424     if(i<=UCharsTrie::kMaxOneUnitDelta) {
    425         return write(i);
    426     }
    427     UChar intUnits[3];
    428     int32_t length;
    429     if(i<=UCharsTrie::kMaxTwoUnitDelta) {
    430         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
    431         length=1;
    432     } else {
    433         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
    434         intUnits[1]=(UChar)(i>>16);
    435         length=2;
    436     }
    437     intUnits[length++]=(UChar)i;
    438     return write(intUnits, length);
    439 }
    440 
    441 U_NAMESPACE_END
    442