Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  bytestriebuilder.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010sep25
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/bytestrie.h"
     17 #include "unicode/bytestriebuilder.h"
     18 #include "unicode/stringpiece.h"
     19 #include "charstr.h"
     20 #include "cmemory.h"
     21 #include "uhash.h"
     22 #include "uarrsort.h"
     23 #include "uassert.h"
     24 #include "ustr_imp.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 /*
     29  * Note: This builder implementation stores (bytes, value) pairs with full copies
     30  * of the byte sequences, until the BytesTrie is built.
     31  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     32  */
     33 
     34 class BytesTrieElement : public UMemory {
     35 public:
     36     // Use compiler's default constructor, initializes nothing.
     37 
     38     void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
     39 
     40     StringPiece getString(const CharString &strings) const {
     41         int32_t offset=stringOffset;
     42         int32_t length;
     43         if(offset>=0) {
     44             length=(uint8_t)strings[offset++];
     45         } else {
     46             offset=~offset;
     47             length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
     48             offset+=2;
     49         }
     50         return StringPiece(strings.data()+offset, length);
     51     }
     52     int32_t getStringLength(const CharString &strings) const {
     53         int32_t offset=stringOffset;
     54         if(offset>=0) {
     55             return (uint8_t)strings[offset];
     56         } else {
     57             offset=~offset;
     58             return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
     59         }
     60     }
     61 
     62     char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
     63 
     64     int32_t getValue() const { return value; }
     65 
     66     int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
     67 
     68 private:
     69     const char *data(const CharString &strings) const {
     70         int32_t offset=stringOffset;
     71         if(offset>=0) {
     72             ++offset;
     73         } else {
     74             offset=~offset+2;
     75         }
     76         return strings.data()+offset;
     77     }
     78 
     79     // If the stringOffset is non-negative, then the first strings byte contains
     80     // the string length.
     81     // If the stringOffset is negative, then the first two strings bytes contain
     82     // the string length (big-endian), and the offset needs to be bit-inverted.
     83     // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
     84     int32_t stringOffset;
     85     int32_t value;
     86 };
     87 
     88 void
     89 BytesTrieElement::setTo(const StringPiece &s, int32_t val,
     90                         CharString &strings, UErrorCode &errorCode) {
     91     if(U_FAILURE(errorCode)) {
     92         return;
     93     }
     94     int32_t length=s.length();
     95     if(length>0xffff) {
     96         // Too long: We store the length in 1 or 2 bytes.
     97         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     98         return;
     99     }
    100     int32_t offset=strings.length();
    101     if(length>0xff) {
    102         offset=~offset;
    103         strings.append((char)(length>>8), errorCode);
    104     }
    105     strings.append((char)length, errorCode);
    106     stringOffset=offset;
    107     value=val;
    108     strings.append(s, errorCode);
    109 }
    110 
    111 int32_t
    112 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
    113     // TODO: add StringPiece::compare(), see ticket #8187
    114     StringPiece thisString=getString(strings);
    115     StringPiece otherString=other.getString(strings);
    116     int32_t lengthDiff=thisString.length()-otherString.length();
    117     int32_t commonLength;
    118     if(lengthDiff<=0) {
    119         commonLength=thisString.length();
    120     } else {
    121         commonLength=otherString.length();
    122     }
    123     int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
    124     return diff!=0 ? diff : lengthDiff;
    125 }
    126 
    127 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
    128         : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
    129           bytes(NULL), bytesCapacity(0), bytesLength(0) {
    130     if(U_FAILURE(errorCode)) {
    131         return;
    132     }
    133     strings=new CharString();
    134     if(strings==NULL) {
    135         errorCode=U_MEMORY_ALLOCATION_ERROR;
    136     }
    137 }
    138 
    139 BytesTrieBuilder::~BytesTrieBuilder() {
    140     delete strings;
    141     delete[] elements;
    142     uprv_free(bytes);
    143 }
    144 
    145 BytesTrieBuilder &
    146 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
    147     if(U_FAILURE(errorCode)) {
    148         return *this;
    149     }
    150     if(bytesLength>0) {
    151         // Cannot add elements after building.
    152         errorCode=U_NO_WRITE_PERMISSION;
    153         return *this;
    154     }
    155     if(elementsLength==elementsCapacity) {
    156         int32_t newCapacity;
    157         if(elementsCapacity==0) {
    158             newCapacity=1024;
    159         } else {
    160             newCapacity=4*elementsCapacity;
    161         }
    162         BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
    163         if(newElements==NULL) {
    164             errorCode=U_MEMORY_ALLOCATION_ERROR;
    165             return *this; // error instead of dereferencing null
    166         }
    167         if(elementsLength>0) {
    168             uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
    169         }
    170         delete[] elements;
    171         elements=newElements;
    172         elementsCapacity=newCapacity;
    173     }
    174     elements[elementsLength++].setTo(s, value, *strings, errorCode);
    175     return *this;
    176 }
    177 
    178 U_CDECL_BEGIN
    179 
    180 static int32_t U_CALLCONV
    181 compareElementStrings(const void *context, const void *left, const void *right) {
    182     const CharString *strings=static_cast<const CharString *>(context);
    183     const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
    184     const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
    185     return leftElement->compareStringTo(*rightElement, *strings);
    186 }
    187 
    188 U_CDECL_END
    189 
    190 BytesTrie *
    191 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    192     buildBytes(buildOption, errorCode);
    193     BytesTrie *newTrie=NULL;
    194     if(U_SUCCESS(errorCode)) {
    195         newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
    196         if(newTrie==NULL) {
    197             errorCode=U_MEMORY_ALLOCATION_ERROR;
    198         } else {
    199             bytes=NULL;  // The new trie now owns the array.
    200             bytesCapacity=0;
    201         }
    202     }
    203     return newTrie;
    204 }
    205 
    206 StringPiece
    207 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    208     buildBytes(buildOption, errorCode);
    209     StringPiece result;
    210     if(U_SUCCESS(errorCode)) {
    211         result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    212     }
    213     return result;
    214 }
    215 
    216 void
    217 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    218     if(U_FAILURE(errorCode)) {
    219         return;
    220     }
    221     if(bytes!=NULL && bytesLength>0) {
    222         // Already built.
    223         return;
    224     }
    225     if(bytesLength==0) {
    226         if(elementsLength==0) {
    227             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    228             return;
    229         }
    230         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
    231                       compareElementStrings, strings,
    232                       FALSE,  // need not be a stable sort
    233                       &errorCode);
    234         if(U_FAILURE(errorCode)) {
    235             return;
    236         }
    237         // Duplicate strings are not allowed.
    238         StringPiece prev=elements[0].getString(*strings);
    239         for(int32_t i=1; i<elementsLength; ++i) {
    240             StringPiece current=elements[i].getString(*strings);
    241             if(prev==current) {
    242                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    243                 return;
    244             }
    245             prev=current;
    246         }
    247     }
    248     // Create and byte-serialize the trie for the elements.
    249     bytesLength=0;
    250     int32_t capacity=strings->length();
    251     if(capacity<1024) {
    252         capacity=1024;
    253     }
    254     if(bytesCapacity<capacity) {
    255         uprv_free(bytes);
    256         bytes=static_cast<char *>(uprv_malloc(capacity));
    257         if(bytes==NULL) {
    258             errorCode=U_MEMORY_ALLOCATION_ERROR;
    259             bytesCapacity=0;
    260             return;
    261         }
    262         bytesCapacity=capacity;
    263     }
    264     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    265     if(bytes==NULL) {
    266         errorCode=U_MEMORY_ALLOCATION_ERROR;
    267     }
    268 }
    269 
    270 BytesTrieBuilder &
    271 BytesTrieBuilder::clear() {
    272     strings->clear();
    273     elementsLength=0;
    274     bytesLength=0;
    275     return *this;
    276 }
    277 
    278 int32_t
    279 BytesTrieBuilder::getElementStringLength(int32_t i) const {
    280     return elements[i].getStringLength(*strings);
    281 }
    282 
    283 UChar
    284 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    285     return (uint8_t)elements[i].charAt(byteIndex, *strings);
    286 }
    287 
    288 int32_t
    289 BytesTrieBuilder::getElementValue(int32_t i) const {
    290     return elements[i].getValue();
    291 }
    292 
    293 int32_t
    294 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    295     const BytesTrieElement &firstElement=elements[first];
    296     const BytesTrieElement &lastElement=elements[last];
    297     int32_t minStringLength=firstElement.getStringLength(*strings);
    298     while(++byteIndex<minStringLength &&
    299             firstElement.charAt(byteIndex, *strings)==
    300             lastElement.charAt(byteIndex, *strings)) {}
    301     return byteIndex;
    302 }
    303 
    304 int32_t
    305 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    306     int32_t length=0;  // Number of different bytes at byteIndex.
    307     int32_t i=start;
    308     do {
    309         char byte=elements[i++].charAt(byteIndex, *strings);
    310         while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
    311             ++i;
    312         }
    313         ++length;
    314     } while(i<limit);
    315     return length;
    316 }
    317 
    318 int32_t
    319 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    320     do {
    321         char byte=elements[i++].charAt(byteIndex, *strings);
    322         while(byte==elements[i].charAt(byteIndex, *strings)) {
    323             ++i;
    324         }
    325     } while(--count>0);
    326     return i;
    327 }
    328 
    329 int32_t
    330 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
    331     char b=(char)byte;
    332     while(b==elements[i].charAt(byteIndex, *strings)) {
    333         ++i;
    334     }
    335     return i;
    336 }
    337 
    338 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
    339         : LinearMatchNode(len, nextNode), s(bytes) {
    340     hash=hash*37+ustr_hashCharsN(bytes, len);
    341 }
    342 
    343 UBool
    344 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    345     if(this==&other) {
    346         return TRUE;
    347     }
    348     if(!LinearMatchNode::operator==(other)) {
    349         return FALSE;
    350     }
    351     const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
    352     return 0==uprv_memcmp(s, o.s, length);
    353 }
    354 
    355 void
    356 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
    357     BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
    358     next->write(builder);
    359     b.write(s, length);
    360     offset=b.write(b.getMinLinearMatch()+length-1);
    361 }
    362 
    363 StringTrieBuilder::Node *
    364 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
    365                                         Node *nextNode) const {
    366     return new BTLinearMatchNode(
    367             elements[i].getString(*strings).data()+byteIndex,
    368             length,
    369             nextNode);
    370 }
    371 
    372 UBool
    373 BytesTrieBuilder::ensureCapacity(int32_t length) {
    374     if(bytes==NULL) {
    375         return FALSE;  // previous memory allocation had failed
    376     }
    377     if(length>bytesCapacity) {
    378         int32_t newCapacity=bytesCapacity;
    379         do {
    380             newCapacity*=2;
    381         } while(newCapacity<=length);
    382         char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
    383         if(newBytes==NULL) {
    384             // unable to allocate memory
    385             uprv_free(bytes);
    386             bytes=NULL;
    387             bytesCapacity=0;
    388             return FALSE;
    389         }
    390         uprv_memcpy(newBytes+(newCapacity-bytesLength),
    391                     bytes+(bytesCapacity-bytesLength), bytesLength);
    392         uprv_free(bytes);
    393         bytes=newBytes;
    394         bytesCapacity=newCapacity;
    395     }
    396     return TRUE;
    397 }
    398 
    399 int32_t
    400 BytesTrieBuilder::write(int32_t byte) {
    401     int32_t newLength=bytesLength+1;
    402     if(ensureCapacity(newLength)) {
    403         bytesLength=newLength;
    404         bytes[bytesCapacity-bytesLength]=(char)byte;
    405     }
    406     return bytesLength;
    407 }
    408 
    409 int32_t
    410 BytesTrieBuilder::write(const char *b, int32_t length) {
    411     int32_t newLength=bytesLength+length;
    412     if(ensureCapacity(newLength)) {
    413         bytesLength=newLength;
    414         uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
    415     }
    416     return bytesLength;
    417 }
    418 
    419 int32_t
    420 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    421     return write(elements[i].getString(*strings).data()+byteIndex, length);
    422 }
    423 
    424 int32_t
    425 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    426     if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
    427         return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
    428     }
    429     char intBytes[5];
    430     int32_t length=1;
    431     if(i<0 || i>0xffffff) {
    432         intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
    433         intBytes[1]=(char)((uint32_t)i>>24);
    434         intBytes[2]=(char)((uint32_t)i>>16);
    435         intBytes[3]=(char)((uint32_t)i>>8);
    436         intBytes[4]=(char)i;
    437         length=5;
    438     // } else if(i<=BytesTrie::kMaxOneByteValue) {
    439     //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
    440     } else {
    441         if(i<=BytesTrie::kMaxTwoByteValue) {
    442             intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
    443         } else {
    444             if(i<=BytesTrie::kMaxThreeByteValue) {
    445                 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
    446             } else {
    447                 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
    448                 intBytes[1]=(char)(i>>16);
    449                 length=2;
    450             }
    451             intBytes[length++]=(char)(i>>8);
    452         }
    453         intBytes[length++]=(char)i;
    454     }
    455     intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
    456     return write(intBytes, length);
    457 }
    458 
    459 int32_t
    460 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    461     int32_t offset=write(node);
    462     if(hasValue) {
    463         offset=writeValueAndFinal(value, FALSE);
    464     }
    465     return offset;
    466 }
    467 
    468 int32_t
    469 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    470     int32_t i=bytesLength-jumpTarget;
    471     U_ASSERT(i>=0);
    472     if(i<=BytesTrie::kMaxOneByteDelta) {
    473         return write(i);
    474     }
    475     char intBytes[5];
    476     int32_t length;
    477     if(i<=BytesTrie::kMaxTwoByteDelta) {
    478         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
    479         length=1;
    480     } else {
    481         if(i<=BytesTrie::kMaxThreeByteDelta) {
    482             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
    483             length=2;
    484         } else {
    485             if(i<=0xffffff) {
    486                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
    487                 length=3;
    488             } else {
    489                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
    490                 intBytes[1]=(char)(i>>24);
    491                 length=4;
    492             }
    493             intBytes[1]=(char)(i>>16);
    494         }
    495         intBytes[1]=(char)(i>>8);
    496     }
    497     intBytes[length++]=(char)i;
    498     return write(intBytes, length);
    499 }
    500 
    501 U_NAMESPACE_END
    502