Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2010-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  bytestriebuilder.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2010sep25
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/bytestrie.h"
     17 #include "unicode/bytestriebuilder.h"
     18 #include "unicode/stringpiece.h"
     19 #include "charstr.h"
     20 #include "cmemory.h"
     21 #include "uhash.h"
     22 #include "uarrsort.h"
     23 #include "uassert.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 /*
     28  * Note: This builder implementation stores (bytes, value) pairs with full copies
     29  * of the byte sequences, until the BytesTrie is built.
     30  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     31  */
     32 
     33 class BytesTrieElement : public UMemory {
     34 public:
     35     // Use compiler's default constructor, initializes nothing.
     36 
     37     void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
     38 
     39     StringPiece getString(const CharString &strings) const {
     40         int32_t offset=stringOffset;
     41         int32_t length;
     42         if(offset>=0) {
     43             length=(uint8_t)strings[offset++];
     44         } else {
     45             offset=~offset;
     46             length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
     47             offset+=2;
     48         }
     49         return StringPiece(strings.data()+offset, length);
     50     }
     51     int32_t getStringLength(const CharString &strings) const {
     52         int32_t offset=stringOffset;
     53         if(offset>=0) {
     54             return (uint8_t)strings[offset];
     55         } else {
     56             offset=~offset;
     57             return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
     58         }
     59     }
     60 
     61     char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
     62 
     63     int32_t getValue() const { return value; }
     64 
     65     int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
     66 
     67 private:
     68     const char *data(const CharString &strings) const {
     69         int32_t offset=stringOffset;
     70         if(offset>=0) {
     71             ++offset;
     72         } else {
     73             offset=~offset+2;
     74         }
     75         return strings.data()+offset;
     76     }
     77 
     78     // If the stringOffset is non-negative, then the first strings byte contains
     79     // the string length.
     80     // If the stringOffset is negative, then the first two strings bytes contain
     81     // the string length (big-endian), and the offset needs to be bit-inverted.
     82     // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
     83     int32_t stringOffset;
     84     int32_t value;
     85 };
     86 
     87 void
     88 BytesTrieElement::setTo(const StringPiece &s, int32_t val,
     89                         CharString &strings, UErrorCode &errorCode) {
     90     if(U_FAILURE(errorCode)) {
     91         return;
     92     }
     93     int32_t length=s.length();
     94     if(length>0xffff) {
     95         // Too long: We store the length in 1 or 2 bytes.
     96         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     97         return;
     98     }
     99     int32_t offset=strings.length();
    100     if(length>0xff) {
    101         offset=~offset;
    102         strings.append((char)(length>>8), errorCode);
    103     }
    104     strings.append((char)length, errorCode);
    105     stringOffset=offset;
    106     value=val;
    107     strings.append(s, errorCode);
    108 }
    109 
    110 int32_t
    111 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
    112     // TODO: add StringPiece::compare(), see ticket #8187
    113     StringPiece thisString=getString(strings);
    114     StringPiece otherString=other.getString(strings);
    115     int32_t lengthDiff=thisString.length()-otherString.length();
    116     int32_t commonLength;
    117     if(lengthDiff<=0) {
    118         commonLength=thisString.length();
    119     } else {
    120         commonLength=otherString.length();
    121     }
    122     int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
    123     return diff!=0 ? diff : lengthDiff;
    124 }
    125 
    126 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
    127         : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
    128           bytes(NULL), bytesCapacity(0), bytesLength(0) {
    129     if(U_FAILURE(errorCode)) {
    130         return;
    131     }
    132     strings=new CharString();
    133     if(strings==NULL) {
    134         errorCode=U_MEMORY_ALLOCATION_ERROR;
    135     }
    136 }
    137 
    138 BytesTrieBuilder::~BytesTrieBuilder() {
    139     delete strings;
    140     delete[] elements;
    141     uprv_free(bytes);
    142 }
    143 
    144 BytesTrieBuilder &
    145 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
    146     if(U_FAILURE(errorCode)) {
    147         return *this;
    148     }
    149     if(bytesLength>0) {
    150         // Cannot add elements after building.
    151         errorCode=U_NO_WRITE_PERMISSION;
    152         return *this;
    153     }
    154     if(elementsLength==elementsCapacity) {
    155         int32_t newCapacity;
    156         if(elementsCapacity==0) {
    157             newCapacity=1024;
    158         } else {
    159             newCapacity=4*elementsCapacity;
    160         }
    161         BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
    162         if(newElements==NULL) {
    163             errorCode=U_MEMORY_ALLOCATION_ERROR;
    164         }
    165         if(elementsLength>0) {
    166             uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
    167         }
    168         delete[] elements;
    169         elements=newElements;
    170         elementsCapacity=newCapacity;
    171     }
    172     elements[elementsLength++].setTo(s, value, *strings, errorCode);
    173     return *this;
    174 }
    175 
    176 U_CDECL_BEGIN
    177 
    178 static int32_t U_CALLCONV
    179 compareElementStrings(const void *context, const void *left, const void *right) {
    180     const CharString *strings=reinterpret_cast<const CharString *>(context);
    181     const BytesTrieElement *leftElement=reinterpret_cast<const BytesTrieElement *>(left);
    182     const BytesTrieElement *rightElement=reinterpret_cast<const BytesTrieElement *>(right);
    183     return leftElement->compareStringTo(*rightElement, *strings);
    184 }
    185 
    186 U_CDECL_END
    187 
    188 BytesTrie *
    189 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    190     buildBytes(buildOption, errorCode);
    191     BytesTrie *newTrie=NULL;
    192     if(U_SUCCESS(errorCode)) {
    193         newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
    194         if(newTrie==NULL) {
    195             errorCode=U_MEMORY_ALLOCATION_ERROR;
    196         } else {
    197             bytes=NULL;  // The new trie now owns the array.
    198             bytesCapacity=0;
    199         }
    200     }
    201     return newTrie;
    202 }
    203 
    204 StringPiece
    205 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    206     buildBytes(buildOption, errorCode);
    207     StringPiece result;
    208     if(U_SUCCESS(errorCode)) {
    209         result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    210     }
    211     return result;
    212 }
    213 
    214 void
    215 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    216     if(U_FAILURE(errorCode)) {
    217         return;
    218     }
    219     if(bytes!=NULL && bytesLength>0) {
    220         // Already built.
    221         return;
    222     }
    223     if(bytesLength==0) {
    224         if(elementsLength==0) {
    225             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    226             return;
    227         }
    228         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
    229                       compareElementStrings, strings,
    230                       FALSE,  // need not be a stable sort
    231                       &errorCode);
    232         if(U_FAILURE(errorCode)) {
    233             return;
    234         }
    235         // Duplicate strings are not allowed.
    236         StringPiece prev=elements[0].getString(*strings);
    237         for(int32_t i=1; i<elementsLength; ++i) {
    238             StringPiece current=elements[i].getString(*strings);
    239             if(prev==current) {
    240                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    241                 return;
    242             }
    243             prev=current;
    244         }
    245     }
    246     // Create and byte-serialize the trie for the elements.
    247     bytesLength=0;
    248     int32_t capacity=strings->length();
    249     if(capacity<1024) {
    250         capacity=1024;
    251     }
    252     if(bytesCapacity<capacity) {
    253         uprv_free(bytes);
    254         bytes=reinterpret_cast<char *>(uprv_malloc(capacity));
    255         if(bytes==NULL) {
    256             errorCode=U_MEMORY_ALLOCATION_ERROR;
    257             bytesCapacity=0;
    258             return;
    259         }
    260         bytesCapacity=capacity;
    261     }
    262     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    263     if(bytes==NULL) {
    264         errorCode=U_MEMORY_ALLOCATION_ERROR;
    265     }
    266 }
    267 
    268 BytesTrieBuilder &
    269 BytesTrieBuilder::clear() {
    270     strings->clear();
    271     elementsLength=0;
    272     bytesLength=0;
    273     return *this;
    274 }
    275 
    276 int32_t
    277 BytesTrieBuilder::getElementStringLength(int32_t i) const {
    278     return elements[i].getStringLength(*strings);
    279 }
    280 
    281 UChar
    282 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    283     return (uint8_t)elements[i].charAt(byteIndex, *strings);
    284 }
    285 
    286 int32_t
    287 BytesTrieBuilder::getElementValue(int32_t i) const {
    288     return elements[i].getValue();
    289 }
    290 
    291 int32_t
    292 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    293     const BytesTrieElement &firstElement=elements[first];
    294     const BytesTrieElement &lastElement=elements[last];
    295     int32_t minStringLength=firstElement.getStringLength(*strings);
    296     while(++byteIndex<minStringLength &&
    297             firstElement.charAt(byteIndex, *strings)==
    298             lastElement.charAt(byteIndex, *strings)) {}
    299     return byteIndex;
    300 }
    301 
    302 int32_t
    303 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    304     int32_t length=0;  // Number of different bytes at byteIndex.
    305     int32_t i=start;
    306     do {
    307         char byte=elements[i++].charAt(byteIndex, *strings);
    308         while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
    309             ++i;
    310         }
    311         ++length;
    312     } while(i<limit);
    313     return length;
    314 }
    315 
    316 int32_t
    317 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    318     do {
    319         char byte=elements[i++].charAt(byteIndex, *strings);
    320         while(byte==elements[i].charAt(byteIndex, *strings)) {
    321             ++i;
    322         }
    323     } while(--count>0);
    324     return i;
    325 }
    326 
    327 int32_t
    328 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
    329     char b=(char)byte;
    330     while(b==elements[i].charAt(byteIndex, *strings)) {
    331         ++i;
    332     }
    333     return i;
    334 }
    335 
    336 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
    337         : LinearMatchNode(len, nextNode), s(bytes) {
    338     hash=hash*37+uhash_hashCharsN(bytes, len);
    339 }
    340 
    341 UBool
    342 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    343     if(this==&other) {
    344         return TRUE;
    345     }
    346     if(!LinearMatchNode::operator==(other)) {
    347         return FALSE;
    348     }
    349     const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
    350     return 0==uprv_memcmp(s, o.s, length);
    351 }
    352 
    353 void
    354 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
    355     BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
    356     next->write(builder);
    357     b.write(s, length);
    358     offset=b.write(b.getMinLinearMatch()+length-1);
    359 }
    360 
    361 StringTrieBuilder::Node *
    362 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
    363                                         Node *nextNode) const {
    364     return new BTLinearMatchNode(
    365             elements[i].getString(*strings).data()+byteIndex,
    366             length,
    367             nextNode);
    368 }
    369 
    370 UBool
    371 BytesTrieBuilder::ensureCapacity(int32_t length) {
    372     if(bytes==NULL) {
    373         return FALSE;  // previous memory allocation had failed
    374     }
    375     if(length>bytesCapacity) {
    376         int32_t newCapacity=bytesCapacity;
    377         do {
    378             newCapacity*=2;
    379         } while(newCapacity<=length);
    380         char *newBytes=reinterpret_cast<char *>(uprv_malloc(newCapacity));
    381         if(newBytes==NULL) {
    382             // unable to allocate memory
    383             uprv_free(bytes);
    384             bytes=NULL;
    385             bytesCapacity=0;
    386             return FALSE;
    387         }
    388         uprv_memcpy(newBytes+(newCapacity-bytesLength),
    389                     bytes+(bytesCapacity-bytesLength), bytesLength);
    390         uprv_free(bytes);
    391         bytes=newBytes;
    392         bytesCapacity=newCapacity;
    393     }
    394     return TRUE;
    395 }
    396 
    397 int32_t
    398 BytesTrieBuilder::write(int32_t byte) {
    399     int32_t newLength=bytesLength+1;
    400     if(ensureCapacity(newLength)) {
    401         bytesLength=newLength;
    402         bytes[bytesCapacity-bytesLength]=(char)byte;
    403     }
    404     return bytesLength;
    405 }
    406 
    407 int32_t
    408 BytesTrieBuilder::write(const char *b, int32_t length) {
    409     int32_t newLength=bytesLength+length;
    410     if(ensureCapacity(newLength)) {
    411         bytesLength=newLength;
    412         uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
    413     }
    414     return bytesLength;
    415 }
    416 
    417 int32_t
    418 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    419     return write(elements[i].getString(*strings).data()+byteIndex, length);
    420 }
    421 
    422 int32_t
    423 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    424     if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
    425         return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
    426     }
    427     char intBytes[5];
    428     int32_t length=1;
    429     if(i<0 || i>0xffffff) {
    430         intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
    431         intBytes[1]=(char)(i>>24);
    432         intBytes[2]=(char)(i>>16);
    433         intBytes[3]=(char)(i>>8);
    434         intBytes[4]=(char)i;
    435         length=5;
    436     // } else if(i<=BytesTrie::kMaxOneByteValue) {
    437     //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
    438     } else {
    439         if(i<=BytesTrie::kMaxTwoByteValue) {
    440             intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
    441         } else {
    442             if(i<=BytesTrie::kMaxThreeByteValue) {
    443                 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
    444             } else {
    445                 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
    446                 intBytes[1]=(char)(i>>16);
    447                 length=2;
    448             }
    449             intBytes[length++]=(char)(i>>8);
    450         }
    451         intBytes[length++]=(char)i;
    452     }
    453     intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
    454     return write(intBytes, length);
    455 }
    456 
    457 int32_t
    458 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    459     int32_t offset=write(node);
    460     if(hasValue) {
    461         offset=writeValueAndFinal(value, FALSE);
    462     }
    463     return offset;
    464 }
    465 
    466 int32_t
    467 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    468     int32_t i=bytesLength-jumpTarget;
    469     U_ASSERT(i>=0);
    470     if(i<=BytesTrie::kMaxOneByteDelta) {
    471         return write(i);
    472     }
    473     char intBytes[5];
    474     int32_t length;
    475     if(i<=BytesTrie::kMaxTwoByteDelta) {
    476         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
    477         length=1;
    478     } else {
    479         if(i<=BytesTrie::kMaxThreeByteDelta) {
    480             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
    481             length=2;
    482         } else {
    483             if(i<=0xffffff) {
    484                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
    485                 length=3;
    486             } else {
    487                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
    488                 intBytes[1]=(char)(i>>24);
    489                 length=4;
    490             }
    491             intBytes[1]=(char)(i>>16);
    492         }
    493         intBytes[1]=(char)(i>>8);
    494     }
    495     intBytes[length++]=(char)i;
    496     return write(intBytes, length);
    497 }
    498 
    499 U_NAMESPACE_END
    500