Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  bytestriebuilder.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010sep25
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/bytestrie.h"
     19 #include "unicode/bytestriebuilder.h"
     20 #include "unicode/stringpiece.h"
     21 #include "charstr.h"
     22 #include "cmemory.h"
     23 #include "uhash.h"
     24 #include "uarrsort.h"
     25 #include "uassert.h"
     26 #include "ustr_imp.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 /*
     31  * Note: This builder implementation stores (bytes, value) pairs with full copies
     32  * of the byte sequences, until the BytesTrie is built.
     33  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     34  */
     35 
     36 class BytesTrieElement : public UMemory {
     37 public:
     38     // Use compiler's default constructor, initializes nothing.
     39 
     40     void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
     41 
     42     StringPiece getString(const CharString &strings) const {
     43         int32_t offset=stringOffset;
     44         int32_t length;
     45         if(offset>=0) {
     46             length=(uint8_t)strings[offset++];
     47         } else {
     48             offset=~offset;
     49             length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
     50             offset+=2;
     51         }
     52         return StringPiece(strings.data()+offset, length);
     53     }
     54     int32_t getStringLength(const CharString &strings) const {
     55         int32_t offset=stringOffset;
     56         if(offset>=0) {
     57             return (uint8_t)strings[offset];
     58         } else {
     59             offset=~offset;
     60             return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
     61         }
     62     }
     63 
     64     char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
     65 
     66     int32_t getValue() const { return value; }
     67 
     68     int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
     69 
     70 private:
     71     const char *data(const CharString &strings) const {
     72         int32_t offset=stringOffset;
     73         if(offset>=0) {
     74             ++offset;
     75         } else {
     76             offset=~offset+2;
     77         }
     78         return strings.data()+offset;
     79     }
     80 
     81     // If the stringOffset is non-negative, then the first strings byte contains
     82     // the string length.
     83     // If the stringOffset is negative, then the first two strings bytes contain
     84     // the string length (big-endian), and the offset needs to be bit-inverted.
     85     // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
     86     int32_t stringOffset;
     87     int32_t value;
     88 };
     89 
     90 void
     91 BytesTrieElement::setTo(StringPiece s, int32_t val,
     92                         CharString &strings, UErrorCode &errorCode) {
     93     if(U_FAILURE(errorCode)) {
     94         return;
     95     }
     96     int32_t length=s.length();
     97     if(length>0xffff) {
     98         // Too long: We store the length in 1 or 2 bytes.
     99         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    100         return;
    101     }
    102     int32_t offset=strings.length();
    103     if(length>0xff) {
    104         offset=~offset;
    105         strings.append((char)(length>>8), errorCode);
    106     }
    107     strings.append((char)length, errorCode);
    108     stringOffset=offset;
    109     value=val;
    110     strings.append(s, errorCode);
    111 }
    112 
    113 int32_t
    114 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
    115     // TODO: add StringPiece::compare(), see ticket #8187
    116     StringPiece thisString=getString(strings);
    117     StringPiece otherString=other.getString(strings);
    118     int32_t lengthDiff=thisString.length()-otherString.length();
    119     int32_t commonLength;
    120     if(lengthDiff<=0) {
    121         commonLength=thisString.length();
    122     } else {
    123         commonLength=otherString.length();
    124     }
    125     int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
    126     return diff!=0 ? diff : lengthDiff;
    127 }
    128 
    129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
    130         : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
    131           bytes(NULL), bytesCapacity(0), bytesLength(0) {
    132     if(U_FAILURE(errorCode)) {
    133         return;
    134     }
    135     strings=new CharString();
    136     if(strings==NULL) {
    137         errorCode=U_MEMORY_ALLOCATION_ERROR;
    138     }
    139 }
    140 
    141 BytesTrieBuilder::~BytesTrieBuilder() {
    142     delete strings;
    143     delete[] elements;
    144     uprv_free(bytes);
    145 }
    146 
    147 BytesTrieBuilder &
    148 BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
    149     if(U_FAILURE(errorCode)) {
    150         return *this;
    151     }
    152     if(bytesLength>0) {
    153         // Cannot add elements after building.
    154         errorCode=U_NO_WRITE_PERMISSION;
    155         return *this;
    156     }
    157     if(elementsLength==elementsCapacity) {
    158         int32_t newCapacity;
    159         if(elementsCapacity==0) {
    160             newCapacity=1024;
    161         } else {
    162             newCapacity=4*elementsCapacity;
    163         }
    164         BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
    165         if(newElements==NULL) {
    166             errorCode=U_MEMORY_ALLOCATION_ERROR;
    167             return *this; // error instead of dereferencing null
    168         }
    169         if(elementsLength>0) {
    170             uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
    171         }
    172         delete[] elements;
    173         elements=newElements;
    174         elementsCapacity=newCapacity;
    175     }
    176     elements[elementsLength++].setTo(s, value, *strings, errorCode);
    177     return *this;
    178 }
    179 
    180 U_CDECL_BEGIN
    181 
    182 static int32_t U_CALLCONV
    183 compareElementStrings(const void *context, const void *left, const void *right) {
    184     const CharString *strings=static_cast<const CharString *>(context);
    185     const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
    186     const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
    187     return leftElement->compareStringTo(*rightElement, *strings);
    188 }
    189 
    190 U_CDECL_END
    191 
    192 BytesTrie *
    193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    194     buildBytes(buildOption, errorCode);
    195     BytesTrie *newTrie=NULL;
    196     if(U_SUCCESS(errorCode)) {
    197         newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
    198         if(newTrie==NULL) {
    199             errorCode=U_MEMORY_ALLOCATION_ERROR;
    200         } else {
    201             bytes=NULL;  // The new trie now owns the array.
    202             bytesCapacity=0;
    203         }
    204     }
    205     return newTrie;
    206 }
    207 
    208 StringPiece
    209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    210     buildBytes(buildOption, errorCode);
    211     StringPiece result;
    212     if(U_SUCCESS(errorCode)) {
    213         result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    214     }
    215     return result;
    216 }
    217 
    218 void
    219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    220     if(U_FAILURE(errorCode)) {
    221         return;
    222     }
    223     if(bytes!=NULL && bytesLength>0) {
    224         // Already built.
    225         return;
    226     }
    227     if(bytesLength==0) {
    228         if(elementsLength==0) {
    229             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    230             return;
    231         }
    232         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
    233                       compareElementStrings, strings,
    234                       FALSE,  // need not be a stable sort
    235                       &errorCode);
    236         if(U_FAILURE(errorCode)) {
    237             return;
    238         }
    239         // Duplicate strings are not allowed.
    240         StringPiece prev=elements[0].getString(*strings);
    241         for(int32_t i=1; i<elementsLength; ++i) {
    242             StringPiece current=elements[i].getString(*strings);
    243             if(prev==current) {
    244                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    245                 return;
    246             }
    247             prev=current;
    248         }
    249     }
    250     // Create and byte-serialize the trie for the elements.
    251     bytesLength=0;
    252     int32_t capacity=strings->length();
    253     if(capacity<1024) {
    254         capacity=1024;
    255     }
    256     if(bytesCapacity<capacity) {
    257         uprv_free(bytes);
    258         bytes=static_cast<char *>(uprv_malloc(capacity));
    259         if(bytes==NULL) {
    260             errorCode=U_MEMORY_ALLOCATION_ERROR;
    261             bytesCapacity=0;
    262             return;
    263         }
    264         bytesCapacity=capacity;
    265     }
    266     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    267     if(bytes==NULL) {
    268         errorCode=U_MEMORY_ALLOCATION_ERROR;
    269     }
    270 }
    271 
    272 BytesTrieBuilder &
    273 BytesTrieBuilder::clear() {
    274     strings->clear();
    275     elementsLength=0;
    276     bytesLength=0;
    277     return *this;
    278 }
    279 
    280 int32_t
    281 BytesTrieBuilder::getElementStringLength(int32_t i) const {
    282     return elements[i].getStringLength(*strings);
    283 }
    284 
    285 UChar
    286 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    287     return (uint8_t)elements[i].charAt(byteIndex, *strings);
    288 }
    289 
    290 int32_t
    291 BytesTrieBuilder::getElementValue(int32_t i) const {
    292     return elements[i].getValue();
    293 }
    294 
    295 int32_t
    296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    297     const BytesTrieElement &firstElement=elements[first];
    298     const BytesTrieElement &lastElement=elements[last];
    299     int32_t minStringLength=firstElement.getStringLength(*strings);
    300     while(++byteIndex<minStringLength &&
    301             firstElement.charAt(byteIndex, *strings)==
    302             lastElement.charAt(byteIndex, *strings)) {}
    303     return byteIndex;
    304 }
    305 
    306 int32_t
    307 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    308     int32_t length=0;  // Number of different bytes at byteIndex.
    309     int32_t i=start;
    310     do {
    311         char byte=elements[i++].charAt(byteIndex, *strings);
    312         while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
    313             ++i;
    314         }
    315         ++length;
    316     } while(i<limit);
    317     return length;
    318 }
    319 
    320 int32_t
    321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    322     do {
    323         char byte=elements[i++].charAt(byteIndex, *strings);
    324         while(byte==elements[i].charAt(byteIndex, *strings)) {
    325             ++i;
    326         }
    327     } while(--count>0);
    328     return i;
    329 }
    330 
    331 int32_t
    332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
    333     char b=(char)byte;
    334     while(b==elements[i].charAt(byteIndex, *strings)) {
    335         ++i;
    336     }
    337     return i;
    338 }
    339 
    340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
    341         : LinearMatchNode(len, nextNode), s(bytes) {
    342     hash=hash*37+ustr_hashCharsN(bytes, len);
    343 }
    344 
    345 UBool
    346 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    347     if(this==&other) {
    348         return TRUE;
    349     }
    350     if(!LinearMatchNode::operator==(other)) {
    351         return FALSE;
    352     }
    353     const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
    354     return 0==uprv_memcmp(s, o.s, length);
    355 }
    356 
    357 void
    358 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
    359     BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
    360     next->write(builder);
    361     b.write(s, length);
    362     offset=b.write(b.getMinLinearMatch()+length-1);
    363 }
    364 
    365 StringTrieBuilder::Node *
    366 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
    367                                         Node *nextNode) const {
    368     return new BTLinearMatchNode(
    369             elements[i].getString(*strings).data()+byteIndex,
    370             length,
    371             nextNode);
    372 }
    373 
    374 UBool
    375 BytesTrieBuilder::ensureCapacity(int32_t length) {
    376     if(bytes==NULL) {
    377         return FALSE;  // previous memory allocation had failed
    378     }
    379     if(length>bytesCapacity) {
    380         int32_t newCapacity=bytesCapacity;
    381         do {
    382             newCapacity*=2;
    383         } while(newCapacity<=length);
    384         char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
    385         if(newBytes==NULL) {
    386             // unable to allocate memory
    387             uprv_free(bytes);
    388             bytes=NULL;
    389             bytesCapacity=0;
    390             return FALSE;
    391         }
    392         uprv_memcpy(newBytes+(newCapacity-bytesLength),
    393                     bytes+(bytesCapacity-bytesLength), bytesLength);
    394         uprv_free(bytes);
    395         bytes=newBytes;
    396         bytesCapacity=newCapacity;
    397     }
    398     return TRUE;
    399 }
    400 
    401 int32_t
    402 BytesTrieBuilder::write(int32_t byte) {
    403     int32_t newLength=bytesLength+1;
    404     if(ensureCapacity(newLength)) {
    405         bytesLength=newLength;
    406         bytes[bytesCapacity-bytesLength]=(char)byte;
    407     }
    408     return bytesLength;
    409 }
    410 
    411 int32_t
    412 BytesTrieBuilder::write(const char *b, int32_t length) {
    413     int32_t newLength=bytesLength+length;
    414     if(ensureCapacity(newLength)) {
    415         bytesLength=newLength;
    416         uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
    417     }
    418     return bytesLength;
    419 }
    420 
    421 int32_t
    422 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    423     return write(elements[i].getString(*strings).data()+byteIndex, length);
    424 }
    425 
    426 int32_t
    427 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    428     if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
    429         return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
    430     }
    431     char intBytes[5];
    432     int32_t length=1;
    433     if(i<0 || i>0xffffff) {
    434         intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
    435         intBytes[1]=(char)((uint32_t)i>>24);
    436         intBytes[2]=(char)((uint32_t)i>>16);
    437         intBytes[3]=(char)((uint32_t)i>>8);
    438         intBytes[4]=(char)i;
    439         length=5;
    440     // } else if(i<=BytesTrie::kMaxOneByteValue) {
    441     //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
    442     } else {
    443         if(i<=BytesTrie::kMaxTwoByteValue) {
    444             intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
    445         } else {
    446             if(i<=BytesTrie::kMaxThreeByteValue) {
    447                 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
    448             } else {
    449                 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
    450                 intBytes[1]=(char)(i>>16);
    451                 length=2;
    452             }
    453             intBytes[length++]=(char)(i>>8);
    454         }
    455         intBytes[length++]=(char)i;
    456     }
    457     intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
    458     return write(intBytes, length);
    459 }
    460 
    461 int32_t
    462 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    463     int32_t offset=write(node);
    464     if(hasValue) {
    465         offset=writeValueAndFinal(value, FALSE);
    466     }
    467     return offset;
    468 }
    469 
    470 int32_t
    471 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    472     int32_t i=bytesLength-jumpTarget;
    473     U_ASSERT(i>=0);
    474     if(i<=BytesTrie::kMaxOneByteDelta) {
    475         return write(i);
    476     }
    477     char intBytes[5];
    478     int32_t length;
    479     if(i<=BytesTrie::kMaxTwoByteDelta) {
    480         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
    481         length=1;
    482     } else {
    483         if(i<=BytesTrie::kMaxThreeByteDelta) {
    484             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
    485             length=2;
    486         } else {
    487             if(i<=0xffffff) {
    488                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
    489                 length=3;
    490             } else {
    491                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
    492                 intBytes[1]=(char)(i>>24);
    493                 length=4;
    494             }
    495             intBytes[1]=(char)(i>>16);
    496         }
    497         intBytes[1]=(char)(i>>8);
    498     }
    499     intBytes[length++]=(char)i;
    500     return write(intBytes, length);
    501 }
    502 
    503 U_NAMESPACE_END
    504