1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: bytestriebuilder.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010sep25 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/bytestrie.h" 19 #include "unicode/bytestriebuilder.h" 20 #include "unicode/stringpiece.h" 21 #include "charstr.h" 22 #include "cmemory.h" 23 #include "uhash.h" 24 #include "uarrsort.h" 25 #include "uassert.h" 26 #include "ustr_imp.h" 27 28 U_NAMESPACE_BEGIN 29 30 /* 31 * Note: This builder implementation stores (bytes, value) pairs with full copies 32 * of the byte sequences, until the BytesTrie is built. 33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 34 */ 35 36 class BytesTrieElement : public UMemory { 37 public: 38 // Use compiler's default constructor, initializes nothing. 39 40 void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode); 41 42 StringPiece getString(const CharString &strings) const { 43 int32_t offset=stringOffset; 44 int32_t length; 45 if(offset>=0) { 46 length=(uint8_t)strings[offset++]; 47 } else { 48 offset=~offset; 49 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 50 offset+=2; 51 } 52 return StringPiece(strings.data()+offset, length); 53 } 54 int32_t getStringLength(const CharString &strings) const { 55 int32_t offset=stringOffset; 56 if(offset>=0) { 57 return (uint8_t)strings[offset]; 58 } else { 59 offset=~offset; 60 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 61 } 62 } 63 64 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } 65 66 int32_t getValue() const { return value; } 67 68 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; 69 70 private: 71 const char *data(const CharString &strings) const { 72 int32_t offset=stringOffset; 73 if(offset>=0) { 74 ++offset; 75 } else { 76 offset=~offset+2; 77 } 78 return strings.data()+offset; 79 } 80 81 // If the stringOffset is non-negative, then the first strings byte contains 82 // the string length. 83 // If the stringOffset is negative, then the first two strings bytes contain 84 // the string length (big-endian), and the offset needs to be bit-inverted. 85 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) 86 int32_t stringOffset; 87 int32_t value; 88 }; 89 90 void 91 BytesTrieElement::setTo(StringPiece s, int32_t val, 92 CharString &strings, UErrorCode &errorCode) { 93 if(U_FAILURE(errorCode)) { 94 return; 95 } 96 int32_t length=s.length(); 97 if(length>0xffff) { 98 // Too long: We store the length in 1 or 2 bytes. 99 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 100 return; 101 } 102 int32_t offset=strings.length(); 103 if(length>0xff) { 104 offset=~offset; 105 strings.append((char)(length>>8), errorCode); 106 } 107 strings.append((char)length, errorCode); 108 stringOffset=offset; 109 value=val; 110 strings.append(s, errorCode); 111 } 112 113 int32_t 114 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { 115 // TODO: add StringPiece::compare(), see ticket #8187 116 StringPiece thisString=getString(strings); 117 StringPiece otherString=other.getString(strings); 118 int32_t lengthDiff=thisString.length()-otherString.length(); 119 int32_t commonLength; 120 if(lengthDiff<=0) { 121 commonLength=thisString.length(); 122 } else { 123 commonLength=otherString.length(); 124 } 125 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); 126 return diff!=0 ? diff : lengthDiff; 127 } 128 129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) 130 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), 131 bytes(NULL), bytesCapacity(0), bytesLength(0) { 132 if(U_FAILURE(errorCode)) { 133 return; 134 } 135 strings=new CharString(); 136 if(strings==NULL) { 137 errorCode=U_MEMORY_ALLOCATION_ERROR; 138 } 139 } 140 141 BytesTrieBuilder::~BytesTrieBuilder() { 142 delete strings; 143 delete[] elements; 144 uprv_free(bytes); 145 } 146 147 BytesTrieBuilder & 148 BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) { 149 if(U_FAILURE(errorCode)) { 150 return *this; 151 } 152 if(bytesLength>0) { 153 // Cannot add elements after building. 154 errorCode=U_NO_WRITE_PERMISSION; 155 return *this; 156 } 157 if(elementsLength==elementsCapacity) { 158 int32_t newCapacity; 159 if(elementsCapacity==0) { 160 newCapacity=1024; 161 } else { 162 newCapacity=4*elementsCapacity; 163 } 164 BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; 165 if(newElements==NULL) { 166 errorCode=U_MEMORY_ALLOCATION_ERROR; 167 return *this; // error instead of dereferencing null 168 } 169 if(elementsLength>0) { 170 uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement)); 171 } 172 delete[] elements; 173 elements=newElements; 174 elementsCapacity=newCapacity; 175 } 176 elements[elementsLength++].setTo(s, value, *strings, errorCode); 177 return *this; 178 } 179 180 U_CDECL_BEGIN 181 182 static int32_t U_CALLCONV 183 compareElementStrings(const void *context, const void *left, const void *right) { 184 const CharString *strings=static_cast<const CharString *>(context); 185 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); 186 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); 187 return leftElement->compareStringTo(*rightElement, *strings); 188 } 189 190 U_CDECL_END 191 192 BytesTrie * 193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 194 buildBytes(buildOption, errorCode); 195 BytesTrie *newTrie=NULL; 196 if(U_SUCCESS(errorCode)) { 197 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); 198 if(newTrie==NULL) { 199 errorCode=U_MEMORY_ALLOCATION_ERROR; 200 } else { 201 bytes=NULL; // The new trie now owns the array. 202 bytesCapacity=0; 203 } 204 } 205 return newTrie; 206 } 207 208 StringPiece 209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 210 buildBytes(buildOption, errorCode); 211 StringPiece result; 212 if(U_SUCCESS(errorCode)) { 213 result.set(bytes+(bytesCapacity-bytesLength), bytesLength); 214 } 215 return result; 216 } 217 218 void 219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 220 if(U_FAILURE(errorCode)) { 221 return; 222 } 223 if(bytes!=NULL && bytesLength>0) { 224 // Already built. 225 return; 226 } 227 if(bytesLength==0) { 228 if(elementsLength==0) { 229 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 230 return; 231 } 232 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), 233 compareElementStrings, strings, 234 FALSE, // need not be a stable sort 235 &errorCode); 236 if(U_FAILURE(errorCode)) { 237 return; 238 } 239 // Duplicate strings are not allowed. 240 StringPiece prev=elements[0].getString(*strings); 241 for(int32_t i=1; i<elementsLength; ++i) { 242 StringPiece current=elements[i].getString(*strings); 243 if(prev==current) { 244 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 245 return; 246 } 247 prev=current; 248 } 249 } 250 // Create and byte-serialize the trie for the elements. 251 bytesLength=0; 252 int32_t capacity=strings->length(); 253 if(capacity<1024) { 254 capacity=1024; 255 } 256 if(bytesCapacity<capacity) { 257 uprv_free(bytes); 258 bytes=static_cast<char *>(uprv_malloc(capacity)); 259 if(bytes==NULL) { 260 errorCode=U_MEMORY_ALLOCATION_ERROR; 261 bytesCapacity=0; 262 return; 263 } 264 bytesCapacity=capacity; 265 } 266 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 267 if(bytes==NULL) { 268 errorCode=U_MEMORY_ALLOCATION_ERROR; 269 } 270 } 271 272 BytesTrieBuilder & 273 BytesTrieBuilder::clear() { 274 strings->clear(); 275 elementsLength=0; 276 bytesLength=0; 277 return *this; 278 } 279 280 int32_t 281 BytesTrieBuilder::getElementStringLength(int32_t i) const { 282 return elements[i].getStringLength(*strings); 283 } 284 285 UChar 286 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { 287 return (uint8_t)elements[i].charAt(byteIndex, *strings); 288 } 289 290 int32_t 291 BytesTrieBuilder::getElementValue(int32_t i) const { 292 return elements[i].getValue(); 293 } 294 295 int32_t 296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { 297 const BytesTrieElement &firstElement=elements[first]; 298 const BytesTrieElement &lastElement=elements[last]; 299 int32_t minStringLength=firstElement.getStringLength(*strings); 300 while(++byteIndex<minStringLength && 301 firstElement.charAt(byteIndex, *strings)== 302 lastElement.charAt(byteIndex, *strings)) {} 303 return byteIndex; 304 } 305 306 int32_t 307 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { 308 int32_t length=0; // Number of different bytes at byteIndex. 309 int32_t i=start; 310 do { 311 char byte=elements[i++].charAt(byteIndex, *strings); 312 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { 313 ++i; 314 } 315 ++length; 316 } while(i<limit); 317 return length; 318 } 319 320 int32_t 321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { 322 do { 323 char byte=elements[i++].charAt(byteIndex, *strings); 324 while(byte==elements[i].charAt(byteIndex, *strings)) { 325 ++i; 326 } 327 } while(--count>0); 328 return i; 329 } 330 331 int32_t 332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { 333 char b=(char)byte; 334 while(b==elements[i].charAt(byteIndex, *strings)) { 335 ++i; 336 } 337 return i; 338 } 339 340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) 341 : LinearMatchNode(len, nextNode), s(bytes) { 342 hash=hash*37+ustr_hashCharsN(bytes, len); 343 } 344 345 UBool 346 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { 347 if(this==&other) { 348 return TRUE; 349 } 350 if(!LinearMatchNode::operator==(other)) { 351 return FALSE; 352 } 353 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; 354 return 0==uprv_memcmp(s, o.s, length); 355 } 356 357 void 358 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { 359 BytesTrieBuilder &b=(BytesTrieBuilder &)builder; 360 next->write(builder); 361 b.write(s, length); 362 offset=b.write(b.getMinLinearMatch()+length-1); 363 } 364 365 StringTrieBuilder::Node * 366 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, 367 Node *nextNode) const { 368 return new BTLinearMatchNode( 369 elements[i].getString(*strings).data()+byteIndex, 370 length, 371 nextNode); 372 } 373 374 UBool 375 BytesTrieBuilder::ensureCapacity(int32_t length) { 376 if(bytes==NULL) { 377 return FALSE; // previous memory allocation had failed 378 } 379 if(length>bytesCapacity) { 380 int32_t newCapacity=bytesCapacity; 381 do { 382 newCapacity*=2; 383 } while(newCapacity<=length); 384 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); 385 if(newBytes==NULL) { 386 // unable to allocate memory 387 uprv_free(bytes); 388 bytes=NULL; 389 bytesCapacity=0; 390 return FALSE; 391 } 392 uprv_memcpy(newBytes+(newCapacity-bytesLength), 393 bytes+(bytesCapacity-bytesLength), bytesLength); 394 uprv_free(bytes); 395 bytes=newBytes; 396 bytesCapacity=newCapacity; 397 } 398 return TRUE; 399 } 400 401 int32_t 402 BytesTrieBuilder::write(int32_t byte) { 403 int32_t newLength=bytesLength+1; 404 if(ensureCapacity(newLength)) { 405 bytesLength=newLength; 406 bytes[bytesCapacity-bytesLength]=(char)byte; 407 } 408 return bytesLength; 409 } 410 411 int32_t 412 BytesTrieBuilder::write(const char *b, int32_t length) { 413 int32_t newLength=bytesLength+length; 414 if(ensureCapacity(newLength)) { 415 bytesLength=newLength; 416 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); 417 } 418 return bytesLength; 419 } 420 421 int32_t 422 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { 423 return write(elements[i].getString(*strings).data()+byteIndex, length); 424 } 425 426 int32_t 427 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 428 if(0<=i && i<=BytesTrie::kMaxOneByteValue) { 429 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); 430 } 431 char intBytes[5]; 432 int32_t length=1; 433 if(i<0 || i>0xffffff) { 434 intBytes[0]=(char)BytesTrie::kFiveByteValueLead; 435 intBytes[1]=(char)((uint32_t)i>>24); 436 intBytes[2]=(char)((uint32_t)i>>16); 437 intBytes[3]=(char)((uint32_t)i>>8); 438 intBytes[4]=(char)i; 439 length=5; 440 // } else if(i<=BytesTrie::kMaxOneByteValue) { 441 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); 442 } else { 443 if(i<=BytesTrie::kMaxTwoByteValue) { 444 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); 445 } else { 446 if(i<=BytesTrie::kMaxThreeByteValue) { 447 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); 448 } else { 449 intBytes[0]=(char)BytesTrie::kFourByteValueLead; 450 intBytes[1]=(char)(i>>16); 451 length=2; 452 } 453 intBytes[length++]=(char)(i>>8); 454 } 455 intBytes[length++]=(char)i; 456 } 457 intBytes[0]=(char)((intBytes[0]<<1)|isFinal); 458 return write(intBytes, length); 459 } 460 461 int32_t 462 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 463 int32_t offset=write(node); 464 if(hasValue) { 465 offset=writeValueAndFinal(value, FALSE); 466 } 467 return offset; 468 } 469 470 int32_t 471 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 472 int32_t i=bytesLength-jumpTarget; 473 U_ASSERT(i>=0); 474 if(i<=BytesTrie::kMaxOneByteDelta) { 475 return write(i); 476 } 477 char intBytes[5]; 478 int32_t length; 479 if(i<=BytesTrie::kMaxTwoByteDelta) { 480 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); 481 length=1; 482 } else { 483 if(i<=BytesTrie::kMaxThreeByteDelta) { 484 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); 485 length=2; 486 } else { 487 if(i<=0xffffff) { 488 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; 489 length=3; 490 } else { 491 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; 492 intBytes[1]=(char)(i>>24); 493 length=4; 494 } 495 intBytes[1]=(char)(i>>16); 496 } 497 intBytes[1]=(char)(i>>8); 498 } 499 intBytes[length++]=(char)i; 500 return write(intBytes, length); 501 } 502 503 U_NAMESPACE_END 504