1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: bytestriebuilder.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010sep25 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/bytestrie.h" 19 #include "unicode/bytestriebuilder.h" 20 #include "unicode/stringpiece.h" 21 #include "charstr.h" 22 #include "cmemory.h" 23 #include "uhash.h" 24 #include "uarrsort.h" 25 #include "uassert.h" 26 #include "ustr_imp.h" 27 28 U_NAMESPACE_BEGIN 29 30 /* 31 * Note: This builder implementation stores (bytes, value) pairs with full copies 32 * of the byte sequences, until the BytesTrie is built. 33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 34 */ 35 36 class BytesTrieElement : public UMemory { 37 public: 38 // Use compiler's default constructor, initializes nothing. 39 40 void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode); 41 42 StringPiece getString(const CharString &strings) const { 43 int32_t offset=stringOffset; 44 int32_t length; 45 if(offset>=0) { 46 length=(uint8_t)strings[offset++]; 47 } else { 48 offset=~offset; 49 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 50 offset+=2; 51 } 52 return StringPiece(strings.data()+offset, length); 53 } 54 int32_t getStringLength(const CharString &strings) const { 55 int32_t offset=stringOffset; 56 if(offset>=0) { 57 return (uint8_t)strings[offset]; 58 } else { 59 offset=~offset; 60 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 61 } 62 } 63 64 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } 65 66 int32_t getValue() const { return value; } 67 68 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; 69 70 private: 71 const char *data(const CharString &strings) const { 72 int32_t offset=stringOffset; 73 if(offset>=0) { 74 ++offset; 75 } else { 76 offset=~offset+2; 77 } 78 return strings.data()+offset; 79 } 80 81 // If the stringOffset is non-negative, then the first strings byte contains 82 // the string length. 83 // If the stringOffset is negative, then the first two strings bytes contain 84 // the string length (big-endian), and the offset needs to be bit-inverted. 85 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) 86 int32_t stringOffset; 87 int32_t value; 88 }; 89 90 void 91 BytesTrieElement::setTo(StringPiece s, int32_t val, 92 CharString &strings, UErrorCode &errorCode) { 93 if(U_FAILURE(errorCode)) { 94 return; 95 } 96 int32_t length=s.length(); 97 if(length>0xffff) { 98 // Too long: We store the length in 1 or 2 bytes. 99 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 100 return; 101 } 102 int32_t offset=strings.length(); 103 if(length>0xff) { 104 offset=~offset; 105 strings.append((char)(length>>8), errorCode); 106 } 107 strings.append((char)length, errorCode); 108 stringOffset=offset; 109 value=val; 110 strings.append(s, errorCode); 111 } 112 113 int32_t 114 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { 115 // TODO: add StringPiece::compare(), see ticket #8187 116 StringPiece thisString=getString(strings); 117 StringPiece otherString=other.getString(strings); 118 int32_t lengthDiff=thisString.length()-otherString.length(); 119 int32_t commonLength; 120 if(lengthDiff<=0) { 121 commonLength=thisString.length(); 122 } else { 123 commonLength=otherString.length(); 124 } 125 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); 126 return diff!=0 ? diff : lengthDiff; 127 } 128 129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) 130 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), 131 bytes(NULL), bytesCapacity(0), bytesLength(0) { 132 if(U_FAILURE(errorCode)) { 133 return; 134 } 135 strings=new CharString(); 136 if(strings==NULL) { 137 errorCode=U_MEMORY_ALLOCATION_ERROR; 138 } 139 } 140 141 BytesTrieBuilder::~BytesTrieBuilder() { 142 delete strings; 143 delete[] elements; 144 uprv_free(bytes); 145 } 146 147 BytesTrieBuilder & 148 BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) { 149 if(U_FAILURE(errorCode)) { 150 return *this; 151 } 152 if(bytesLength>0) { 153 // Cannot add elements after building. 154 errorCode=U_NO_WRITE_PERMISSION; 155 return *this; 156 } 157 if(elementsLength==elementsCapacity) { 158 int32_t newCapacity; 159 if(elementsCapacity==0) { 160 newCapacity=1024; 161 } else { 162 newCapacity=4*elementsCapacity; 163 } 164 BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; 165 if(newElements==NULL) { 166 errorCode=U_MEMORY_ALLOCATION_ERROR; 167 return *this; // error instead of dereferencing null 168 } 169 if(elementsLength>0) { 170 uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement)); 171 } 172 delete[] elements; 173 elements=newElements; 174 elementsCapacity=newCapacity; 175 } 176 elements[elementsLength++].setTo(s, value, *strings, errorCode); 177 return *this; 178 } 179 180 U_CDECL_BEGIN 181 182 static int32_t U_CALLCONV 183 compareElementStrings(const void *context, const void *left, const void *right) { 184 const CharString *strings=static_cast<const CharString *>(context); 185 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); 186 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); 187 return leftElement->compareStringTo(*rightElement, *strings); 188 } 189 190 U_CDECL_END 191 192 BytesTrie * 193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 194 buildBytes(buildOption, errorCode); 195 BytesTrie *newTrie=NULL; 196 if(U_SUCCESS(errorCode)) { 197 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); 198 if(newTrie==NULL) { 199 errorCode=U_MEMORY_ALLOCATION_ERROR; 200 } else { 201 bytes=NULL; // The new trie now owns the array. 202 bytesCapacity=0; 203 } 204 } 205 return newTrie; 206 } 207 208 StringPiece 209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 210 buildBytes(buildOption, errorCode); 211 StringPiece result; 212 if(U_SUCCESS(errorCode)) { 213 result.set(bytes+(bytesCapacity-bytesLength), bytesLength); 214 } 215 return result; 216 } 217 218 void 219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 220 if(U_FAILURE(errorCode)) { 221 return; 222 } 223 if(bytes!=NULL && bytesLength>0) { 224 // Already built. 225 return; 226 } 227 if(bytesLength==0) { 228 if(elementsLength==0) { 229 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 230 return; 231 } 232 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), 233 compareElementStrings, strings, 234 FALSE, // need not be a stable sort 235 &errorCode); 236 if(U_FAILURE(errorCode)) { 237 return; 238 } 239 // Duplicate strings are not allowed. 240 StringPiece prev=elements[0].getString(*strings); 241 for(int32_t i=1; i<elementsLength; ++i) { 242 StringPiece current=elements[i].getString(*strings); 243 if(prev==current) { 244 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 245 return; 246 } 247 prev=current; 248 } 249 } 250 // Create and byte-serialize the trie for the elements. 251 bytesLength=0; 252 int32_t capacity=strings->length(); 253 if(capacity<1024) { 254 capacity=1024; 255 } 256 if(bytesCapacity<capacity) { 257 uprv_free(bytes); 258 bytes=static_cast<char *>(uprv_malloc(capacity)); 259 if(bytes==NULL) { 260 errorCode=U_MEMORY_ALLOCATION_ERROR; 261 bytesCapacity=0; 262 return; 263 } 264 bytesCapacity=capacity; 265 } 266 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 267 if(bytes==NULL) { 268 errorCode=U_MEMORY_ALLOCATION_ERROR; 269 } 270 } 271 272 BytesTrieBuilder & 273 BytesTrieBuilder::clear() { 274 strings->clear(); 275 elementsLength=0; 276 bytesLength=0; 277 return *this; 278 } 279 280 int32_t 281 BytesTrieBuilder::getElementStringLength(int32_t i) const { 282 return elements[i].getStringLength(*strings); 283 } 284 285 UChar 286 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { 287 return (uint8_t)elements[i].charAt(byteIndex, *strings); 288 } 289 290 int32_t 291 BytesTrieBuilder::getElementValue(int32_t i) const { 292 return elements[i].getValue(); 293 } 294 295 int32_t 296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { 297 const BytesTrieElement &firstElement=elements[first]; 298 const BytesTrieElement &lastElement=elements[last]; 299 int32_t minStringLength=firstElement.getStringLength(*strings); 300 while(++byteIndex<minStringLength && 301 firstElement.charAt(byteIndex, *strings)== 302 lastElement.charAt(byteIndex, *strings)) {} 303 return byteIndex; 304 } 305 306 int32_t 307 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { 308 int32_t length=0; // Number of different bytes at byteIndex. 309 int32_t i=start; 310 do { 311 char byte=elements[i++].charAt(byteIndex, *strings); 312 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { 313 ++i; 314 } 315 ++length; 316 } while(i<limit); 317 return length; 318 } 319 320 int32_t 321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { 322 do { 323 char byte=elements[i++].charAt(byteIndex, *strings); 324 while(byte==elements[i].charAt(byteIndex, *strings)) { 325 ++i; 326 } 327 } while(--count>0); 328 return i; 329 } 330 331 int32_t 332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { 333 char b=(char)byte; 334 while(b==elements[i].charAt(byteIndex, *strings)) { 335 ++i; 336 } 337 return i; 338 } 339 340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) 341 : LinearMatchNode(len, nextNode), s(bytes) { 342 hash=static_cast<int32_t>( 343 static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len))); 344 } 345 346 UBool 347 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { 348 if(this==&other) { 349 return TRUE; 350 } 351 if(!LinearMatchNode::operator==(other)) { 352 return FALSE; 353 } 354 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; 355 return 0==uprv_memcmp(s, o.s, length); 356 } 357 358 void 359 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { 360 BytesTrieBuilder &b=(BytesTrieBuilder &)builder; 361 next->write(builder); 362 b.write(s, length); 363 offset=b.write(b.getMinLinearMatch()+length-1); 364 } 365 366 StringTrieBuilder::Node * 367 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, 368 Node *nextNode) const { 369 return new BTLinearMatchNode( 370 elements[i].getString(*strings).data()+byteIndex, 371 length, 372 nextNode); 373 } 374 375 UBool 376 BytesTrieBuilder::ensureCapacity(int32_t length) { 377 if(bytes==NULL) { 378 return FALSE; // previous memory allocation had failed 379 } 380 if(length>bytesCapacity) { 381 int32_t newCapacity=bytesCapacity; 382 do { 383 newCapacity*=2; 384 } while(newCapacity<=length); 385 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); 386 if(newBytes==NULL) { 387 // unable to allocate memory 388 uprv_free(bytes); 389 bytes=NULL; 390 bytesCapacity=0; 391 return FALSE; 392 } 393 uprv_memcpy(newBytes+(newCapacity-bytesLength), 394 bytes+(bytesCapacity-bytesLength), bytesLength); 395 uprv_free(bytes); 396 bytes=newBytes; 397 bytesCapacity=newCapacity; 398 } 399 return TRUE; 400 } 401 402 int32_t 403 BytesTrieBuilder::write(int32_t byte) { 404 int32_t newLength=bytesLength+1; 405 if(ensureCapacity(newLength)) { 406 bytesLength=newLength; 407 bytes[bytesCapacity-bytesLength]=(char)byte; 408 } 409 return bytesLength; 410 } 411 412 int32_t 413 BytesTrieBuilder::write(const char *b, int32_t length) { 414 int32_t newLength=bytesLength+length; 415 if(ensureCapacity(newLength)) { 416 bytesLength=newLength; 417 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); 418 } 419 return bytesLength; 420 } 421 422 int32_t 423 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { 424 return write(elements[i].getString(*strings).data()+byteIndex, length); 425 } 426 427 int32_t 428 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 429 if(0<=i && i<=BytesTrie::kMaxOneByteValue) { 430 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); 431 } 432 char intBytes[5]; 433 int32_t length=1; 434 if(i<0 || i>0xffffff) { 435 intBytes[0]=(char)BytesTrie::kFiveByteValueLead; 436 intBytes[1]=(char)((uint32_t)i>>24); 437 intBytes[2]=(char)((uint32_t)i>>16); 438 intBytes[3]=(char)((uint32_t)i>>8); 439 intBytes[4]=(char)i; 440 length=5; 441 // } else if(i<=BytesTrie::kMaxOneByteValue) { 442 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); 443 } else { 444 if(i<=BytesTrie::kMaxTwoByteValue) { 445 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); 446 } else { 447 if(i<=BytesTrie::kMaxThreeByteValue) { 448 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); 449 } else { 450 intBytes[0]=(char)BytesTrie::kFourByteValueLead; 451 intBytes[1]=(char)(i>>16); 452 length=2; 453 } 454 intBytes[length++]=(char)(i>>8); 455 } 456 intBytes[length++]=(char)i; 457 } 458 intBytes[0]=(char)((intBytes[0]<<1)|isFinal); 459 return write(intBytes, length); 460 } 461 462 int32_t 463 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 464 int32_t offset=write(node); 465 if(hasValue) { 466 offset=writeValueAndFinal(value, FALSE); 467 } 468 return offset; 469 } 470 471 int32_t 472 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 473 int32_t i=bytesLength-jumpTarget; 474 U_ASSERT(i>=0); 475 if(i<=BytesTrie::kMaxOneByteDelta) { 476 return write(i); 477 } 478 char intBytes[5]; 479 int32_t length; 480 if(i<=BytesTrie::kMaxTwoByteDelta) { 481 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); 482 length=1; 483 } else { 484 if(i<=BytesTrie::kMaxThreeByteDelta) { 485 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); 486 length=2; 487 } else { 488 if(i<=0xffffff) { 489 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; 490 length=3; 491 } else { 492 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; 493 intBytes[1]=(char)(i>>24); 494 length=4; 495 } 496 intBytes[1]=(char)(i>>16); 497 } 498 intBytes[1]=(char)(i>>8); 499 } 500 intBytes[length++]=(char)i; 501 return write(intBytes, length); 502 } 503 504 U_NAMESPACE_END 505