1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: bytestriebuilder.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010sep25 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/bytestrie.h" 17 #include "unicode/bytestriebuilder.h" 18 #include "unicode/stringpiece.h" 19 #include "charstr.h" 20 #include "cmemory.h" 21 #include "uhash.h" 22 #include "uarrsort.h" 23 #include "uassert.h" 24 25 U_NAMESPACE_BEGIN 26 27 /* 28 * Note: This builder implementation stores (bytes, value) pairs with full copies 29 * of the byte sequences, until the BytesTrie is built. 30 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 31 */ 32 33 class BytesTrieElement : public UMemory { 34 public: 35 // Use compiler's default constructor, initializes nothing. 36 37 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode); 38 39 StringPiece getString(const CharString &strings) const { 40 int32_t offset=stringOffset; 41 int32_t length; 42 if(offset>=0) { 43 length=(uint8_t)strings[offset++]; 44 } else { 45 offset=~offset; 46 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 47 offset+=2; 48 } 49 return StringPiece(strings.data()+offset, length); 50 } 51 int32_t getStringLength(const CharString &strings) const { 52 int32_t offset=stringOffset; 53 if(offset>=0) { 54 return (uint8_t)strings[offset]; 55 } else { 56 offset=~offset; 57 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 58 } 59 } 60 61 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } 62 63 int32_t getValue() const { return value; } 64 65 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; 66 67 private: 68 const char *data(const CharString &strings) const { 69 int32_t offset=stringOffset; 70 if(offset>=0) { 71 ++offset; 72 } else { 73 offset=~offset+2; 74 } 75 return strings.data()+offset; 76 } 77 78 // If the stringOffset is non-negative, then the first strings byte contains 79 // the string length. 80 // If the stringOffset is negative, then the first two strings bytes contain 81 // the string length (big-endian), and the offset needs to be bit-inverted. 82 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) 83 int32_t stringOffset; 84 int32_t value; 85 }; 86 87 void 88 BytesTrieElement::setTo(const StringPiece &s, int32_t val, 89 CharString &strings, UErrorCode &errorCode) { 90 if(U_FAILURE(errorCode)) { 91 return; 92 } 93 int32_t length=s.length(); 94 if(length>0xffff) { 95 // Too long: We store the length in 1 or 2 bytes. 96 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 97 return; 98 } 99 int32_t offset=strings.length(); 100 if(length>0xff) { 101 offset=~offset; 102 strings.append((char)(length>>8), errorCode); 103 } 104 strings.append((char)length, errorCode); 105 stringOffset=offset; 106 value=val; 107 strings.append(s, errorCode); 108 } 109 110 int32_t 111 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { 112 // TODO: add StringPiece::compare(), see ticket #8187 113 StringPiece thisString=getString(strings); 114 StringPiece otherString=other.getString(strings); 115 int32_t lengthDiff=thisString.length()-otherString.length(); 116 int32_t commonLength; 117 if(lengthDiff<=0) { 118 commonLength=thisString.length(); 119 } else { 120 commonLength=otherString.length(); 121 } 122 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); 123 return diff!=0 ? diff : lengthDiff; 124 } 125 126 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) 127 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), 128 bytes(NULL), bytesCapacity(0), bytesLength(0) { 129 if(U_FAILURE(errorCode)) { 130 return; 131 } 132 strings=new CharString(); 133 if(strings==NULL) { 134 errorCode=U_MEMORY_ALLOCATION_ERROR; 135 } 136 } 137 138 BytesTrieBuilder::~BytesTrieBuilder() { 139 delete strings; 140 delete[] elements; 141 uprv_free(bytes); 142 } 143 144 BytesTrieBuilder & 145 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) { 146 if(U_FAILURE(errorCode)) { 147 return *this; 148 } 149 if(bytesLength>0) { 150 // Cannot add elements after building. 151 errorCode=U_NO_WRITE_PERMISSION; 152 return *this; 153 } 154 if(elementsLength==elementsCapacity) { 155 int32_t newCapacity; 156 if(elementsCapacity==0) { 157 newCapacity=1024; 158 } else { 159 newCapacity=4*elementsCapacity; 160 } 161 BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; 162 if(newElements==NULL) { 163 errorCode=U_MEMORY_ALLOCATION_ERROR; 164 } 165 if(elementsLength>0) { 166 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement)); 167 } 168 delete[] elements; 169 elements=newElements; 170 elementsCapacity=newCapacity; 171 } 172 elements[elementsLength++].setTo(s, value, *strings, errorCode); 173 return *this; 174 } 175 176 U_CDECL_BEGIN 177 178 static int32_t U_CALLCONV 179 compareElementStrings(const void *context, const void *left, const void *right) { 180 const CharString *strings=reinterpret_cast<const CharString *>(context); 181 const BytesTrieElement *leftElement=reinterpret_cast<const BytesTrieElement *>(left); 182 const BytesTrieElement *rightElement=reinterpret_cast<const BytesTrieElement *>(right); 183 return leftElement->compareStringTo(*rightElement, *strings); 184 } 185 186 U_CDECL_END 187 188 BytesTrie * 189 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 190 buildBytes(buildOption, errorCode); 191 BytesTrie *newTrie=NULL; 192 if(U_SUCCESS(errorCode)) { 193 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); 194 if(newTrie==NULL) { 195 errorCode=U_MEMORY_ALLOCATION_ERROR; 196 } else { 197 bytes=NULL; // The new trie now owns the array. 198 bytesCapacity=0; 199 } 200 } 201 return newTrie; 202 } 203 204 StringPiece 205 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 206 buildBytes(buildOption, errorCode); 207 StringPiece result; 208 if(U_SUCCESS(errorCode)) { 209 result.set(bytes+(bytesCapacity-bytesLength), bytesLength); 210 } 211 return result; 212 } 213 214 void 215 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 216 if(U_FAILURE(errorCode)) { 217 return; 218 } 219 if(bytes!=NULL && bytesLength>0) { 220 // Already built. 221 return; 222 } 223 if(bytesLength==0) { 224 if(elementsLength==0) { 225 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 226 return; 227 } 228 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), 229 compareElementStrings, strings, 230 FALSE, // need not be a stable sort 231 &errorCode); 232 if(U_FAILURE(errorCode)) { 233 return; 234 } 235 // Duplicate strings are not allowed. 236 StringPiece prev=elements[0].getString(*strings); 237 for(int32_t i=1; i<elementsLength; ++i) { 238 StringPiece current=elements[i].getString(*strings); 239 if(prev==current) { 240 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 241 return; 242 } 243 prev=current; 244 } 245 } 246 // Create and byte-serialize the trie for the elements. 247 bytesLength=0; 248 int32_t capacity=strings->length(); 249 if(capacity<1024) { 250 capacity=1024; 251 } 252 if(bytesCapacity<capacity) { 253 uprv_free(bytes); 254 bytes=reinterpret_cast<char *>(uprv_malloc(capacity)); 255 if(bytes==NULL) { 256 errorCode=U_MEMORY_ALLOCATION_ERROR; 257 bytesCapacity=0; 258 return; 259 } 260 bytesCapacity=capacity; 261 } 262 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 263 if(bytes==NULL) { 264 errorCode=U_MEMORY_ALLOCATION_ERROR; 265 } 266 } 267 268 BytesTrieBuilder & 269 BytesTrieBuilder::clear() { 270 strings->clear(); 271 elementsLength=0; 272 bytesLength=0; 273 return *this; 274 } 275 276 int32_t 277 BytesTrieBuilder::getElementStringLength(int32_t i) const { 278 return elements[i].getStringLength(*strings); 279 } 280 281 UChar 282 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { 283 return (uint8_t)elements[i].charAt(byteIndex, *strings); 284 } 285 286 int32_t 287 BytesTrieBuilder::getElementValue(int32_t i) const { 288 return elements[i].getValue(); 289 } 290 291 int32_t 292 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { 293 const BytesTrieElement &firstElement=elements[first]; 294 const BytesTrieElement &lastElement=elements[last]; 295 int32_t minStringLength=firstElement.getStringLength(*strings); 296 while(++byteIndex<minStringLength && 297 firstElement.charAt(byteIndex, *strings)== 298 lastElement.charAt(byteIndex, *strings)) {} 299 return byteIndex; 300 } 301 302 int32_t 303 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { 304 int32_t length=0; // Number of different bytes at byteIndex. 305 int32_t i=start; 306 do { 307 char byte=elements[i++].charAt(byteIndex, *strings); 308 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { 309 ++i; 310 } 311 ++length; 312 } while(i<limit); 313 return length; 314 } 315 316 int32_t 317 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { 318 do { 319 char byte=elements[i++].charAt(byteIndex, *strings); 320 while(byte==elements[i].charAt(byteIndex, *strings)) { 321 ++i; 322 } 323 } while(--count>0); 324 return i; 325 } 326 327 int32_t 328 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { 329 char b=(char)byte; 330 while(b==elements[i].charAt(byteIndex, *strings)) { 331 ++i; 332 } 333 return i; 334 } 335 336 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) 337 : LinearMatchNode(len, nextNode), s(bytes) { 338 hash=hash*37+uhash_hashCharsN(bytes, len); 339 } 340 341 UBool 342 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { 343 if(this==&other) { 344 return TRUE; 345 } 346 if(!LinearMatchNode::operator==(other)) { 347 return FALSE; 348 } 349 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; 350 return 0==uprv_memcmp(s, o.s, length); 351 } 352 353 void 354 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { 355 BytesTrieBuilder &b=(BytesTrieBuilder &)builder; 356 next->write(builder); 357 b.write(s, length); 358 offset=b.write(b.getMinLinearMatch()+length-1); 359 } 360 361 StringTrieBuilder::Node * 362 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, 363 Node *nextNode) const { 364 return new BTLinearMatchNode( 365 elements[i].getString(*strings).data()+byteIndex, 366 length, 367 nextNode); 368 } 369 370 UBool 371 BytesTrieBuilder::ensureCapacity(int32_t length) { 372 if(bytes==NULL) { 373 return FALSE; // previous memory allocation had failed 374 } 375 if(length>bytesCapacity) { 376 int32_t newCapacity=bytesCapacity; 377 do { 378 newCapacity*=2; 379 } while(newCapacity<=length); 380 char *newBytes=reinterpret_cast<char *>(uprv_malloc(newCapacity)); 381 if(newBytes==NULL) { 382 // unable to allocate memory 383 uprv_free(bytes); 384 bytes=NULL; 385 bytesCapacity=0; 386 return FALSE; 387 } 388 uprv_memcpy(newBytes+(newCapacity-bytesLength), 389 bytes+(bytesCapacity-bytesLength), bytesLength); 390 uprv_free(bytes); 391 bytes=newBytes; 392 bytesCapacity=newCapacity; 393 } 394 return TRUE; 395 } 396 397 int32_t 398 BytesTrieBuilder::write(int32_t byte) { 399 int32_t newLength=bytesLength+1; 400 if(ensureCapacity(newLength)) { 401 bytesLength=newLength; 402 bytes[bytesCapacity-bytesLength]=(char)byte; 403 } 404 return bytesLength; 405 } 406 407 int32_t 408 BytesTrieBuilder::write(const char *b, int32_t length) { 409 int32_t newLength=bytesLength+length; 410 if(ensureCapacity(newLength)) { 411 bytesLength=newLength; 412 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); 413 } 414 return bytesLength; 415 } 416 417 int32_t 418 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { 419 return write(elements[i].getString(*strings).data()+byteIndex, length); 420 } 421 422 int32_t 423 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 424 if(0<=i && i<=BytesTrie::kMaxOneByteValue) { 425 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); 426 } 427 char intBytes[5]; 428 int32_t length=1; 429 if(i<0 || i>0xffffff) { 430 intBytes[0]=(char)BytesTrie::kFiveByteValueLead; 431 intBytes[1]=(char)(i>>24); 432 intBytes[2]=(char)(i>>16); 433 intBytes[3]=(char)(i>>8); 434 intBytes[4]=(char)i; 435 length=5; 436 // } else if(i<=BytesTrie::kMaxOneByteValue) { 437 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); 438 } else { 439 if(i<=BytesTrie::kMaxTwoByteValue) { 440 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); 441 } else { 442 if(i<=BytesTrie::kMaxThreeByteValue) { 443 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); 444 } else { 445 intBytes[0]=(char)BytesTrie::kFourByteValueLead; 446 intBytes[1]=(char)(i>>16); 447 length=2; 448 } 449 intBytes[length++]=(char)(i>>8); 450 } 451 intBytes[length++]=(char)i; 452 } 453 intBytes[0]=(char)((intBytes[0]<<1)|isFinal); 454 return write(intBytes, length); 455 } 456 457 int32_t 458 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 459 int32_t offset=write(node); 460 if(hasValue) { 461 offset=writeValueAndFinal(value, FALSE); 462 } 463 return offset; 464 } 465 466 int32_t 467 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 468 int32_t i=bytesLength-jumpTarget; 469 U_ASSERT(i>=0); 470 if(i<=BytesTrie::kMaxOneByteDelta) { 471 return write(i); 472 } 473 char intBytes[5]; 474 int32_t length; 475 if(i<=BytesTrie::kMaxTwoByteDelta) { 476 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); 477 length=1; 478 } else { 479 if(i<=BytesTrie::kMaxThreeByteDelta) { 480 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); 481 length=2; 482 } else { 483 if(i<=0xffffff) { 484 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; 485 length=3; 486 } else { 487 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; 488 intBytes[1]=(char)(i>>24); 489 length=4; 490 } 491 intBytes[1]=(char)(i>>16); 492 } 493 intBytes[1]=(char)(i>>8); 494 } 495 intBytes[length++]=(char)i; 496 return write(intBytes, length); 497 } 498 499 U_NAMESPACE_END 500