1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: bytestriebuilder.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010sep25 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/bytestrie.h" 17 #include "unicode/bytestriebuilder.h" 18 #include "unicode/stringpiece.h" 19 #include "charstr.h" 20 #include "cmemory.h" 21 #include "uhash.h" 22 #include "uarrsort.h" 23 #include "uassert.h" 24 #include "ustr_imp.h" 25 26 U_NAMESPACE_BEGIN 27 28 /* 29 * Note: This builder implementation stores (bytes, value) pairs with full copies 30 * of the byte sequences, until the BytesTrie is built. 31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 32 */ 33 34 class BytesTrieElement : public UMemory { 35 public: 36 // Use compiler's default constructor, initializes nothing. 37 38 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode); 39 40 StringPiece getString(const CharString &strings) const { 41 int32_t offset=stringOffset; 42 int32_t length; 43 if(offset>=0) { 44 length=(uint8_t)strings[offset++]; 45 } else { 46 offset=~offset; 47 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 48 offset+=2; 49 } 50 return StringPiece(strings.data()+offset, length); 51 } 52 int32_t getStringLength(const CharString &strings) const { 53 int32_t offset=stringOffset; 54 if(offset>=0) { 55 return (uint8_t)strings[offset]; 56 } else { 57 offset=~offset; 58 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 59 } 60 } 61 62 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } 63 64 int32_t getValue() const { return value; } 65 66 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; 67 68 private: 69 const char *data(const CharString &strings) const { 70 int32_t offset=stringOffset; 71 if(offset>=0) { 72 ++offset; 73 } else { 74 offset=~offset+2; 75 } 76 return strings.data()+offset; 77 } 78 79 // If the stringOffset is non-negative, then the first strings byte contains 80 // the string length. 81 // If the stringOffset is negative, then the first two strings bytes contain 82 // the string length (big-endian), and the offset needs to be bit-inverted. 83 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) 84 int32_t stringOffset; 85 int32_t value; 86 }; 87 88 void 89 BytesTrieElement::setTo(const StringPiece &s, int32_t val, 90 CharString &strings, UErrorCode &errorCode) { 91 if(U_FAILURE(errorCode)) { 92 return; 93 } 94 int32_t length=s.length(); 95 if(length>0xffff) { 96 // Too long: We store the length in 1 or 2 bytes. 97 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 98 return; 99 } 100 int32_t offset=strings.length(); 101 if(length>0xff) { 102 offset=~offset; 103 strings.append((char)(length>>8), errorCode); 104 } 105 strings.append((char)length, errorCode); 106 stringOffset=offset; 107 value=val; 108 strings.append(s, errorCode); 109 } 110 111 int32_t 112 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { 113 // TODO: add StringPiece::compare(), see ticket #8187 114 StringPiece thisString=getString(strings); 115 StringPiece otherString=other.getString(strings); 116 int32_t lengthDiff=thisString.length()-otherString.length(); 117 int32_t commonLength; 118 if(lengthDiff<=0) { 119 commonLength=thisString.length(); 120 } else { 121 commonLength=otherString.length(); 122 } 123 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); 124 return diff!=0 ? diff : lengthDiff; 125 } 126 127 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) 128 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), 129 bytes(NULL), bytesCapacity(0), bytesLength(0) { 130 if(U_FAILURE(errorCode)) { 131 return; 132 } 133 strings=new CharString(); 134 if(strings==NULL) { 135 errorCode=U_MEMORY_ALLOCATION_ERROR; 136 } 137 } 138 139 BytesTrieBuilder::~BytesTrieBuilder() { 140 delete strings; 141 delete[] elements; 142 uprv_free(bytes); 143 } 144 145 BytesTrieBuilder & 146 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) { 147 if(U_FAILURE(errorCode)) { 148 return *this; 149 } 150 if(bytesLength>0) { 151 // Cannot add elements after building. 152 errorCode=U_NO_WRITE_PERMISSION; 153 return *this; 154 } 155 if(elementsLength==elementsCapacity) { 156 int32_t newCapacity; 157 if(elementsCapacity==0) { 158 newCapacity=1024; 159 } else { 160 newCapacity=4*elementsCapacity; 161 } 162 BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; 163 if(newElements==NULL) { 164 errorCode=U_MEMORY_ALLOCATION_ERROR; 165 return *this; // error instead of dereferencing null 166 } 167 if(elementsLength>0) { 168 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement)); 169 } 170 delete[] elements; 171 elements=newElements; 172 elementsCapacity=newCapacity; 173 } 174 elements[elementsLength++].setTo(s, value, *strings, errorCode); 175 return *this; 176 } 177 178 U_CDECL_BEGIN 179 180 static int32_t U_CALLCONV 181 compareElementStrings(const void *context, const void *left, const void *right) { 182 const CharString *strings=static_cast<const CharString *>(context); 183 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); 184 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); 185 return leftElement->compareStringTo(*rightElement, *strings); 186 } 187 188 U_CDECL_END 189 190 BytesTrie * 191 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 192 buildBytes(buildOption, errorCode); 193 BytesTrie *newTrie=NULL; 194 if(U_SUCCESS(errorCode)) { 195 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); 196 if(newTrie==NULL) { 197 errorCode=U_MEMORY_ALLOCATION_ERROR; 198 } else { 199 bytes=NULL; // The new trie now owns the array. 200 bytesCapacity=0; 201 } 202 } 203 return newTrie; 204 } 205 206 StringPiece 207 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 208 buildBytes(buildOption, errorCode); 209 StringPiece result; 210 if(U_SUCCESS(errorCode)) { 211 result.set(bytes+(bytesCapacity-bytesLength), bytesLength); 212 } 213 return result; 214 } 215 216 void 217 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 218 if(U_FAILURE(errorCode)) { 219 return; 220 } 221 if(bytes!=NULL && bytesLength>0) { 222 // Already built. 223 return; 224 } 225 if(bytesLength==0) { 226 if(elementsLength==0) { 227 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 228 return; 229 } 230 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), 231 compareElementStrings, strings, 232 FALSE, // need not be a stable sort 233 &errorCode); 234 if(U_FAILURE(errorCode)) { 235 return; 236 } 237 // Duplicate strings are not allowed. 238 StringPiece prev=elements[0].getString(*strings); 239 for(int32_t i=1; i<elementsLength; ++i) { 240 StringPiece current=elements[i].getString(*strings); 241 if(prev==current) { 242 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 243 return; 244 } 245 prev=current; 246 } 247 } 248 // Create and byte-serialize the trie for the elements. 249 bytesLength=0; 250 int32_t capacity=strings->length(); 251 if(capacity<1024) { 252 capacity=1024; 253 } 254 if(bytesCapacity<capacity) { 255 uprv_free(bytes); 256 bytes=static_cast<char *>(uprv_malloc(capacity)); 257 if(bytes==NULL) { 258 errorCode=U_MEMORY_ALLOCATION_ERROR; 259 bytesCapacity=0; 260 return; 261 } 262 bytesCapacity=capacity; 263 } 264 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 265 if(bytes==NULL) { 266 errorCode=U_MEMORY_ALLOCATION_ERROR; 267 } 268 } 269 270 BytesTrieBuilder & 271 BytesTrieBuilder::clear() { 272 strings->clear(); 273 elementsLength=0; 274 bytesLength=0; 275 return *this; 276 } 277 278 int32_t 279 BytesTrieBuilder::getElementStringLength(int32_t i) const { 280 return elements[i].getStringLength(*strings); 281 } 282 283 UChar 284 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { 285 return (uint8_t)elements[i].charAt(byteIndex, *strings); 286 } 287 288 int32_t 289 BytesTrieBuilder::getElementValue(int32_t i) const { 290 return elements[i].getValue(); 291 } 292 293 int32_t 294 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { 295 const BytesTrieElement &firstElement=elements[first]; 296 const BytesTrieElement &lastElement=elements[last]; 297 int32_t minStringLength=firstElement.getStringLength(*strings); 298 while(++byteIndex<minStringLength && 299 firstElement.charAt(byteIndex, *strings)== 300 lastElement.charAt(byteIndex, *strings)) {} 301 return byteIndex; 302 } 303 304 int32_t 305 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { 306 int32_t length=0; // Number of different bytes at byteIndex. 307 int32_t i=start; 308 do { 309 char byte=elements[i++].charAt(byteIndex, *strings); 310 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { 311 ++i; 312 } 313 ++length; 314 } while(i<limit); 315 return length; 316 } 317 318 int32_t 319 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { 320 do { 321 char byte=elements[i++].charAt(byteIndex, *strings); 322 while(byte==elements[i].charAt(byteIndex, *strings)) { 323 ++i; 324 } 325 } while(--count>0); 326 return i; 327 } 328 329 int32_t 330 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { 331 char b=(char)byte; 332 while(b==elements[i].charAt(byteIndex, *strings)) { 333 ++i; 334 } 335 return i; 336 } 337 338 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) 339 : LinearMatchNode(len, nextNode), s(bytes) { 340 hash=hash*37+ustr_hashCharsN(bytes, len); 341 } 342 343 UBool 344 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { 345 if(this==&other) { 346 return TRUE; 347 } 348 if(!LinearMatchNode::operator==(other)) { 349 return FALSE; 350 } 351 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; 352 return 0==uprv_memcmp(s, o.s, length); 353 } 354 355 void 356 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { 357 BytesTrieBuilder &b=(BytesTrieBuilder &)builder; 358 next->write(builder); 359 b.write(s, length); 360 offset=b.write(b.getMinLinearMatch()+length-1); 361 } 362 363 StringTrieBuilder::Node * 364 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, 365 Node *nextNode) const { 366 return new BTLinearMatchNode( 367 elements[i].getString(*strings).data()+byteIndex, 368 length, 369 nextNode); 370 } 371 372 UBool 373 BytesTrieBuilder::ensureCapacity(int32_t length) { 374 if(bytes==NULL) { 375 return FALSE; // previous memory allocation had failed 376 } 377 if(length>bytesCapacity) { 378 int32_t newCapacity=bytesCapacity; 379 do { 380 newCapacity*=2; 381 } while(newCapacity<=length); 382 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); 383 if(newBytes==NULL) { 384 // unable to allocate memory 385 uprv_free(bytes); 386 bytes=NULL; 387 bytesCapacity=0; 388 return FALSE; 389 } 390 uprv_memcpy(newBytes+(newCapacity-bytesLength), 391 bytes+(bytesCapacity-bytesLength), bytesLength); 392 uprv_free(bytes); 393 bytes=newBytes; 394 bytesCapacity=newCapacity; 395 } 396 return TRUE; 397 } 398 399 int32_t 400 BytesTrieBuilder::write(int32_t byte) { 401 int32_t newLength=bytesLength+1; 402 if(ensureCapacity(newLength)) { 403 bytesLength=newLength; 404 bytes[bytesCapacity-bytesLength]=(char)byte; 405 } 406 return bytesLength; 407 } 408 409 int32_t 410 BytesTrieBuilder::write(const char *b, int32_t length) { 411 int32_t newLength=bytesLength+length; 412 if(ensureCapacity(newLength)) { 413 bytesLength=newLength; 414 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); 415 } 416 return bytesLength; 417 } 418 419 int32_t 420 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { 421 return write(elements[i].getString(*strings).data()+byteIndex, length); 422 } 423 424 int32_t 425 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 426 if(0<=i && i<=BytesTrie::kMaxOneByteValue) { 427 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); 428 } 429 char intBytes[5]; 430 int32_t length=1; 431 if(i<0 || i>0xffffff) { 432 intBytes[0]=(char)BytesTrie::kFiveByteValueLead; 433 intBytes[1]=(char)((uint32_t)i>>24); 434 intBytes[2]=(char)((uint32_t)i>>16); 435 intBytes[3]=(char)((uint32_t)i>>8); 436 intBytes[4]=(char)i; 437 length=5; 438 // } else if(i<=BytesTrie::kMaxOneByteValue) { 439 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); 440 } else { 441 if(i<=BytesTrie::kMaxTwoByteValue) { 442 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); 443 } else { 444 if(i<=BytesTrie::kMaxThreeByteValue) { 445 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); 446 } else { 447 intBytes[0]=(char)BytesTrie::kFourByteValueLead; 448 intBytes[1]=(char)(i>>16); 449 length=2; 450 } 451 intBytes[length++]=(char)(i>>8); 452 } 453 intBytes[length++]=(char)i; 454 } 455 intBytes[0]=(char)((intBytes[0]<<1)|isFinal); 456 return write(intBytes, length); 457 } 458 459 int32_t 460 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 461 int32_t offset=write(node); 462 if(hasValue) { 463 offset=writeValueAndFinal(value, FALSE); 464 } 465 return offset; 466 } 467 468 int32_t 469 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 470 int32_t i=bytesLength-jumpTarget; 471 U_ASSERT(i>=0); 472 if(i<=BytesTrie::kMaxOneByteDelta) { 473 return write(i); 474 } 475 char intBytes[5]; 476 int32_t length; 477 if(i<=BytesTrie::kMaxTwoByteDelta) { 478 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); 479 length=1; 480 } else { 481 if(i<=BytesTrie::kMaxThreeByteDelta) { 482 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); 483 length=2; 484 } else { 485 if(i<=0xffffff) { 486 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; 487 length=3; 488 } else { 489 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; 490 intBytes[1]=(char)(i>>24); 491 length=4; 492 } 493 intBytes[1]=(char)(i>>16); 494 } 495 intBytes[1]=(char)(i>>8); 496 } 497 intBytes[length++]=(char)i; 498 return write(intBytes, length); 499 } 500 501 U_NAMESPACE_END 502