1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucharstriebuilder.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010nov14 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/ucharstrie.h" 17 #include "unicode/ucharstriebuilder.h" 18 #include "unicode/unistr.h" 19 #include "unicode/ustring.h" 20 #include "cmemory.h" 21 #include "uarrsort.h" 22 #include "uassert.h" 23 #include "uhash.h" 24 #include "ustr_imp.h" 25 26 U_NAMESPACE_BEGIN 27 28 /* 29 * Note: This builder implementation stores (string, value) pairs with full copies 30 * of the 16-bit-unit sequences, until the UCharsTrie is built. 31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 32 */ 33 34 class UCharsTrieElement : public UMemory { 35 public: 36 // Use compiler's default constructor, initializes nothing. 37 38 void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); 39 40 UnicodeString getString(const UnicodeString &strings) const { 41 int32_t length=strings[stringOffset]; 42 return strings.tempSubString(stringOffset+1, length); 43 } 44 int32_t getStringLength(const UnicodeString &strings) const { 45 return strings[stringOffset]; 46 } 47 48 UChar charAt(int32_t index, const UnicodeString &strings) const { 49 return strings[stringOffset+1+index]; 50 } 51 52 int32_t getValue() const { return value; } 53 54 int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; 55 56 private: 57 // The first strings unit contains the string length. 58 // (Compared with a stringLength field here, this saves 2 bytes per string.) 59 int32_t stringOffset; 60 int32_t value; 61 }; 62 63 void 64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, 65 UnicodeString &strings, UErrorCode &errorCode) { 66 if(U_FAILURE(errorCode)) { 67 return; 68 } 69 int32_t length=s.length(); 70 if(length>0xffff) { 71 // Too long: We store the length in 1 unit. 72 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 73 return; 74 } 75 stringOffset=strings.length(); 76 strings.append((UChar)length); 77 value=val; 78 strings.append(s); 79 } 80 81 int32_t 82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { 83 return getString(strings).compare(other.getString(strings)); 84 } 85 86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) 87 : elements(NULL), elementsCapacity(0), elementsLength(0), 88 uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} 89 90 UCharsTrieBuilder::~UCharsTrieBuilder() { 91 delete[] elements; 92 uprv_free(uchars); 93 } 94 95 UCharsTrieBuilder & 96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { 97 if(U_FAILURE(errorCode)) { 98 return *this; 99 } 100 if(ucharsLength>0) { 101 // Cannot add elements after building. 102 errorCode=U_NO_WRITE_PERMISSION; 103 return *this; 104 } 105 if(elementsLength==elementsCapacity) { 106 int32_t newCapacity; 107 if(elementsCapacity==0) { 108 newCapacity=1024; 109 } else { 110 newCapacity=4*elementsCapacity; 111 } 112 UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; 113 if(newElements==NULL) { 114 errorCode=U_MEMORY_ALLOCATION_ERROR; 115 return *this; 116 } 117 if(elementsLength>0) { 118 uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement)); 119 } 120 delete[] elements; 121 elements=newElements; 122 elementsCapacity=newCapacity; 123 } 124 elements[elementsLength++].setTo(s, value, strings, errorCode); 125 if(U_SUCCESS(errorCode) && strings.isBogus()) { 126 errorCode=U_MEMORY_ALLOCATION_ERROR; 127 } 128 return *this; 129 } 130 131 U_CDECL_BEGIN 132 133 static int32_t U_CALLCONV 134 compareElementStrings(const void *context, const void *left, const void *right) { 135 const UnicodeString *strings=static_cast<const UnicodeString *>(context); 136 const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left); 137 const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right); 138 return leftElement->compareStringTo(*rightElement, *strings); 139 } 140 141 U_CDECL_END 142 143 UCharsTrie * 144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 145 buildUChars(buildOption, errorCode); 146 UCharsTrie *newTrie=NULL; 147 if(U_SUCCESS(errorCode)) { 148 newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); 149 if(newTrie==NULL) { 150 errorCode=U_MEMORY_ALLOCATION_ERROR; 151 } else { 152 uchars=NULL; // The new trie now owns the array. 153 ucharsCapacity=0; 154 } 155 } 156 return newTrie; 157 } 158 159 UnicodeString & 160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, 161 UErrorCode &errorCode) { 162 buildUChars(buildOption, errorCode); 163 if(U_SUCCESS(errorCode)) { 164 result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); 165 } 166 return result; 167 } 168 169 void 170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 171 if(U_FAILURE(errorCode)) { 172 return; 173 } 174 if(uchars!=NULL && ucharsLength>0) { 175 // Already built. 176 return; 177 } 178 if(ucharsLength==0) { 179 if(elementsLength==0) { 180 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 181 return; 182 } 183 if(strings.isBogus()) { 184 errorCode=U_MEMORY_ALLOCATION_ERROR; 185 return; 186 } 187 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), 188 compareElementStrings, &strings, 189 FALSE, // need not be a stable sort 190 &errorCode); 191 if(U_FAILURE(errorCode)) { 192 return; 193 } 194 // Duplicate strings are not allowed. 195 UnicodeString prev=elements[0].getString(strings); 196 for(int32_t i=1; i<elementsLength; ++i) { 197 UnicodeString current=elements[i].getString(strings); 198 if(prev==current) { 199 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 200 return; 201 } 202 prev.fastCopyFrom(current); 203 } 204 } 205 // Create and UChar-serialize the trie for the elements. 206 ucharsLength=0; 207 int32_t capacity=strings.length(); 208 if(capacity<1024) { 209 capacity=1024; 210 } 211 if(ucharsCapacity<capacity) { 212 uprv_free(uchars); 213 uchars=static_cast<UChar *>(uprv_malloc(capacity*2)); 214 if(uchars==NULL) { 215 errorCode=U_MEMORY_ALLOCATION_ERROR; 216 ucharsCapacity=0; 217 return; 218 } 219 ucharsCapacity=capacity; 220 } 221 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 222 if(uchars==NULL) { 223 errorCode=U_MEMORY_ALLOCATION_ERROR; 224 } 225 } 226 227 int32_t 228 UCharsTrieBuilder::getElementStringLength(int32_t i) const { 229 return elements[i].getStringLength(strings); 230 } 231 232 UChar 233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { 234 return elements[i].charAt(unitIndex, strings); 235 } 236 237 int32_t 238 UCharsTrieBuilder::getElementValue(int32_t i) const { 239 return elements[i].getValue(); 240 } 241 242 int32_t 243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { 244 const UCharsTrieElement &firstElement=elements[first]; 245 const UCharsTrieElement &lastElement=elements[last]; 246 int32_t minStringLength=firstElement.getStringLength(strings); 247 while(++unitIndex<minStringLength && 248 firstElement.charAt(unitIndex, strings)== 249 lastElement.charAt(unitIndex, strings)) {} 250 return unitIndex; 251 } 252 253 int32_t 254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { 255 int32_t length=0; // Number of different units at unitIndex. 256 int32_t i=start; 257 do { 258 UChar unit=elements[i++].charAt(unitIndex, strings); 259 while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { 260 ++i; 261 } 262 ++length; 263 } while(i<limit); 264 return length; 265 } 266 267 int32_t 268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { 269 do { 270 UChar unit=elements[i++].charAt(unitIndex, strings); 271 while(unit==elements[i].charAt(unitIndex, strings)) { 272 ++i; 273 } 274 } while(--count>0); 275 return i; 276 } 277 278 int32_t 279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { 280 while(unit==elements[i].charAt(unitIndex, strings)) { 281 ++i; 282 } 283 return i; 284 } 285 286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) 287 : LinearMatchNode(len, nextNode), s(units) { 288 hash=hash*37+ustr_hashUCharsN(units, len); 289 } 290 291 UBool 292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { 293 if(this==&other) { 294 return TRUE; 295 } 296 if(!LinearMatchNode::operator==(other)) { 297 return FALSE; 298 } 299 const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; 300 return 0==u_memcmp(s, o.s, length); 301 } 302 303 void 304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { 305 UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; 306 next->write(builder); 307 b.write(s, length); 308 offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); 309 } 310 311 StringTrieBuilder::Node * 312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 313 Node *nextNode) const { 314 return new UCTLinearMatchNode( 315 elements[i].getString(strings).getBuffer()+unitIndex, 316 length, 317 nextNode); 318 } 319 320 UBool 321 UCharsTrieBuilder::ensureCapacity(int32_t length) { 322 if(uchars==NULL) { 323 return FALSE; // previous memory allocation had failed 324 } 325 if(length>ucharsCapacity) { 326 int32_t newCapacity=ucharsCapacity; 327 do { 328 newCapacity*=2; 329 } while(newCapacity<=length); 330 UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2)); 331 if(newUChars==NULL) { 332 // unable to allocate memory 333 uprv_free(uchars); 334 uchars=NULL; 335 ucharsCapacity=0; 336 return FALSE; 337 } 338 u_memcpy(newUChars+(newCapacity-ucharsLength), 339 uchars+(ucharsCapacity-ucharsLength), ucharsLength); 340 uprv_free(uchars); 341 uchars=newUChars; 342 ucharsCapacity=newCapacity; 343 } 344 return TRUE; 345 } 346 347 int32_t 348 UCharsTrieBuilder::write(int32_t unit) { 349 int32_t newLength=ucharsLength+1; 350 if(ensureCapacity(newLength)) { 351 ucharsLength=newLength; 352 uchars[ucharsCapacity-ucharsLength]=(UChar)unit; 353 } 354 return ucharsLength; 355 } 356 357 int32_t 358 UCharsTrieBuilder::write(const UChar *s, int32_t length) { 359 int32_t newLength=ucharsLength+length; 360 if(ensureCapacity(newLength)) { 361 ucharsLength=newLength; 362 u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); 363 } 364 return ucharsLength; 365 } 366 367 int32_t 368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { 369 return write(elements[i].getString(strings).getBuffer()+unitIndex, length); 370 } 371 372 int32_t 373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 374 if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { 375 return write(i|(isFinal<<15)); 376 } 377 UChar intUnits[3]; 378 int32_t length; 379 if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { 380 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); 381 intUnits[1]=(UChar)((uint32_t)i>>16); 382 intUnits[2]=(UChar)i; 383 length=3; 384 // } else if(i<=UCharsTrie::kMaxOneUnitValue) { 385 // intUnits[0]=(UChar)(i); 386 // length=1; 387 } else { 388 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); 389 intUnits[1]=(UChar)i; 390 length=2; 391 } 392 intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); 393 return write(intUnits, length); 394 } 395 396 int32_t 397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 398 if(!hasValue) { 399 return write(node); 400 } 401 UChar intUnits[3]; 402 int32_t length; 403 if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { 404 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); 405 intUnits[1]=(UChar)((uint32_t)value>>16); 406 intUnits[2]=(UChar)value; 407 length=3; 408 } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { 409 intUnits[0]=(UChar)((value+1)<<6); 410 length=1; 411 } else { 412 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); 413 intUnits[1]=(UChar)value; 414 length=2; 415 } 416 intUnits[0]|=(UChar)node; 417 return write(intUnits, length); 418 } 419 420 int32_t 421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 422 int32_t i=ucharsLength-jumpTarget; 423 U_ASSERT(i>=0); 424 if(i<=UCharsTrie::kMaxOneUnitDelta) { 425 return write(i); 426 } 427 UChar intUnits[3]; 428 int32_t length; 429 if(i<=UCharsTrie::kMaxTwoUnitDelta) { 430 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); 431 length=1; 432 } else { 433 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); 434 intUnits[1]=(UChar)(i>>16); 435 length=2; 436 } 437 intUnits[length++]=(UChar)i; 438 return write(intUnits, length); 439 } 440 441 U_NAMESPACE_END 442