1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2010 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 *************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "unicode/utypes.h" 13 #include "rbbidata.h" 14 #include "rbbirb.h" 15 #include "utrie.h" 16 #include "udatamem.h" 17 #include "cmemory.h" 18 #include "cstring.h" 19 #include "umutex.h" 20 21 #include "uassert.h" 22 23 24 //----------------------------------------------------------------------------------- 25 // 26 // Trie access folding function. Copied as-is from properties code in uchar.c 27 // 28 //----------------------------------------------------------------------------------- 29 U_CDECL_BEGIN 30 static int32_t U_CALLCONV 31 getFoldingOffset(uint32_t data) { 32 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ 33 if(data&0x8000) { 34 return (int32_t)(data&0x7fff); 35 } else { 36 return 0; 37 } 38 } 39 U_CDECL_END 40 41 U_NAMESPACE_BEGIN 42 43 //----------------------------------------------------------------------------- 44 // 45 // Constructors. 
46 // 47 //----------------------------------------------------------------------------- 48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { 49 init(data, status); 50 } 51 52 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { 53 init(data, status); 54 fDontFreeData = TRUE; 55 } 56 57 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { 58 const RBBIDataHeader *d = (const RBBIDataHeader *) 59 // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size); 60 // taking into consideration the padding added in by udata_write 61 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); 62 init(d, status); 63 fUDataMem = udm; 64 } 65 66 //----------------------------------------------------------------------------- 67 // 68 // init(). Does most of the work of construction, shared between the 69 // constructors. 70 // 71 //----------------------------------------------------------------------------- 72 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { 73 if (U_FAILURE(status)) { 74 return; 75 } 76 fHeader = data; 77 if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) 78 { 79 status = U_INVALID_FORMAT_ERROR; 80 return; 81 } 82 // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 83 // that is no longer supported. At that time fFormatVersion was 84 // an int32_t field, rather than an array of 4 bytes. 
85 86 fDontFreeData = FALSE; 87 fUDataMem = NULL; 88 fReverseTable = NULL; 89 fSafeFwdTable = NULL; 90 fSafeRevTable = NULL; 91 if (data->fFTableLen != 0) { 92 fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); 93 } 94 if (data->fRTableLen != 0) { 95 fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); 96 } 97 if (data->fSFTableLen != 0) { 98 fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable); 99 } 100 if (data->fSRTableLen != 0) { 101 fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); 102 } 103 104 105 utrie_unserialize(&fTrie, 106 (uint8_t *)data + fHeader->fTrie, 107 fHeader->fTrieLen, 108 &status); 109 if (U_FAILURE(status)) { 110 return; 111 } 112 fTrie.getFoldingOffset=getFoldingOffset; 113 114 115 fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); 116 fRuleString.setTo(TRUE, fRuleSource, -1); 117 U_ASSERT(data->fRuleSourceLen > 0); 118 119 fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); 120 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); 121 122 fRefCount = 1; 123 124 #ifdef RBBI_DEBUG 125 char *debugEnv = getenv("U_RBBIDEBUG"); 126 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} 127 #endif 128 } 129 130 131 //----------------------------------------------------------------------------- 132 // 133 // Destructor. Don't call this - use removeReference() instead. 134 // 135 //----------------------------------------------------------------------------- 136 RBBIDataWrapper::~RBBIDataWrapper() { 137 U_ASSERT(fRefCount == 0); 138 if (fUDataMem) { 139 udata_close(fUDataMem); 140 } else if (!fDontFreeData) { 141 uprv_free((void *)fHeader); 142 } 143 } 144 145 146 147 //----------------------------------------------------------------------------- 148 // 149 // Operator == Consider two RBBIDataWrappers to be equal if they 150 // refer to the same underlying data. 
Although 151 // the data wrappers are normally shared between 152 // iterator instances, it's possible to independently 153 // open the same data twice, and get two instances, which 154 // should still be ==. 155 // 156 //----------------------------------------------------------------------------- 157 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { 158 if (fHeader == other.fHeader) { 159 return TRUE; 160 } 161 if (fHeader->fLength != other.fHeader->fLength) { 162 return FALSE; 163 } 164 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { 165 return TRUE; 166 } 167 return FALSE; 168 } 169 170 int32_t RBBIDataWrapper::hashCode() { 171 return fHeader->fFTableLen; 172 } 173 174 175 176 //----------------------------------------------------------------------------- 177 // 178 // Reference Counting. A single RBBIDataWrapper object is shared among 179 // however many RulesBasedBreakIterator instances are 180 // referencing the same data. 181 // 182 //----------------------------------------------------------------------------- 183 void RBBIDataWrapper::removeReference() { 184 if (umtx_atomic_dec(&fRefCount) == 0) { 185 delete this; 186 } 187 } 188 189 190 RBBIDataWrapper *RBBIDataWrapper::addReference() { 191 umtx_atomic_inc(&fRefCount); 192 return this; 193 } 194 195 196 197 //----------------------------------------------------------------------------- 198 // 199 // getRuleSourceString 200 // 201 //----------------------------------------------------------------------------- 202 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { 203 return fRuleString; 204 } 205 206 207 //----------------------------------------------------------------------------- 208 // 209 // print - debugging function to dump the runtime data tables. 
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Dump one state table: a heading, a column header with one column per
// character category, and then one line per state.  A NULL table is
// reported as such, after the column header has been printed.
void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    const uint32_t categoryCount = fHeader->fCatCount;
    uint32_t       cat;

    RBBIDebugPrintf(" %s\n", heading);

    // Column header, followed by a matching rule of dashes.
    RBBIDebugPrintf("State | Acc LA TagIx");
    for (cat = 0; cat < categoryCount; ++cat) {
        RBBIDebugPrintf("%3d ", cat);
    }
    RBBIDebugPrintf("\n------|---------------");
    for (cat = 0; cat < categoryCount; ++cat) {
        RBBIDebugPrintf("----");
    }
    RBBIDebugPrintf("\n");

    if (table == NULL) {
        RBBIDebugPrintf(" N U L L T A B L E\n\n");
        return;
    }

    // Rows are located by stepping through fTableData in units of fRowLen.
    for (uint32_t state = 0; state < table->fNumStates; ++state) {
        RBBIStateTableRow *row = (RBBIStateTableRow *)
                                  (table->fTableData + (table->fRowLen * state));
        RBBIDebugPrintf("%4d | %3d %3d %3d ", state, row->fAccepting, row->fLookAhead, row->fTagIdx);
        for (cat = 0; cat < categoryCount; ++cat) {
            RBBIDebugPrintf("%3d ", row->fNextState[cat]);
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n");
}
#endif


#ifdef RBBI_DEBUG
// Dump the whole data image: header summary, the four state tables, and
// the original rule source text.
void  RBBIDataWrapper::printData() {
    const RBBIDataHeader *header = fHeader;

    RBBIDebugPrintf("RBBI Data at %p\n", (void *)header);
    RBBIDebugPrintf(" Version = {%d %d %d %d}\n", header->fFormatVersion[0], header->fFormatVersion[1],
                                                    header->fFormatVersion[2], header->fFormatVersion[3]);
    RBBIDebugPrintf(" total length of data = %d\n", header->fLength);
    RBBIDebugPrintf(" number of character categories = %d\n\n", header->fCatCount);

    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);
    printTable("Safe Forward State Transition Table", fSafeFwdTable);
    printTable("Safe Reverse State Transition Table", fSafeRevTable);

    // The rule source is printed one code unit at a time, up to its
    // terminating zero.
    RBBIDebugPrintf("\nOrignal Rules source:\n");
    for (const UChar *p = fRuleSource; *p != 0; ++p) {
        RBBIDebugPrintf("%c", *p);
    }
    RBBIDebugPrintf("\n\n");
}
#endif


U_NAMESPACE_END 267 U_NAMESPACE_USE 268 269 //----------------------------------------------------------------------------- 270 // 271 // ubrk_swap - byte swap and char encoding swap of RBBI data 272 // 273 //----------------------------------------------------------------------------- 274 275 U_CAPI int32_t U_EXPORT2 276 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 277 UErrorCode *status) { 278 279 if (status == NULL || U_FAILURE(*status)) { 280 return 0; 281 } 282 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 283 *status=U_ILLEGAL_ARGUMENT_ERROR; 284 return 0; 285 } 286 287 // 288 // Check that the data header is for for break data. 289 // (Header contents are defined in genbrk.cpp) 290 // 291 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 292 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ 293 pInfo->dataFormat[1]==0x72 && 294 pInfo->dataFormat[2]==0x6b && 295 pInfo->dataFormat[3]==0x20 && 296 pInfo->formatVersion[0]==3 )) { 297 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 298 pInfo->dataFormat[0], pInfo->dataFormat[1], 299 pInfo->dataFormat[2], pInfo->dataFormat[3], 300 pInfo->formatVersion[0]); 301 *status=U_UNSUPPORTED_ERROR; 302 return 0; 303 } 304 305 // 306 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific 307 // RBBIDataHeader). This swap also conveniently gets us 308 // the size of the ICU d.h., which lets us locate the start 309 // of the RBBI specific data. 310 // 311 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 312 313 314 // 315 // Get the RRBI Data Header, and check that it appears to be OK. 316 // 317 // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually 318 // an int32_t with a value of 1. 
Starting with ICU 3.4, 319 // RBBI's fDataFormat matches the dataFormat field from the 320 // UDataInfo header, four int8_t bytes. The value is {3,1,0,0} 321 // 322 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 323 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; 324 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || 325 rbbiDH->fFormatVersion[0] != 3 || 326 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) 327 { 328 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); 329 *status=U_UNSUPPORTED_ERROR; 330 return 0; 331 } 332 333 // 334 // Prefight operation? Just return the size 335 // 336 int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); 337 int32_t totalSize = headerSize + breakDataLength; 338 if (length < 0) { 339 return totalSize; 340 } 341 342 // 343 // Check that length passed in is consistent with length from RBBI data header. 344 // 345 if (length < totalSize) { 346 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", 347 breakDataLength); 348 *status=U_INDEX_OUTOFBOUNDS_ERROR; 349 return 0; 350 } 351 352 353 // 354 // Swap the Data. Do the data itself first, then the RBBI Data Header, because 355 // we need to reference the header to locate the data, and an 356 // inplace swap of the header leaves it unusable. 357 // 358 uint8_t *outBytes = (uint8_t *)outData + headerSize; 359 RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; 360 361 int32_t tableStartOffset; 362 int32_t tableLength; 363 364 // 365 // If not swapping in place, zero out the output buffer before starting. 366 // Individual tables and other data items within are aligned to 8 byte boundaries 367 // when originally created. Any unused space between items needs to be zero. 368 // 369 if (inBytes != outBytes) { 370 uprv_memset(outBytes, 0, breakDataLength); 371 } 372 373 // 374 // Each state table begins with several 32 bit fields. Calculate the size 375 // in bytes of these. 
376 // 377 int32_t topSize = offsetof(RBBIStateTable, fTableData); 378 379 // Forward state table. 380 tableStartOffset = ds->readUInt32(rbbiDH->fFTable); 381 tableLength = ds->readUInt32(rbbiDH->fFTableLen); 382 383 if (tableLength > 0) { 384 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 385 outBytes+tableStartOffset, status); 386 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 387 outBytes+tableStartOffset+topSize, status); 388 } 389 390 // Reverse state table. Same layout as forward table, above. 391 tableStartOffset = ds->readUInt32(rbbiDH->fRTable); 392 tableLength = ds->readUInt32(rbbiDH->fRTableLen); 393 394 if (tableLength > 0) { 395 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 396 outBytes+tableStartOffset, status); 397 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 398 outBytes+tableStartOffset+topSize, status); 399 } 400 401 // Safe Forward state table. Same layout as forward table, above. 402 tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); 403 tableLength = ds->readUInt32(rbbiDH->fSFTableLen); 404 405 if (tableLength > 0) { 406 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 407 outBytes+tableStartOffset, status); 408 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 409 outBytes+tableStartOffset+topSize, status); 410 } 411 412 // Safe Reverse state table. Same layout as forward table, above. 
413 tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); 414 tableLength = ds->readUInt32(rbbiDH->fSRTableLen); 415 416 if (tableLength > 0) { 417 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 418 outBytes+tableStartOffset, status); 419 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 420 outBytes+tableStartOffset+topSize, status); 421 } 422 423 // Trie table for character categories 424 utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), 425 outBytes+ds->readUInt32(rbbiDH->fTrie), status); 426 427 // Source Rules Text. It's UChar data 428 ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), 429 outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); 430 431 // Table of rule status values. It's all int_32 values 432 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), 433 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); 434 435 // And, last, the header. 436 // It is all int32_t values except for fFormataVersion, which is an array of four bytes. 437 // Swap the whole thing as int32_t, then re-swap the one field. 438 // 439 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); 440 ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); 441 442 return totalSize; 443 } 444 445 446 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 447