1 /* 2 *************************************************************************** 3 * Copyright (C) 1999-2014 International Business Machines Corporation * 4 * and others. All rights reserved. * 5 *************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "unicode/utypes.h" 13 #include "rbbidata.h" 14 #include "rbbirb.h" 15 #include "utrie.h" 16 #include "udatamem.h" 17 #include "cmemory.h" 18 #include "cstring.h" 19 #include "umutex.h" 20 21 #include "uassert.h" 22 23 24 //----------------------------------------------------------------------------------- 25 // 26 // Trie access folding function. Copied as-is from properties code in uchar.c 27 // 28 //----------------------------------------------------------------------------------- 29 U_CDECL_BEGIN 30 static int32_t U_CALLCONV 31 getFoldingOffset(uint32_t data) { 32 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ 33 if(data&0x8000) { 34 return (int32_t)(data&0x7fff); 35 } else { 36 return 0; 37 } 38 } 39 U_CDECL_END 40 41 U_NAMESPACE_BEGIN 42 43 //----------------------------------------------------------------------------- 44 // 45 // Constructors. 46 // 47 //----------------------------------------------------------------------------- 48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { 49 init0(); 50 init(data, status); 51 } 52 53 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { 54 init0(); 55 init(data, status); 56 fDontFreeData = TRUE; 57 } 58 59 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { 60 init0(); 61 if (U_FAILURE(status)) { 62 return; 63 } 64 const DataHeader *dh = udm->pHeader; 65 int32_t headerSize = dh->dataHeader.headerSize; 66 if ( !(headerSize >= 20 && 67 dh->info.isBigEndian == U_IS_BIG_ENDIAN && 68 dh->info.charsetFamily == U_CHARSET_FAMILY && 69 dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk " 70 dh->info.dataFormat[1] == 0x72 && 71 dh->info.dataFormat[2] == 0x6b && 72 dh->info.dataFormat[3] == 0x20) 73 // Note: info.fFormatVersion is duplicated in the RBBIDataHeader, and is 74 // validated when checking that. 75 ) { 76 status = U_INVALID_FORMAT_ERROR; 77 return; 78 } 79 const char *dataAsBytes = reinterpret_cast<const char *>(dh); 80 const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize); 81 init(rbbidh, status); 82 fUDataMem = udm; 83 } 84 85 //----------------------------------------------------------------------------- 86 // 87 // init(). Does most of the work of construction, shared between the 88 // constructors. 89 // 90 //----------------------------------------------------------------------------- 91 void RBBIDataWrapper::init0() { 92 fHeader = NULL; 93 fForwardTable = NULL; 94 fReverseTable = NULL; 95 fSafeFwdTable = NULL; 96 fSafeRevTable = NULL; 97 fRuleSource = NULL; 98 fRuleStatusTable = NULL; 99 fUDataMem = NULL; 100 fRefCount = 0; 101 fDontFreeData = TRUE; 102 } 103 104 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { 105 if (U_FAILURE(status)) { 106 return; 107 } 108 fHeader = data; 109 if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3) 110 { 111 status = U_INVALID_FORMAT_ERROR; 112 return; 113 } 114 // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 115 // that is no longer supported. At that time fFormatVersion was 116 // an int32_t field, rather than an array of 4 bytes. 117 118 fDontFreeData = FALSE; 119 if (data->fFTableLen != 0) { 120 fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); 121 } 122 if (data->fRTableLen != 0) { 123 fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); 124 } 125 if (data->fSFTableLen != 0) { 126 fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable); 127 } 128 if (data->fSRTableLen != 0) { 129 fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); 130 } 131 132 133 utrie_unserialize(&fTrie, 134 (uint8_t *)data + fHeader->fTrie, 135 fHeader->fTrieLen, 136 &status); 137 if (U_FAILURE(status)) { 138 return; 139 } 140 fTrie.getFoldingOffset=getFoldingOffset; 141 142 143 fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); 144 fRuleString.setTo(TRUE, fRuleSource, -1); 145 U_ASSERT(data->fRuleSourceLen > 0); 146 147 fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); 148 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); 149 150 fRefCount = 1; 151 152 #ifdef RBBI_DEBUG 153 char *debugEnv = getenv("U_RBBIDEBUG"); 154 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} 155 #endif 156 } 157 158 159 //----------------------------------------------------------------------------- 160 // 161 // Destructor. Don't call this - use removeReference() instead. 162 // 163 //----------------------------------------------------------------------------- 164 RBBIDataWrapper::~RBBIDataWrapper() { 165 U_ASSERT(fRefCount == 0); 166 if (fUDataMem) { 167 udata_close(fUDataMem); 168 } else if (!fDontFreeData) { 169 uprv_free((void *)fHeader); 170 } 171 } 172 173 174 175 //----------------------------------------------------------------------------- 176 // 177 // Operator == Consider two RBBIDataWrappers to be equal if they 178 // refer to the same underlying data. Although 179 // the data wrappers are normally shared between 180 // iterator instances, it's possible to independently 181 // open the same data twice, and get two instances, which 182 // should still be ==. 183 // 184 //----------------------------------------------------------------------------- 185 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { 186 if (fHeader == other.fHeader) { 187 return TRUE; 188 } 189 if (fHeader->fLength != other.fHeader->fLength) { 190 return FALSE; 191 } 192 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { 193 return TRUE; 194 } 195 return FALSE; 196 } 197 198 int32_t RBBIDataWrapper::hashCode() { 199 return fHeader->fFTableLen; 200 } 201 202 203 204 //----------------------------------------------------------------------------- 205 // 206 // Reference Counting. A single RBBIDataWrapper object is shared among 207 // however many RulesBasedBreakIterator instances are 208 // referencing the same data. 209 // 210 //----------------------------------------------------------------------------- 211 void RBBIDataWrapper::removeReference() { 212 if (umtx_atomic_dec(&fRefCount) == 0) { 213 delete this; 214 } 215 } 216 217 218 RBBIDataWrapper *RBBIDataWrapper::addReference() { 219 umtx_atomic_inc(&fRefCount); 220 return this; 221 } 222 223 224 225 //----------------------------------------------------------------------------- 226 // 227 // getRuleSourceString 228 // 229 //----------------------------------------------------------------------------- 230 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { 231 return fRuleString; 232 } 233 234 235 //----------------------------------------------------------------------------- 236 // 237 // print - debugging function to dump the runtime data tables. 238 // 239 //----------------------------------------------------------------------------- 240 #ifdef RBBI_DEBUG 241 void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { 242 uint32_t c; 243 uint32_t s; 244 245 RBBIDebugPrintf(" %s\n", heading); 246 247 RBBIDebugPrintf("State | Acc LA TagIx"); 248 for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} 249 RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) { 250 RBBIDebugPrintf("----"); 251 } 252 RBBIDebugPrintf("\n"); 253 254 if (table == NULL) { 255 RBBIDebugPrintf(" N U L L T A B L E\n\n"); 256 return; 257 } 258 for (s=0; s<table->fNumStates; s++) { 259 RBBIStateTableRow *row = (RBBIStateTableRow *) 260 (table->fTableData + (table->fRowLen * s)); 261 RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); 262 for (c=0; c<fHeader->fCatCount; c++) { 263 RBBIDebugPrintf("%3d ", row->fNextState[c]); 264 } 265 RBBIDebugPrintf("\n"); 266 } 267 RBBIDebugPrintf("\n"); 268 } 269 #endif 270 271 272 #ifdef RBBI_DEBUG 273 void RBBIDataWrapper::printData() { 274 RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); 275 RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], 276 fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); 277 RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); 278 RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); 279 280 printTable("Forward State Transition Table", fForwardTable); 281 printTable("Reverse State Transition Table", fReverseTable); 282 printTable("Safe Forward State Transition Table", fSafeFwdTable); 283 printTable("Safe Reverse State Transition Table", fSafeRevTable); 284 285 RBBIDebugPrintf("\nOrignal Rules source:\n"); 286 for (int32_t c=0; fRuleSource[c] != 0; c++) { 287 RBBIDebugPrintf("%c", fRuleSource[c]); 288 } 289 RBBIDebugPrintf("\n\n"); 290 } 291 #endif 292 293 294 U_NAMESPACE_END 295 U_NAMESPACE_USE 296 297 //----------------------------------------------------------------------------- 298 // 299 // ubrk_swap - byte swap and char encoding swap of RBBI data 300 // 301 //----------------------------------------------------------------------------- 302 303 U_CAPI int32_t U_EXPORT2 304 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 305 UErrorCode *status) { 306 307 if (status == NULL || U_FAILURE(*status)) { 308 return 0; 309 } 310 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 311 *status=U_ILLEGAL_ARGUMENT_ERROR; 312 return 0; 313 } 314 315 // 316 // Check that the data header is for for break data. 317 // (Header contents are defined in genbrk.cpp) 318 // 319 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 320 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ 321 pInfo->dataFormat[1]==0x72 && 322 pInfo->dataFormat[2]==0x6b && 323 pInfo->dataFormat[3]==0x20 && 324 pInfo->formatVersion[0]==3 )) { 325 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 326 pInfo->dataFormat[0], pInfo->dataFormat[1], 327 pInfo->dataFormat[2], pInfo->dataFormat[3], 328 pInfo->formatVersion[0]); 329 *status=U_UNSUPPORTED_ERROR; 330 return 0; 331 } 332 333 // 334 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific 335 // RBBIDataHeader). This swap also conveniently gets us 336 // the size of the ICU d.h., which lets us locate the start 337 // of the RBBI specific data. 338 // 339 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 340 341 342 // 343 // Get the RRBI Data Header, and check that it appears to be OK. 344 // 345 // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually 346 // an int32_t with a value of 1. Starting with ICU 3.4, 347 // RBBI's fDataFormat matches the dataFormat field from the 348 // UDataInfo header, four int8_t bytes. The value is {3,1,0,0} 349 // 350 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 351 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; 352 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || 353 rbbiDH->fFormatVersion[0] != 3 || 354 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) 355 { 356 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); 357 *status=U_UNSUPPORTED_ERROR; 358 return 0; 359 } 360 361 // 362 // Prefight operation? Just return the size 363 // 364 int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); 365 int32_t totalSize = headerSize + breakDataLength; 366 if (length < 0) { 367 return totalSize; 368 } 369 370 // 371 // Check that length passed in is consistent with length from RBBI data header. 372 // 373 if (length < totalSize) { 374 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", 375 breakDataLength); 376 *status=U_INDEX_OUTOFBOUNDS_ERROR; 377 return 0; 378 } 379 380 381 // 382 // Swap the Data. Do the data itself first, then the RBBI Data Header, because 383 // we need to reference the header to locate the data, and an 384 // inplace swap of the header leaves it unusable. 385 // 386 uint8_t *outBytes = (uint8_t *)outData + headerSize; 387 RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; 388 389 int32_t tableStartOffset; 390 int32_t tableLength; 391 392 // 393 // If not swapping in place, zero out the output buffer before starting. 394 // Individual tables and other data items within are aligned to 8 byte boundaries 395 // when originally created. Any unused space between items needs to be zero. 396 // 397 if (inBytes != outBytes) { 398 uprv_memset(outBytes, 0, breakDataLength); 399 } 400 401 // 402 // Each state table begins with several 32 bit fields. Calculate the size 403 // in bytes of these. 404 // 405 int32_t topSize = offsetof(RBBIStateTable, fTableData); 406 407 // Forward state table. 408 tableStartOffset = ds->readUInt32(rbbiDH->fFTable); 409 tableLength = ds->readUInt32(rbbiDH->fFTableLen); 410 411 if (tableLength > 0) { 412 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 413 outBytes+tableStartOffset, status); 414 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 415 outBytes+tableStartOffset+topSize, status); 416 } 417 418 // Reverse state table. Same layout as forward table, above. 419 tableStartOffset = ds->readUInt32(rbbiDH->fRTable); 420 tableLength = ds->readUInt32(rbbiDH->fRTableLen); 421 422 if (tableLength > 0) { 423 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 424 outBytes+tableStartOffset, status); 425 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 426 outBytes+tableStartOffset+topSize, status); 427 } 428 429 // Safe Forward state table. Same layout as forward table, above. 430 tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); 431 tableLength = ds->readUInt32(rbbiDH->fSFTableLen); 432 433 if (tableLength > 0) { 434 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 435 outBytes+tableStartOffset, status); 436 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 437 outBytes+tableStartOffset+topSize, status); 438 } 439 440 // Safe Reverse state table. Same layout as forward table, above. 441 tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); 442 tableLength = ds->readUInt32(rbbiDH->fSRTableLen); 443 444 if (tableLength > 0) { 445 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 446 outBytes+tableStartOffset, status); 447 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 448 outBytes+tableStartOffset+topSize, status); 449 } 450 451 // Trie table for character categories 452 utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), 453 outBytes+ds->readUInt32(rbbiDH->fTrie), status); 454 455 // Source Rules Text. It's UChar data 456 ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), 457 outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); 458 459 // Table of rule status values. It's all int_32 values 460 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), 461 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); 462 463 // And, last, the header. 464 // It is all int32_t values except for fFormataVersion, which is an array of four bytes. 465 // Swap the whole thing as int32_t, then re-swap the one field. 466 // 467 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); 468 ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); 469 470 return totalSize; 471 } 472 473 474 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 475