Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2008 International Business Machines Corporation   *
      4 *   and others. All rights reserved.                                      *
      5 ***************************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "unicode/utypes.h"
     13 #include "rbbidata.h"
     14 #include "rbbirb.h"
     15 #include "utrie.h"
     16 #include "udatamem.h"
     17 #include "cmemory.h"
     18 #include "cstring.h"
     19 #include "umutex.h"
     20 
     21 #include "uassert.h"
     22 
     23 
     24 //-----------------------------------------------------------------------------------
     25 //
     26 //   Trie access folding function.  Copied as-is from properties code in uchar.c
     27 //
     28 //-----------------------------------------------------------------------------------
     29 U_CDECL_BEGIN
     30 static int32_t U_CALLCONV
     31 getFoldingOffset(uint32_t data) {
     32     /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
     33     if(data&0x8000) {
     34         return (int32_t)(data&0x7fff);
     35     } else {
     36         return 0;
     37     }
     38 }
     39 U_CDECL_END
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 //-----------------------------------------------------------------------------
     44 //
     45 //    Constructors.
     46 //
     47 //-----------------------------------------------------------------------------
     48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
     49     init(data, status);
     50 }
     51 
     52 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
     53     init(data, status);
     54     fDontFreeData = TRUE;
     55 }
     56 
     57 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
     58     const RBBIDataHeader *d = (const RBBIDataHeader *)
     59         // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
     60         // taking into consideration the padding added in by udata_write
     61         ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
     62     init(d, status);
     63     fUDataMem = udm;
     64 }
     65 
     66 //-----------------------------------------------------------------------------
     67 //
     68 //    init().   Does most of the work of construction, shared between the
     69 //              constructors.
     70 //
     71 //-----------------------------------------------------------------------------
     72 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
     73     if (U_FAILURE(status)) {
     74         return;
     75     }
     76     fHeader = data;
     77     if (fHeader->fMagic != 0xb1a0 ||
     78         !(fHeader->fFormatVersion[0] == 3 ||         // ICU 3.4
     79           *(int32_t *)fHeader->fFormatVersion == 1))  // ICU 3.2 and earlier.
     80     {
     81         status = U_INVALID_FORMAT_ERROR;
     82         return;
     83     }
     84 
     85     fDontFreeData = FALSE;
     86     fUDataMem     = NULL;
     87     fReverseTable = NULL;
     88     fSafeFwdTable = NULL;
     89     fSafeRevTable = NULL;
     90     if (data->fFTableLen != 0) {
     91         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
     92     }
     93     if (data->fRTableLen != 0) {
     94         fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
     95     }
     96     if (data->fSFTableLen != 0) {
     97         fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
     98     }
     99     if (data->fSRTableLen != 0) {
    100         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
    101     }
    102 
    103 
    104     utrie_unserialize(&fTrie,
    105                        (uint8_t *)data + fHeader->fTrie,
    106                        fHeader->fTrieLen,
    107                        &status);
    108     if (U_FAILURE(status)) {
    109         return;
    110     }
    111     fTrie.getFoldingOffset=getFoldingOffset;
    112 
    113 
    114     fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
    115     fRuleString.setTo(TRUE, fRuleSource, -1);
    116     U_ASSERT(data->fRuleSourceLen > 0);
    117 
    118     fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
    119     fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
    120 
    121     fRefCount = 1;
    122 
    123 #ifdef RBBI_DEBUG
    124     char *debugEnv = getenv("U_RBBIDEBUG");
    125     if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
    126 #endif
    127 }
    128 
    129 
    130 //-----------------------------------------------------------------------------
    131 //
    132 //    Destructor.     Don't call this - use removeReference() instead.
    133 //
    134 //-----------------------------------------------------------------------------
    135 RBBIDataWrapper::~RBBIDataWrapper() {
    136     U_ASSERT(fRefCount == 0);
    137     if (fUDataMem) {
    138         udata_close(fUDataMem);
    139     } else if (!fDontFreeData) {
    140         uprv_free((void *)fHeader);
    141     }
    142 }
    143 
    144 
    145 
    146 //-----------------------------------------------------------------------------
    147 //
    148 //   Operator ==    Consider two RBBIDataWrappers to be equal if they
    149 //                  refer to the same underlying data.  Although
    150 //                  the data wrappers are normally shared between
    151 //                  iterator instances, it's possible to independently
    152 //                  open the same data twice, and get two instances, which
    153 //                  should still be ==.
    154 //
    155 //-----------------------------------------------------------------------------
    156 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
    157     if (fHeader == other.fHeader) {
    158         return TRUE;
    159     }
    160     if (fHeader->fLength != other.fHeader->fLength) {
    161         return FALSE;
    162     }
    163     if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
    164         return TRUE;
    165     }
    166     return FALSE;
    167 }
    168 
    169 int32_t  RBBIDataWrapper::hashCode() {
    170     return fHeader->fFTableLen;
    171 }
    172 
    173 
    174 
    175 //-----------------------------------------------------------------------------
    176 //
    177 //    Reference Counting.   A single RBBIDataWrapper object is shared among
    178 //                          however many RulesBasedBreakIterator instances are
    179 //                          referencing the same data.
    180 //
    181 //-----------------------------------------------------------------------------
    182 void RBBIDataWrapper::removeReference() {
    183     if (umtx_atomic_dec(&fRefCount) == 0) {
    184         delete this;
    185     }
    186 }
    187 
    188 
    189 RBBIDataWrapper *RBBIDataWrapper::addReference() {
    190    umtx_atomic_inc(&fRefCount);
    191    return this;
    192 }
    193 
    194 
    195 
    196 //-----------------------------------------------------------------------------
    197 //
    198 //  getRuleSourceString
    199 //
    200 //-----------------------------------------------------------------------------
    201 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
    202     return fRuleString;
    203 }
    204 
    205 
    206 //-----------------------------------------------------------------------------
    207 //
    208 //  print   -  debugging function to dump the runtime data tables.
    209 //
    210 //-----------------------------------------------------------------------------
    211 #ifdef RBBI_DEBUG
    212 void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    213     uint32_t   c;
    214     uint32_t   s;
    215 
    216     RBBIDebugPrintf("   %s\n", heading);
    217 
    218     RBBIDebugPrintf("State |  Acc  LA TagIx");
    219     for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
    220     RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
    221         RBBIDebugPrintf("----");
    222     }
    223     RBBIDebugPrintf("\n");
    224 
    225     if (table == NULL) {
    226         RBBIDebugPrintf("         N U L L   T A B L E\n\n");
    227         return;
    228     }
    229     for (s=0; s<table->fNumStates; s++) {
    230         RBBIStateTableRow *row = (RBBIStateTableRow *)
    231                                   (table->fTableData + (table->fRowLen * s));
    232         RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
    233         for (c=0; c<fHeader->fCatCount; c++)  {
    234             RBBIDebugPrintf("%3d ", row->fNextState[c]);
    235         }
    236         RBBIDebugPrintf("\n");
    237     }
    238     RBBIDebugPrintf("\n");
    239 }
    240 #endif
    241 
    242 
    243 #ifdef RBBI_DEBUG
    244 void  RBBIDataWrapper::printData() {
    245     RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    246     RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
    247                                                     fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
    248     RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
    249     RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
    250 
    251     printTable("Forward State Transition Table", fForwardTable);
    252     printTable("Reverse State Transition Table", fReverseTable);
    253     printTable("Safe Forward State Transition Table", fSafeFwdTable);
    254     printTable("Safe Reverse State Transition Table", fSafeRevTable);
    255 
    256     RBBIDebugPrintf("\nOrignal Rules source:\n");
    257     for (int32_t c=0; fRuleSource[c] != 0; c++) {
    258         RBBIDebugPrintf("%c", fRuleSource[c]);
    259     }
    260     RBBIDebugPrintf("\n\n");
    261 }
    262 #endif
    263 
    264 
    265 U_NAMESPACE_END
    266 U_NAMESPACE_USE
    267 
    268 //-----------------------------------------------------------------------------
    269 //
    270 //  ubrk_swap   -  byte swap and char encoding swap of RBBI data
    271 //
    272 //-----------------------------------------------------------------------------
    273 
    274 U_CAPI int32_t U_EXPORT2
    275 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
    276            UErrorCode *status) {
    277 
    278     if (status == NULL || U_FAILURE(*status)) {
    279         return 0;
    280     }
    281     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
    282         *status=U_ILLEGAL_ARGUMENT_ERROR;
    283         return 0;
    284     }
    285 
    286     //
    287     //  Check that the data header is for for break data.
    288     //    (Header contents are defined in genbrk.cpp)
    289     //
    290     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    291     if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
    292            pInfo->dataFormat[1]==0x72 &&
    293            pInfo->dataFormat[2]==0x6b &&
    294            pInfo->dataFormat[3]==0x20 &&
    295            pInfo->formatVersion[0]==3  )) {
    296         udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
    297                          pInfo->dataFormat[0], pInfo->dataFormat[1],
    298                          pInfo->dataFormat[2], pInfo->dataFormat[3],
    299                          pInfo->formatVersion[0]);
    300         *status=U_UNSUPPORTED_ERROR;
    301         return 0;
    302     }
    303 
    304     //
    305     // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
    306     //                         RBBIDataHeader).  This swap also conveniently gets us
    307     //                         the size of the ICU d.h., which lets us locate the start
    308     //                         of the RBBI specific data.
    309     //
    310     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    311 
    312 
    313     //
    314     // Get the RRBI Data Header, and check that it appears to be OK.
    315     //
    316     //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
    317     //           an int32_t with a value of 1.  Starting with ICU 3.4,
    318     //           RBBI's fDataFormat matches the dataFormat field from the
    319     //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
    320     //
    321     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
    322     RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
    323     UBool           formatVersionOne = ds->readUInt32(*(int32_t *)rbbiDH->fFormatVersion) == 1;
    324     if (ds->readUInt32(rbbiDH->fMagic)   != 0xb1a0 ||
    325         !(formatVersionOne || rbbiDH->fFormatVersion[0] == 3)   ||
    326         ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
    327     {
    328         udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
    329         *status=U_UNSUPPORTED_ERROR;
    330         return 0;
    331     }
    332 
    333     //
    334     // Prefight operation?  Just return the size
    335     //
    336     int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
    337     int32_t totalSize = headerSize + breakDataLength;
    338     if (length < 0) {
    339         return totalSize;
    340     }
    341 
    342     //
    343     // Check that length passed in is consistent with length from RBBI data header.
    344     //
    345     if (length < totalSize) {
    346         udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
    347                             breakDataLength);
    348         *status=U_INDEX_OUTOFBOUNDS_ERROR;
    349         return 0;
    350         }
    351 
    352 
    353     //
    354     // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
    355     //                 we need to reference the header to locate the data, and an
    356     //                 inplace swap of the header leaves it unusable.
    357     //
    358     uint8_t         *outBytes = (uint8_t *)outData + headerSize;
    359     RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
    360 
    361     int32_t   tableStartOffset;
    362     int32_t   tableLength;
    363 
    364     //
    365     // If not swapping in place, zero out the output buffer before starting.
    366     //    Individual tables and other data items within are aligned to 8 byte boundaries
    367     //    when originally created.  Any unused space between items needs to be zero.
    368     //
    369     if (inBytes != outBytes) {
    370         uprv_memset(outBytes, 0, breakDataLength);
    371     }
    372 
    373     //
    374     // Each state table begins with several 32 bit fields.  Calculate the size
    375     //   in bytes of these.
    376     //
    377     int32_t         topSize = offsetof(RBBIStateTable, fTableData);
    378 
    379     // Forward state table.
    380     tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
    381     tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
    382 
    383     if (tableLength > 0) {
    384         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    385                             outBytes+tableStartOffset, status);
    386         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    387                             outBytes+tableStartOffset+topSize, status);
    388     }
    389 
    390     // Reverse state table.  Same layout as forward table, above.
    391     tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
    392     tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
    393 
    394     if (tableLength > 0) {
    395         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    396                             outBytes+tableStartOffset, status);
    397         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    398                             outBytes+tableStartOffset+topSize, status);
    399     }
    400 
    401     // Safe Forward state table.  Same layout as forward table, above.
    402     tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
    403     tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
    404 
    405     if (tableLength > 0) {
    406         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    407                             outBytes+tableStartOffset, status);
    408         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    409                             outBytes+tableStartOffset+topSize, status);
    410     }
    411 
    412     // Safe Reverse state table.  Same layout as forward table, above.
    413     tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
    414     tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
    415 
    416     if (tableLength > 0) {
    417         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    418                             outBytes+tableStartOffset, status);
    419         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    420                             outBytes+tableStartOffset+topSize, status);
    421     }
    422 
    423     // Trie table for character categories
    424     utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
    425                             outBytes+ds->readUInt32(rbbiDH->fTrie), status);
    426 
    427     // Source Rules Text.  It's UChar data
    428     ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
    429                         outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
    430 
    431     // Table of rule status values.  It's all int_32 values
    432     ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
    433                         outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
    434 
    435     // And, last, the header.
    436     //   For the old version one format, the entire header consists of int32_t values.
    437     //   For the newer formats, the fDataFormat field is an array of four bytes.
    438     //   Swap the whole thing as int32_t, then, for the newer format, re-swap the one field.
    439     //
    440     ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
    441     if (formatVersionOne == FALSE) {
    442         ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
    443     }
    444 
    445 
    446     return totalSize;
    447 }
    448 
    449 
    450 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    451