Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2010 International Business Machines Corporation   *
      4 *   and others. All rights reserved.                                      *
      5 ***************************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "unicode/utypes.h"
     13 #include "rbbidata.h"
     14 #include "rbbirb.h"
     15 #include "utrie.h"
     16 #include "udatamem.h"
     17 #include "cmemory.h"
     18 #include "cstring.h"
     19 #include "umutex.h"
     20 
     21 #include "uassert.h"
     22 
     23 
     24 //-----------------------------------------------------------------------------------
     25 //
     26 //   Trie access folding function.  Copied as-is from properties code in uchar.c
     27 //
     28 //-----------------------------------------------------------------------------------
     29 U_CDECL_BEGIN
     30 static int32_t U_CALLCONV
     31 getFoldingOffset(uint32_t data) {
     32     /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
     33     if(data&0x8000) {
     34         return (int32_t)(data&0x7fff);
     35     } else {
     36         return 0;
     37     }
     38 }
     39 U_CDECL_END
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 //-----------------------------------------------------------------------------
     44 //
     45 //    Constructors.
     46 //
     47 //-----------------------------------------------------------------------------
     48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
     49     init(data, status);
     50 }
     51 
     52 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
     53     init(data, status);
     54     fDontFreeData = TRUE;
     55 }
     56 
     57 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
     58     const RBBIDataHeader *d = (const RBBIDataHeader *)
     59         // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
     60         // taking into consideration the padding added in by udata_write
     61         ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
     62     init(d, status);
     63     fUDataMem = udm;
     64 }
     65 
     66 //-----------------------------------------------------------------------------
     67 //
     68 //    init().   Does most of the work of construction, shared between the
     69 //              constructors.
     70 //
     71 //-----------------------------------------------------------------------------
     72 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
     73     if (U_FAILURE(status)) {
     74         return;
     75     }
     76     fHeader = data;
     77     if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
     78     {
     79         status = U_INVALID_FORMAT_ERROR;
     80         return;
     81     }
     82     // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
     83     //       that is no longer supported.  At that time fFormatVersion was
     84     //       an int32_t field, rather than an array of 4 bytes.
     85 
     86     fDontFreeData = FALSE;
     87     fUDataMem     = NULL;
     88     fReverseTable = NULL;
     89     fSafeFwdTable = NULL;
     90     fSafeRevTable = NULL;
     91     if (data->fFTableLen != 0) {
     92         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
     93     }
     94     if (data->fRTableLen != 0) {
     95         fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
     96     }
     97     if (data->fSFTableLen != 0) {
     98         fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
     99     }
    100     if (data->fSRTableLen != 0) {
    101         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
    102     }
    103 
    104 
    105     utrie_unserialize(&fTrie,
    106                        (uint8_t *)data + fHeader->fTrie,
    107                        fHeader->fTrieLen,
    108                        &status);
    109     if (U_FAILURE(status)) {
    110         return;
    111     }
    112     fTrie.getFoldingOffset=getFoldingOffset;
    113 
    114 
    115     fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
    116     fRuleString.setTo(TRUE, fRuleSource, -1);
    117     U_ASSERT(data->fRuleSourceLen > 0);
    118 
    119     fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
    120     fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
    121 
    122     fRefCount = 1;
    123 
    124 #ifdef RBBI_DEBUG
    125     char *debugEnv = getenv("U_RBBIDEBUG");
    126     if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
    127 #endif
    128 }
    129 
    130 
    131 //-----------------------------------------------------------------------------
    132 //
    133 //    Destructor.     Don't call this - use removeReference() instead.
    134 //
    135 //-----------------------------------------------------------------------------
    136 RBBIDataWrapper::~RBBIDataWrapper() {
    137     U_ASSERT(fRefCount == 0);
    138     if (fUDataMem) {
    139         udata_close(fUDataMem);
    140     } else if (!fDontFreeData) {
    141         uprv_free((void *)fHeader);
    142     }
    143 }
    144 
    145 
    146 
    147 //-----------------------------------------------------------------------------
    148 //
    149 //   Operator ==    Consider two RBBIDataWrappers to be equal if they
    150 //                  refer to the same underlying data.  Although
    151 //                  the data wrappers are normally shared between
    152 //                  iterator instances, it's possible to independently
    153 //                  open the same data twice, and get two instances, which
    154 //                  should still be ==.
    155 //
    156 //-----------------------------------------------------------------------------
    157 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
    158     if (fHeader == other.fHeader) {
    159         return TRUE;
    160     }
    161     if (fHeader->fLength != other.fHeader->fLength) {
    162         return FALSE;
    163     }
    164     if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
    165         return TRUE;
    166     }
    167     return FALSE;
    168 }
    169 
    170 int32_t  RBBIDataWrapper::hashCode() {
    171     return fHeader->fFTableLen;
    172 }
    173 
    174 
    175 
    176 //-----------------------------------------------------------------------------
    177 //
    178 //    Reference Counting.   A single RBBIDataWrapper object is shared among
    179 //                          however many RulesBasedBreakIterator instances are
    180 //                          referencing the same data.
    181 //
    182 //-----------------------------------------------------------------------------
    183 void RBBIDataWrapper::removeReference() {
    184     if (umtx_atomic_dec(&fRefCount) == 0) {
    185         delete this;
    186     }
    187 }
    188 
    189 
    190 RBBIDataWrapper *RBBIDataWrapper::addReference() {
    191    umtx_atomic_inc(&fRefCount);
    192    return this;
    193 }
    194 
    195 
    196 
    197 //-----------------------------------------------------------------------------
    198 //
    199 //  getRuleSourceString
    200 //
    201 //-----------------------------------------------------------------------------
    202 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
    203     return fRuleString;
    204 }
    205 
    206 
    207 //-----------------------------------------------------------------------------
    208 //
    209 //  print   -  debugging function to dump the runtime data tables.
    210 //
    211 //-----------------------------------------------------------------------------
    212 #ifdef RBBI_DEBUG
    213 void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    214     uint32_t   c;
    215     uint32_t   s;
    216 
    217     RBBIDebugPrintf("   %s\n", heading);
    218 
    219     RBBIDebugPrintf("State |  Acc  LA TagIx");
    220     for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
    221     RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
    222         RBBIDebugPrintf("----");
    223     }
    224     RBBIDebugPrintf("\n");
    225 
    226     if (table == NULL) {
    227         RBBIDebugPrintf("         N U L L   T A B L E\n\n");
    228         return;
    229     }
    230     for (s=0; s<table->fNumStates; s++) {
    231         RBBIStateTableRow *row = (RBBIStateTableRow *)
    232                                   (table->fTableData + (table->fRowLen * s));
    233         RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
    234         for (c=0; c<fHeader->fCatCount; c++)  {
    235             RBBIDebugPrintf("%3d ", row->fNextState[c]);
    236         }
    237         RBBIDebugPrintf("\n");
    238     }
    239     RBBIDebugPrintf("\n");
    240 }
    241 #endif
    242 
    243 
    244 #ifdef RBBI_DEBUG
    245 void  RBBIDataWrapper::printData() {
    246     RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    247     RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
    248                                                     fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
    249     RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
    250     RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
    251 
    252     printTable("Forward State Transition Table", fForwardTable);
    253     printTable("Reverse State Transition Table", fReverseTable);
    254     printTable("Safe Forward State Transition Table", fSafeFwdTable);
    255     printTable("Safe Reverse State Transition Table", fSafeRevTable);
    256 
    257     RBBIDebugPrintf("\nOrignal Rules source:\n");
    258     for (int32_t c=0; fRuleSource[c] != 0; c++) {
    259         RBBIDebugPrintf("%c", fRuleSource[c]);
    260     }
    261     RBBIDebugPrintf("\n\n");
    262 }
    263 #endif
    264 
    265 
    266 U_NAMESPACE_END
    267 U_NAMESPACE_USE
    268 
    269 //-----------------------------------------------------------------------------
    270 //
    271 //  ubrk_swap   -  byte swap and char encoding swap of RBBI data
    272 //
    273 //-----------------------------------------------------------------------------
    274 
    275 U_CAPI int32_t U_EXPORT2
    276 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
    277            UErrorCode *status) {
    278 
    279     if (status == NULL || U_FAILURE(*status)) {
    280         return 0;
    281     }
    282     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
    283         *status=U_ILLEGAL_ARGUMENT_ERROR;
    284         return 0;
    285     }
    286 
    287     //
    288     //  Check that the data header is for for break data.
    289     //    (Header contents are defined in genbrk.cpp)
    290     //
    291     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    292     if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
    293            pInfo->dataFormat[1]==0x72 &&
    294            pInfo->dataFormat[2]==0x6b &&
    295            pInfo->dataFormat[3]==0x20 &&
    296            pInfo->formatVersion[0]==3  )) {
    297         udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
    298                          pInfo->dataFormat[0], pInfo->dataFormat[1],
    299                          pInfo->dataFormat[2], pInfo->dataFormat[3],
    300                          pInfo->formatVersion[0]);
    301         *status=U_UNSUPPORTED_ERROR;
    302         return 0;
    303     }
    304 
    305     //
    306     // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
    307     //                         RBBIDataHeader).  This swap also conveniently gets us
    308     //                         the size of the ICU d.h., which lets us locate the start
    309     //                         of the RBBI specific data.
    310     //
    311     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    312 
    313 
    314     //
    315     // Get the RRBI Data Header, and check that it appears to be OK.
    316     //
    317     //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
    318     //           an int32_t with a value of 1.  Starting with ICU 3.4,
    319     //           RBBI's fDataFormat matches the dataFormat field from the
    320     //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
    321     //
    322     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
    323     RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
    324     if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
    325         rbbiDH->fFormatVersion[0] != 3 ||
    326         ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
    327     {
    328         udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
    329         *status=U_UNSUPPORTED_ERROR;
    330         return 0;
    331     }
    332 
    333     //
    334     // Prefight operation?  Just return the size
    335     //
    336     int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
    337     int32_t totalSize = headerSize + breakDataLength;
    338     if (length < 0) {
    339         return totalSize;
    340     }
    341 
    342     //
    343     // Check that length passed in is consistent with length from RBBI data header.
    344     //
    345     if (length < totalSize) {
    346         udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
    347                             breakDataLength);
    348         *status=U_INDEX_OUTOFBOUNDS_ERROR;
    349         return 0;
    350         }
    351 
    352 
    353     //
    354     // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
    355     //                 we need to reference the header to locate the data, and an
    356     //                 inplace swap of the header leaves it unusable.
    357     //
    358     uint8_t         *outBytes = (uint8_t *)outData + headerSize;
    359     RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
    360 
    361     int32_t   tableStartOffset;
    362     int32_t   tableLength;
    363 
    364     //
    365     // If not swapping in place, zero out the output buffer before starting.
    366     //    Individual tables and other data items within are aligned to 8 byte boundaries
    367     //    when originally created.  Any unused space between items needs to be zero.
    368     //
    369     if (inBytes != outBytes) {
    370         uprv_memset(outBytes, 0, breakDataLength);
    371     }
    372 
    373     //
    374     // Each state table begins with several 32 bit fields.  Calculate the size
    375     //   in bytes of these.
    376     //
    377     int32_t         topSize = offsetof(RBBIStateTable, fTableData);
    378 
    379     // Forward state table.
    380     tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
    381     tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
    382 
    383     if (tableLength > 0) {
    384         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    385                             outBytes+tableStartOffset, status);
    386         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    387                             outBytes+tableStartOffset+topSize, status);
    388     }
    389 
    390     // Reverse state table.  Same layout as forward table, above.
    391     tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
    392     tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
    393 
    394     if (tableLength > 0) {
    395         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    396                             outBytes+tableStartOffset, status);
    397         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    398                             outBytes+tableStartOffset+topSize, status);
    399     }
    400 
    401     // Safe Forward state table.  Same layout as forward table, above.
    402     tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
    403     tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
    404 
    405     if (tableLength > 0) {
    406         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    407                             outBytes+tableStartOffset, status);
    408         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    409                             outBytes+tableStartOffset+topSize, status);
    410     }
    411 
    412     // Safe Reverse state table.  Same layout as forward table, above.
    413     tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
    414     tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
    415 
    416     if (tableLength > 0) {
    417         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    418                             outBytes+tableStartOffset, status);
    419         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    420                             outBytes+tableStartOffset+topSize, status);
    421     }
    422 
    423     // Trie table for character categories
    424     utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
    425                             outBytes+ds->readUInt32(rbbiDH->fTrie), status);
    426 
    427     // Source Rules Text.  It's UChar data
    428     ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
    429                         outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
    430 
    431     // Table of rule status values.  It's all int_32 values
    432     ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
    433                         outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
    434 
    435     // And, last, the header.
    436     //   It is all int32_t values except for fFormataVersion, which is an array of four bytes.
    437     //   Swap the whole thing as int32_t, then re-swap the one field.
    438     //
    439     ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
    440     ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
    441 
    442     return totalSize;
    443 }
    444 
    445 
    446 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    447