Home | History | Annotate | Download | only in common
      1 /*
      2 ***************************************************************************
      3 *   Copyright (C) 1999-2014 International Business Machines Corporation   *
      4 *   and others. All rights reserved.                                      *
      5 ***************************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_BREAK_ITERATION
     11 
     12 #include "unicode/utypes.h"
     13 #include "rbbidata.h"
     14 #include "rbbirb.h"
     15 #include "utrie.h"
     16 #include "udatamem.h"
     17 #include "cmemory.h"
     18 #include "cstring.h"
     19 #include "umutex.h"
     20 
     21 #include "uassert.h"
     22 
     23 
     24 //-----------------------------------------------------------------------------------
     25 //
     26 //   Trie access folding function.  Copied as-is from properties code in uchar.c
     27 //
     28 //-----------------------------------------------------------------------------------
     29 U_CDECL_BEGIN
     30 static int32_t U_CALLCONV
     31 getFoldingOffset(uint32_t data) {
     32     /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
     33     if(data&0x8000) {
     34         return (int32_t)(data&0x7fff);
     35     } else {
     36         return 0;
     37     }
     38 }
     39 U_CDECL_END
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 //-----------------------------------------------------------------------------
     44 //
     45 //    Constructors.
     46 //
     47 //-----------------------------------------------------------------------------
     48 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
     49     init0();
     50     init(data, status);
     51 }
     52 
     53 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
     54     init0();
     55     init(data, status);
     56     fDontFreeData = TRUE;
     57 }
     58 
     59 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
     60     init0();
     61     if (U_FAILURE(status)) {
     62         return;
     63     }
     64     const DataHeader *dh = udm->pHeader;
     65     int32_t headerSize = dh->dataHeader.headerSize;
     66     if (  !(headerSize >= 20 &&
     67             dh->info.isBigEndian == U_IS_BIG_ENDIAN &&
     68             dh->info.charsetFamily == U_CHARSET_FAMILY &&
     69             dh->info.dataFormat[0] == 0x42 &&  // dataFormat="Brk "
     70             dh->info.dataFormat[1] == 0x72 &&
     71             dh->info.dataFormat[2] == 0x6b &&
     72             dh->info.dataFormat[3] == 0x20)
     73             // Note: info.fFormatVersion is duplicated in the RBBIDataHeader, and is
     74             //       validated when checking that.
     75         ) {
     76         status = U_INVALID_FORMAT_ERROR;
     77         return;
     78     }
     79     const char *dataAsBytes = reinterpret_cast<const char *>(dh);
     80     const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize);
     81     init(rbbidh, status);
     82     fUDataMem = udm;
     83 }
     84 
     85 //-----------------------------------------------------------------------------
     86 //
     87 //    init().   Does most of the work of construction, shared between the
     88 //              constructors.
     89 //
     90 //-----------------------------------------------------------------------------
     91 void RBBIDataWrapper::init0() {
     92     fHeader = NULL;
     93     fForwardTable = NULL;
     94     fReverseTable = NULL;
     95     fSafeFwdTable = NULL;
     96     fSafeRevTable = NULL;
     97     fRuleSource = NULL;
     98     fRuleStatusTable = NULL;
     99     fUDataMem = NULL;
    100     fRefCount = 0;
    101     fDontFreeData = TRUE;
    102 }
    103 
    104 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
    105     if (U_FAILURE(status)) {
    106         return;
    107     }
    108     fHeader = data;
    109     if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
    110     {
    111         status = U_INVALID_FORMAT_ERROR;
    112         return;
    113     }
    114     // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
    115     //       that is no longer supported.  At that time fFormatVersion was
    116     //       an int32_t field, rather than an array of 4 bytes.
    117 
    118     fDontFreeData = FALSE;
    119     if (data->fFTableLen != 0) {
    120         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
    121     }
    122     if (data->fRTableLen != 0) {
    123         fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
    124     }
    125     if (data->fSFTableLen != 0) {
    126         fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
    127     }
    128     if (data->fSRTableLen != 0) {
    129         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
    130     }
    131 
    132 
    133     utrie_unserialize(&fTrie,
    134                        (uint8_t *)data + fHeader->fTrie,
    135                        fHeader->fTrieLen,
    136                        &status);
    137     if (U_FAILURE(status)) {
    138         return;
    139     }
    140     fTrie.getFoldingOffset=getFoldingOffset;
    141 
    142 
    143     fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
    144     fRuleString.setTo(TRUE, fRuleSource, -1);
    145     U_ASSERT(data->fRuleSourceLen > 0);
    146 
    147     fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
    148     fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
    149 
    150     fRefCount = 1;
    151 
    152 #ifdef RBBI_DEBUG
    153     char *debugEnv = getenv("U_RBBIDEBUG");
    154     if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
    155 #endif
    156 }
    157 
    158 
    159 //-----------------------------------------------------------------------------
    160 //
    161 //    Destructor.     Don't call this - use removeReference() instead.
    162 //
    163 //-----------------------------------------------------------------------------
    164 RBBIDataWrapper::~RBBIDataWrapper() {
    165     U_ASSERT(fRefCount == 0);
    166     if (fUDataMem) {
    167         udata_close(fUDataMem);
    168     } else if (!fDontFreeData) {
    169         uprv_free((void *)fHeader);
    170     }
    171 }
    172 
    173 
    174 
    175 //-----------------------------------------------------------------------------
    176 //
    177 //   Operator ==    Consider two RBBIDataWrappers to be equal if they
    178 //                  refer to the same underlying data.  Although
    179 //                  the data wrappers are normally shared between
    180 //                  iterator instances, it's possible to independently
    181 //                  open the same data twice, and get two instances, which
    182 //                  should still be ==.
    183 //
    184 //-----------------------------------------------------------------------------
    185 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
    186     if (fHeader == other.fHeader) {
    187         return TRUE;
    188     }
    189     if (fHeader->fLength != other.fHeader->fLength) {
    190         return FALSE;
    191     }
    192     if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
    193         return TRUE;
    194     }
    195     return FALSE;
    196 }
    197 
    198 int32_t  RBBIDataWrapper::hashCode() {
    199     return fHeader->fFTableLen;
    200 }
    201 
    202 
    203 
    204 //-----------------------------------------------------------------------------
    205 //
    206 //    Reference Counting.   A single RBBIDataWrapper object is shared among
    207 //                          however many RulesBasedBreakIterator instances are
    208 //                          referencing the same data.
    209 //
    210 //-----------------------------------------------------------------------------
    211 void RBBIDataWrapper::removeReference() {
    212     if (umtx_atomic_dec(&fRefCount) == 0) {
    213         delete this;
    214     }
    215 }
    216 
    217 
    218 RBBIDataWrapper *RBBIDataWrapper::addReference() {
    219    umtx_atomic_inc(&fRefCount);
    220    return this;
    221 }
    222 
    223 
    224 
    225 //-----------------------------------------------------------------------------
    226 //
    227 //  getRuleSourceString
    228 //
    229 //-----------------------------------------------------------------------------
    230 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
    231     return fRuleString;
    232 }
    233 
    234 
    235 //-----------------------------------------------------------------------------
    236 //
    237 //  print   -  debugging function to dump the runtime data tables.
    238 //
    239 //-----------------------------------------------------------------------------
    240 #ifdef RBBI_DEBUG
    241 void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    242     uint32_t   c;
    243     uint32_t   s;
    244 
    245     RBBIDebugPrintf("   %s\n", heading);
    246 
    247     RBBIDebugPrintf("State |  Acc  LA TagIx");
    248     for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
    249     RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
    250         RBBIDebugPrintf("----");
    251     }
    252     RBBIDebugPrintf("\n");
    253 
    254     if (table == NULL) {
    255         RBBIDebugPrintf("         N U L L   T A B L E\n\n");
    256         return;
    257     }
    258     for (s=0; s<table->fNumStates; s++) {
    259         RBBIStateTableRow *row = (RBBIStateTableRow *)
    260                                   (table->fTableData + (table->fRowLen * s));
    261         RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
    262         for (c=0; c<fHeader->fCatCount; c++)  {
    263             RBBIDebugPrintf("%3d ", row->fNextState[c]);
    264         }
    265         RBBIDebugPrintf("\n");
    266     }
    267     RBBIDebugPrintf("\n");
    268 }
    269 #endif
    270 
    271 
    272 #ifdef RBBI_DEBUG
    273 void  RBBIDataWrapper::printData() {
    274     RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
    275     RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
    276                                                     fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
    277     RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
    278     RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
    279 
    280     printTable("Forward State Transition Table", fForwardTable);
    281     printTable("Reverse State Transition Table", fReverseTable);
    282     printTable("Safe Forward State Transition Table", fSafeFwdTable);
    283     printTable("Safe Reverse State Transition Table", fSafeRevTable);
    284 
    285     RBBIDebugPrintf("\nOrignal Rules source:\n");
    286     for (int32_t c=0; fRuleSource[c] != 0; c++) {
    287         RBBIDebugPrintf("%c", fRuleSource[c]);
    288     }
    289     RBBIDebugPrintf("\n\n");
    290 }
    291 #endif
    292 
    293 
    294 U_NAMESPACE_END
    295 U_NAMESPACE_USE
    296 
    297 //-----------------------------------------------------------------------------
    298 //
    299 //  ubrk_swap   -  byte swap and char encoding swap of RBBI data
    300 //
    301 //-----------------------------------------------------------------------------
    302 
    303 U_CAPI int32_t U_EXPORT2
    304 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
    305            UErrorCode *status) {
    306 
    307     if (status == NULL || U_FAILURE(*status)) {
    308         return 0;
    309     }
    310     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
    311         *status=U_ILLEGAL_ARGUMENT_ERROR;
    312         return 0;
    313     }
    314 
    315     //
    316     //  Check that the data header is for for break data.
    317     //    (Header contents are defined in genbrk.cpp)
    318     //
    319     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    320     if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
    321            pInfo->dataFormat[1]==0x72 &&
    322            pInfo->dataFormat[2]==0x6b &&
    323            pInfo->dataFormat[3]==0x20 &&
    324            pInfo->formatVersion[0]==3  )) {
    325         udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
    326                          pInfo->dataFormat[0], pInfo->dataFormat[1],
    327                          pInfo->dataFormat[2], pInfo->dataFormat[3],
    328                          pInfo->formatVersion[0]);
    329         *status=U_UNSUPPORTED_ERROR;
    330         return 0;
    331     }
    332 
    333     //
    334     // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
    335     //                         RBBIDataHeader).  This swap also conveniently gets us
    336     //                         the size of the ICU d.h., which lets us locate the start
    337     //                         of the RBBI specific data.
    338     //
    339     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    340 
    341 
    342     //
    343     // Get the RRBI Data Header, and check that it appears to be OK.
    344     //
    345     //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
    346     //           an int32_t with a value of 1.  Starting with ICU 3.4,
    347     //           RBBI's fDataFormat matches the dataFormat field from the
    348     //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
    349     //
    350     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
    351     RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
    352     if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
    353         rbbiDH->fFormatVersion[0] != 3 ||
    354         ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
    355     {
    356         udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
    357         *status=U_UNSUPPORTED_ERROR;
    358         return 0;
    359     }
    360 
    361     //
    362     // Prefight operation?  Just return the size
    363     //
    364     int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
    365     int32_t totalSize = headerSize + breakDataLength;
    366     if (length < 0) {
    367         return totalSize;
    368     }
    369 
    370     //
    371     // Check that length passed in is consistent with length from RBBI data header.
    372     //
    373     if (length < totalSize) {
    374         udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
    375                             breakDataLength);
    376         *status=U_INDEX_OUTOFBOUNDS_ERROR;
    377         return 0;
    378         }
    379 
    380 
    381     //
    382     // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
    383     //                 we need to reference the header to locate the data, and an
    384     //                 inplace swap of the header leaves it unusable.
    385     //
    386     uint8_t         *outBytes = (uint8_t *)outData + headerSize;
    387     RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
    388 
    389     int32_t   tableStartOffset;
    390     int32_t   tableLength;
    391 
    392     //
    393     // If not swapping in place, zero out the output buffer before starting.
    394     //    Individual tables and other data items within are aligned to 8 byte boundaries
    395     //    when originally created.  Any unused space between items needs to be zero.
    396     //
    397     if (inBytes != outBytes) {
    398         uprv_memset(outBytes, 0, breakDataLength);
    399     }
    400 
    401     //
    402     // Each state table begins with several 32 bit fields.  Calculate the size
    403     //   in bytes of these.
    404     //
    405     int32_t         topSize = offsetof(RBBIStateTable, fTableData);
    406 
    407     // Forward state table.
    408     tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
    409     tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
    410 
    411     if (tableLength > 0) {
    412         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    413                             outBytes+tableStartOffset, status);
    414         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    415                             outBytes+tableStartOffset+topSize, status);
    416     }
    417 
    418     // Reverse state table.  Same layout as forward table, above.
    419     tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
    420     tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
    421 
    422     if (tableLength > 0) {
    423         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    424                             outBytes+tableStartOffset, status);
    425         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    426                             outBytes+tableStartOffset+topSize, status);
    427     }
    428 
    429     // Safe Forward state table.  Same layout as forward table, above.
    430     tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
    431     tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
    432 
    433     if (tableLength > 0) {
    434         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    435                             outBytes+tableStartOffset, status);
    436         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    437                             outBytes+tableStartOffset+topSize, status);
    438     }
    439 
    440     // Safe Reverse state table.  Same layout as forward table, above.
    441     tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
    442     tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
    443 
    444     if (tableLength > 0) {
    445         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
    446                             outBytes+tableStartOffset, status);
    447         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
    448                             outBytes+tableStartOffset+topSize, status);
    449     }
    450 
    451     // Trie table for character categories
    452     utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
    453                             outBytes+ds->readUInt32(rbbiDH->fTrie), status);
    454 
    455     // Source Rules Text.  It's UChar data
    456     ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
    457                         outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
    458 
    459     // Table of rule status values.  It's all int_32 values
    460     ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
    461                         outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
    462 
    463     // And, last, the header.
    464     //   It is all int32_t values except for fFormataVersion, which is an array of four bytes.
    465     //   Swap the whole thing as int32_t, then re-swap the one field.
    466     //
    467     ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
    468     ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
    469 
    470     return totalSize;
    471 }
    472 
    473 
    474 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    475