Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * dictionarydata.cpp
      7 *
      8 * created on: 2012may31
      9 * created by: Markus W. Scherer & Maxime Serrano
     10 */
     11 
     12 #include "dictionarydata.h"
     13 #include "unicode/ucharstrie.h"
     14 #include "unicode/bytestrie.h"
     15 #include "unicode/udata.h"
     16 #include "cmemory.h"
     17 
     18 #if !UCONFIG_NO_BREAK_ITERATION
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 #ifndef CYGWINMSVC /* On Cygwin/MSVC, the error redefinition of symbols occurs.*/
     23 const int32_t DictionaryData::TRIE_TYPE_BYTES;
     24 const int32_t DictionaryData::TRIE_TYPE_UCHARS;
     25 #endif
     26 
     27 DictionaryMatcher::~DictionaryMatcher() {
     28 }
     29 
     30 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
     31     udata_close(file);
     32 }
     33 
     34 int32_t UCharsDictionaryMatcher::getType() const {
     35     return DictionaryData::TRIE_TYPE_UCHARS;
     36 }
     37 
     38 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
     39     UCharsTrie uct(characters);
     40     UChar32 c = utext_next32(text);
     41     if (c < 0) {
     42         return 0;
     43     }
     44     UStringTrieResult result = uct.first(c);
     45     int32_t numChars = 1;
     46     count = 0;
     47     for (;;) {
     48         if (USTRINGTRIE_HAS_VALUE(result)) {
     49             if (count < limit) {
     50                 if (values != NULL) {
     51                     values[count] = uct.getValue();
     52                 }
     53                 lengths[count++] = numChars;
     54             }
     55             if (result == USTRINGTRIE_FINAL_VALUE) {
     56                 break;
     57             }
     58         }
     59         else if (result == USTRINGTRIE_NO_MATCH) {
     60             break;
     61         }
     62 
     63         // TODO: why do we have a text limit if the UText knows its length?
     64         if (numChars >= maxLength) {
     65             break;
     66         }
     67 
     68         c = utext_next32(text);
     69         if (c < 0) {
     70             break;
     71         }
     72         ++numChars;
     73         result = uct.next(c);
     74     }
     75     return numChars;
     76 }
     77 
     78 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
     79     udata_close(file);
     80 }
     81 
     82 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
     83     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
     84         if (c == 0x200D) {
     85             return 0xFF;
     86         } else if (c == 0x200C) {
     87             return 0xFE;
     88         }
     89         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
     90         if (delta < 0 || 0xFD < delta) {
     91             return U_SENTINEL;
     92         }
     93         return (UChar32)delta;
     94     }
     95     return c;
     96 }
     97 
     98 int32_t BytesDictionaryMatcher::getType() const {
     99     return DictionaryData::TRIE_TYPE_BYTES;
    100 }
    101 
    102 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    103     BytesTrie bt(characters);
    104     UChar32 c = utext_next32(text);
    105     if (c < 0) {
    106         return 0;
    107     }
    108     UStringTrieResult result = bt.first(transform(c));
    109     int32_t numChars = 1;
    110     count = 0;
    111     for (;;) {
    112         if (USTRINGTRIE_HAS_VALUE(result)) {
    113             if (count < limit) {
    114                 if (values != NULL) {
    115                     values[count] = bt.getValue();
    116             }
    117                 lengths[count++] = numChars;
    118             }
    119             if (result == USTRINGTRIE_FINAL_VALUE) {
    120                 break;
    121             }
    122         }
    123         else if (result == USTRINGTRIE_NO_MATCH) {
    124             break;
    125         }
    126 
    127         // TODO: why do we have a text limit if the UText knows its length?
    128         if (numChars >= maxLength) {
    129             break;
    130         }
    131 
    132         c = utext_next32(text);
    133         if (c < 0) {
    134             break;
    135         }
    136         ++numChars;
    137         result = bt.next(transform(c));
    138     }
    139     return numChars;
    140 }
    141 
    142 
    143 U_NAMESPACE_END
    144 
    145 U_NAMESPACE_USE
    146 
    147 U_CAPI int32_t U_EXPORT2
    148 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
    149            void *outData, UErrorCode *pErrorCode) {
    150     const UDataInfo *pInfo;
    151     int32_t headerSize;
    152     const uint8_t *inBytes;
    153     uint8_t *outBytes;
    154     const int32_t *inIndexes;
    155     int32_t indexes[DictionaryData::IX_COUNT];
    156     int32_t i, offset, size;
    157 
    158     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    159     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
    160     pInfo = (const UDataInfo *)((const char *)inData + 4);
    161     if (!(pInfo->dataFormat[0] == 0x44 &&
    162           pInfo->dataFormat[1] == 0x69 &&
    163           pInfo->dataFormat[2] == 0x63 &&
    164           pInfo->dataFormat[3] == 0x74 &&
    165           pInfo->formatVersion[0] == 1)) {
    166         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
    167                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
    168         *pErrorCode = U_UNSUPPORTED_ERROR;
    169         return 0;
    170     }
    171 
    172     inBytes = (const uint8_t *)inData + headerSize;
    173     outBytes = (uint8_t *)outData + headerSize;
    174 
    175     inIndexes = (const int32_t *)inBytes;
    176     if (length >= 0) {
    177         length -= headerSize;
    178         if (length < (int32_t)(sizeof(indexes))) {
    179             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
    180             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    181             return 0;
    182         }
    183     }
    184 
    185     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
    186         indexes[i] = udata_readInt32(ds, inIndexes[i]);
    187     }
    188 
    189     size = indexes[DictionaryData::IX_TOTAL_SIZE];
    190 
    191     if (length >= 0) {
    192         if (length < size) {
    193             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
    194             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    195             return 0;
    196         }
    197 
    198         if (inBytes != outBytes) {
    199             uprv_memcpy(outBytes, inBytes, size);
    200         }
    201 
    202         offset = 0;
    203         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
    204         offset = (int32_t)sizeof(indexes);
    205         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    206         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
    207 
    208         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    209             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
    210         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    211             // nothing to do
    212         } else {
    213             udata_printError(ds, "udict_swap(): unknown trie type!\n");
    214             *pErrorCode = U_UNSUPPORTED_ERROR;
    215             return 0;
    216         }
    217 
    218         // these next two sections are empty in the current format,
    219         // but may be used later.
    220         offset = nextOffset;
    221         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
    222         offset = nextOffset;
    223         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
    224         offset = nextOffset;
    225     }
    226     return headerSize + size;
    227 }
    228 #endif
    229