Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2013, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * dictionarydata.cpp
      7 *
      8 * created on: 2012may31
      9 * created by: Markus W. Scherer & Maxime Serrano
     10 */
     11 
     12 #include "dictionarydata.h"
     13 #include "unicode/ucharstrie.h"
     14 #include "unicode/bytestrie.h"
     15 #include "unicode/udata.h"
     16 #include "cmemory.h"
     17 
     18 #if !UCONFIG_NO_BREAK_ITERATION
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
     23 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
     24 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
     25 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
     26 
     27 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
     28 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
     29 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
     30 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
     31 
     32 DictionaryMatcher::~DictionaryMatcher() {
     33 }
     34 
     35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
     36     udata_close(file);
     37 }
     38 
     39 int32_t UCharsDictionaryMatcher::getType() const {
     40     return DictionaryData::TRIE_TYPE_UCHARS;
     41 }
     42 
     43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
     44     UCharsTrie uct(characters);
     45     UChar32 c = utext_next32(text);
     46     if (c < 0) {
     47         return 0;
     48     }
     49     UStringTrieResult result = uct.first(c);
     50     int32_t numChars = 1;
     51     count = 0;
     52     for (;;) {
     53         if (USTRINGTRIE_HAS_VALUE(result)) {
     54             if (count < limit) {
     55                 if (values != NULL) {
     56                     values[count] = uct.getValue();
     57                 }
     58                 lengths[count++] = numChars;
     59             }
     60             if (result == USTRINGTRIE_FINAL_VALUE) {
     61                 break;
     62             }
     63         }
     64         else if (result == USTRINGTRIE_NO_MATCH) {
     65             break;
     66         }
     67 
     68         // TODO: why do we have a text limit if the UText knows its length?
     69         if (numChars >= maxLength) {
     70             break;
     71         }
     72 
     73         c = utext_next32(text);
     74         if (c < 0) {
     75             break;
     76         }
     77         ++numChars;
     78         result = uct.next(c);
     79     }
     80     return numChars;
     81 }
     82 
     83 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
     84     udata_close(file);
     85 }
     86 
     87 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
     88     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
     89         if (c == 0x200D) {
     90             return 0xFF;
     91         } else if (c == 0x200C) {
     92             return 0xFE;
     93         }
     94         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
     95         if (delta < 0 || 0xFD < delta) {
     96             return U_SENTINEL;
     97         }
     98         return (UChar32)delta;
     99     }
    100     return c;
    101 }
    102 
    103 int32_t BytesDictionaryMatcher::getType() const {
    104     return DictionaryData::TRIE_TYPE_BYTES;
    105 }
    106 
    107 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    108     BytesTrie bt(characters);
    109     UChar32 c = utext_next32(text);
    110     if (c < 0) {
    111         return 0;
    112     }
    113     UStringTrieResult result = bt.first(transform(c));
    114     int32_t numChars = 1;
    115     count = 0;
    116     for (;;) {
    117         if (USTRINGTRIE_HAS_VALUE(result)) {
    118             if (count < limit) {
    119                 if (values != NULL) {
    120                     values[count] = bt.getValue();
    121             }
    122                 lengths[count++] = numChars;
    123             }
    124             if (result == USTRINGTRIE_FINAL_VALUE) {
    125                 break;
    126             }
    127         }
    128         else if (result == USTRINGTRIE_NO_MATCH) {
    129             break;
    130         }
    131 
    132         // TODO: why do we have a text limit if the UText knows its length?
    133         if (numChars >= maxLength) {
    134             break;
    135         }
    136 
    137         c = utext_next32(text);
    138         if (c < 0) {
    139             break;
    140         }
    141         ++numChars;
    142         result = bt.next(transform(c));
    143     }
    144     return numChars;
    145 }
    146 
    147 
    148 U_NAMESPACE_END
    149 
    150 U_NAMESPACE_USE
    151 
    152 U_CAPI int32_t U_EXPORT2
    153 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
    154            void *outData, UErrorCode *pErrorCode) {
    155     const UDataInfo *pInfo;
    156     int32_t headerSize;
    157     const uint8_t *inBytes;
    158     uint8_t *outBytes;
    159     const int32_t *inIndexes;
    160     int32_t indexes[DictionaryData::IX_COUNT];
    161     int32_t i, offset, size;
    162 
    163     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    164     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
    165     pInfo = (const UDataInfo *)((const char *)inData + 4);
    166     if (!(pInfo->dataFormat[0] == 0x44 &&
    167           pInfo->dataFormat[1] == 0x69 &&
    168           pInfo->dataFormat[2] == 0x63 &&
    169           pInfo->dataFormat[3] == 0x74 &&
    170           pInfo->formatVersion[0] == 1)) {
    171         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
    172                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
    173         *pErrorCode = U_UNSUPPORTED_ERROR;
    174         return 0;
    175     }
    176 
    177     inBytes = (const uint8_t *)inData + headerSize;
    178     outBytes = (uint8_t *)outData + headerSize;
    179 
    180     inIndexes = (const int32_t *)inBytes;
    181     if (length >= 0) {
    182         length -= headerSize;
    183         if (length < (int32_t)(sizeof(indexes))) {
    184             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
    185             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    186             return 0;
    187         }
    188     }
    189 
    190     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
    191         indexes[i] = udata_readInt32(ds, inIndexes[i]);
    192     }
    193 
    194     size = indexes[DictionaryData::IX_TOTAL_SIZE];
    195 
    196     if (length >= 0) {
    197         if (length < size) {
    198             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
    199             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    200             return 0;
    201         }
    202 
    203         if (inBytes != outBytes) {
    204             uprv_memcpy(outBytes, inBytes, size);
    205         }
    206 
    207         offset = 0;
    208         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
    209         offset = (int32_t)sizeof(indexes);
    210         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    211         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
    212 
    213         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    214             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
    215         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    216             // nothing to do
    217         } else {
    218             udata_printError(ds, "udict_swap(): unknown trie type!\n");
    219             *pErrorCode = U_UNSUPPORTED_ERROR;
    220             return 0;
    221         }
    222 
    223         // these next two sections are empty in the current format,
    224         // but may be used later.
    225         offset = nextOffset;
    226         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
    227         offset = nextOffset;
    228         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
    229         offset = nextOffset;
    230     }
    231     return headerSize + size;
    232 }
    233 #endif
    234