Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
* dictionarydata.cpp
      7 *
      8 * created on: 2012may31
      9 * created by: Markus W. Scherer & Maxime Serrano
     10 */
     11 
     12 #include "dictionarydata.h"
     13 #include "unicode/ucharstrie.h"
     14 #include "unicode/bytestrie.h"
     15 #include "unicode/udata.h"
     16 #include "cmemory.h"
     17 
     18 #if !UCONFIG_NO_BREAK_ITERATION
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
     23 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
     24 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
     25 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
     26 
     27 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
     28 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
     29 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
     30 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
     31 
     32 DictionaryMatcher::~DictionaryMatcher() {
     33 }
     34 
     35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
     36     udata_close(file);
     37 }
     38 
     39 int32_t UCharsDictionaryMatcher::getType() const {
     40     return DictionaryData::TRIE_TYPE_UCHARS;
     41 }
     42 
     43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
     44                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
     45                             int32_t *prefix) const {
     46 
     47     UCharsTrie uct(characters);
     48     int32_t startingTextIndex = utext_getNativeIndex(text);
     49     int32_t wordCount = 0;
     50     int32_t codePointsMatched = 0;
     51 
     52     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
     53         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
     54         int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
     55         codePointsMatched += 1;
     56         if (USTRINGTRIE_HAS_VALUE(result)) {
     57             if (wordCount < limit) {
     58                 if (values != NULL) {
     59                     values[wordCount] = uct.getValue();
     60                 }
     61                 if (lengths != NULL) {
     62                     lengths[wordCount] = lengthMatched;
     63                 }
     64                 if (cpLengths != NULL) {
     65                     cpLengths[wordCount] = codePointsMatched;
     66                 }
     67                 ++wordCount;
     68             }
     69             if (result == USTRINGTRIE_FINAL_VALUE) {
     70                 break;
     71             }
     72         }
     73         else if (result == USTRINGTRIE_NO_MATCH) {
     74             break;
     75         }
     76         if (lengthMatched >= maxLength) {
     77             break;
     78         }
     79     }
     80 
     81     if (prefix != NULL) {
     82         *prefix = codePointsMatched;
     83     }
     84     return wordCount;
     85 }
     86 
     87 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
     88     udata_close(file);
     89 }
     90 
     91 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
     92     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
     93         if (c == 0x200D) {
     94             return 0xFF;
     95         } else if (c == 0x200C) {
     96             return 0xFE;
     97         }
     98         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
     99         if (delta < 0 || 0xFD < delta) {
    100             return U_SENTINEL;
    101         }
    102         return (UChar32)delta;
    103     }
    104     return c;
    105 }
    106 
    107 int32_t BytesDictionaryMatcher::getType() const {
    108     return DictionaryData::TRIE_TYPE_BYTES;
    109 }
    110 
    111 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
    112                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
    113                             int32_t *prefix) const {
    114     BytesTrie bt(characters);
    115     int32_t startingTextIndex = utext_getNativeIndex(text);
    116     int32_t wordCount = 0;
    117     int32_t codePointsMatched = 0;
    118 
    119     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
    120         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
    121         int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
    122         codePointsMatched += 1;
    123         if (USTRINGTRIE_HAS_VALUE(result)) {
    124             if (wordCount < limit) {
    125                 if (values != NULL) {
    126                     values[wordCount] = bt.getValue();
    127                 }
    128                 if (lengths != NULL) {
    129                     lengths[wordCount] = lengthMatched;
    130                 }
    131                 if (cpLengths != NULL) {
    132                     cpLengths[wordCount] = codePointsMatched;
    133                 }
    134                 ++wordCount;
    135             }
    136             if (result == USTRINGTRIE_FINAL_VALUE) {
    137                 break;
    138             }
    139         }
    140         else if (result == USTRINGTRIE_NO_MATCH) {
    141             break;
    142         }
    143         if (lengthMatched >= maxLength) {
    144             break;
    145         }
    146     }
    147 
    148     if (prefix != NULL) {
    149         *prefix = codePointsMatched;
    150     }
    151     return wordCount;
    152 }
    153 
    154 
    155 U_NAMESPACE_END
    156 
    157 U_NAMESPACE_USE
    158 
    159 U_CAPI int32_t U_EXPORT2
    160 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
    161            void *outData, UErrorCode *pErrorCode) {
    162     const UDataInfo *pInfo;
    163     int32_t headerSize;
    164     const uint8_t *inBytes;
    165     uint8_t *outBytes;
    166     const int32_t *inIndexes;
    167     int32_t indexes[DictionaryData::IX_COUNT];
    168     int32_t i, offset, size;
    169 
    170     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    171     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
    172     pInfo = (const UDataInfo *)((const char *)inData + 4);
    173     if (!(pInfo->dataFormat[0] == 0x44 &&
    174           pInfo->dataFormat[1] == 0x69 &&
    175           pInfo->dataFormat[2] == 0x63 &&
    176           pInfo->dataFormat[3] == 0x74 &&
    177           pInfo->formatVersion[0] == 1)) {
    178         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
    179                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
    180         *pErrorCode = U_UNSUPPORTED_ERROR;
    181         return 0;
    182     }
    183 
    184     inBytes = (const uint8_t *)inData + headerSize;
    185     outBytes = (uint8_t *)outData + headerSize;
    186 
    187     inIndexes = (const int32_t *)inBytes;
    188     if (length >= 0) {
    189         length -= headerSize;
    190         if (length < (int32_t)(sizeof(indexes))) {
    191             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
    192             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    193             return 0;
    194         }
    195     }
    196 
    197     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
    198         indexes[i] = udata_readInt32(ds, inIndexes[i]);
    199     }
    200 
    201     size = indexes[DictionaryData::IX_TOTAL_SIZE];
    202 
    203     if (length >= 0) {
    204         if (length < size) {
    205             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
    206             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    207             return 0;
    208         }
    209 
    210         if (inBytes != outBytes) {
    211             uprv_memcpy(outBytes, inBytes, size);
    212         }
    213 
    214         offset = 0;
    215         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
    216         offset = (int32_t)sizeof(indexes);
    217         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    218         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
    219 
    220         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    221             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
    222         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    223             // nothing to do
    224         } else {
    225             udata_printError(ds, "udict_swap(): unknown trie type!\n");
    226             *pErrorCode = U_UNSUPPORTED_ERROR;
    227             return 0;
    228         }
    229 
    230         // these next two sections are empty in the current format,
    231         // but may be used later.
    232         offset = nextOffset;
    233         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
    234         offset = nextOffset;
    235         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
    236         offset = nextOffset;
    237     }
    238     return headerSize + size;
    239 }
    240 #endif
    241