Home | History | Annotate | Download | only in common
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2014-2016, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * dictionarydata.cpp
      9 *
     10 * created on: 2012may31
     11 * created by: Markus W. Scherer & Maxime Serrano
     12 */
     13 
     14 #include "dictionarydata.h"
     15 #include "unicode/ucharstrie.h"
     16 #include "unicode/bytestrie.h"
     17 #include "unicode/udata.h"
     18 #include "cmemory.h"
     19 
     20 #if !UCONFIG_NO_BREAK_ITERATION
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
     25 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
     26 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
     27 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
     28 
     29 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
     30 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
     31 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
     32 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
     33 
     34 DictionaryMatcher::~DictionaryMatcher() {
     35 }
     36 
     37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
     38     udata_close(file);
     39 }
     40 
     41 int32_t UCharsDictionaryMatcher::getType() const {
     42     return DictionaryData::TRIE_TYPE_UCHARS;
     43 }
     44 
     45 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
     46                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
     47                             int32_t *prefix) const {
     48 
     49     UCharsTrie uct(characters);
     50     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
     51     int32_t wordCount = 0;
     52     int32_t codePointsMatched = 0;
     53 
     54     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
     55         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
     56         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
     57         codePointsMatched += 1;
     58         if (USTRINGTRIE_HAS_VALUE(result)) {
     59             if (wordCount < limit) {
     60                 if (values != NULL) {
     61                     values[wordCount] = uct.getValue();
     62                 }
     63                 if (lengths != NULL) {
     64                     lengths[wordCount] = lengthMatched;
     65                 }
     66                 if (cpLengths != NULL) {
     67                     cpLengths[wordCount] = codePointsMatched;
     68                 }
     69                 ++wordCount;
     70             }
     71             if (result == USTRINGTRIE_FINAL_VALUE) {
     72                 break;
     73             }
     74         }
     75         else if (result == USTRINGTRIE_NO_MATCH) {
     76             break;
     77         }
     78         if (lengthMatched >= maxLength) {
     79             break;
     80         }
     81     }
     82 
     83     if (prefix != NULL) {
     84         *prefix = codePointsMatched;
     85     }
     86     return wordCount;
     87 }
     88 
     89 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
     90     udata_close(file);
     91 }
     92 
     93 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
     94     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
     95         if (c == 0x200D) {
     96             return 0xFF;
     97         } else if (c == 0x200C) {
     98             return 0xFE;
     99         }
    100         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    101         if (delta < 0 || 0xFD < delta) {
    102             return U_SENTINEL;
    103         }
    104         return (UChar32)delta;
    105     }
    106     return c;
    107 }
    108 
    109 int32_t BytesDictionaryMatcher::getType() const {
    110     return DictionaryData::TRIE_TYPE_BYTES;
    111 }
    112 
    113 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
    114                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
    115                             int32_t *prefix) const {
    116     BytesTrie bt(characters);
    117     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
    118     int32_t wordCount = 0;
    119     int32_t codePointsMatched = 0;
    120 
    121     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
    122         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
    123         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
    124         codePointsMatched += 1;
    125         if (USTRINGTRIE_HAS_VALUE(result)) {
    126             if (wordCount < limit) {
    127                 if (values != NULL) {
    128                     values[wordCount] = bt.getValue();
    129                 }
    130                 if (lengths != NULL) {
    131                     lengths[wordCount] = lengthMatched;
    132                 }
    133                 if (cpLengths != NULL) {
    134                     cpLengths[wordCount] = codePointsMatched;
    135                 }
    136                 ++wordCount;
    137             }
    138             if (result == USTRINGTRIE_FINAL_VALUE) {
    139                 break;
    140             }
    141         }
    142         else if (result == USTRINGTRIE_NO_MATCH) {
    143             break;
    144         }
    145         if (lengthMatched >= maxLength) {
    146             break;
    147         }
    148     }
    149 
    150     if (prefix != NULL) {
    151         *prefix = codePointsMatched;
    152     }
    153     return wordCount;
    154 }
    155 
    156 
    157 U_NAMESPACE_END
    158 
    159 U_NAMESPACE_USE
    160 
    161 U_CAPI int32_t U_EXPORT2
    162 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
    163            void *outData, UErrorCode *pErrorCode) {
    164     const UDataInfo *pInfo;
    165     int32_t headerSize;
    166     const uint8_t *inBytes;
    167     uint8_t *outBytes;
    168     const int32_t *inIndexes;
    169     int32_t indexes[DictionaryData::IX_COUNT];
    170     int32_t i, offset, size;
    171 
    172     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    173     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
    174     pInfo = (const UDataInfo *)((const char *)inData + 4);
    175     if (!(pInfo->dataFormat[0] == 0x44 &&
    176           pInfo->dataFormat[1] == 0x69 &&
    177           pInfo->dataFormat[2] == 0x63 &&
    178           pInfo->dataFormat[3] == 0x74 &&
    179           pInfo->formatVersion[0] == 1)) {
    180         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
    181                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
    182         *pErrorCode = U_UNSUPPORTED_ERROR;
    183         return 0;
    184     }
    185 
    186     inBytes = (const uint8_t *)inData + headerSize;
    187     outBytes = (uint8_t *)outData + headerSize;
    188 
    189     inIndexes = (const int32_t *)inBytes;
    190     if (length >= 0) {
    191         length -= headerSize;
    192         if (length < (int32_t)(sizeof(indexes))) {
    193             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
    194             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    195             return 0;
    196         }
    197     }
    198 
    199     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
    200         indexes[i] = udata_readInt32(ds, inIndexes[i]);
    201     }
    202 
    203     size = indexes[DictionaryData::IX_TOTAL_SIZE];
    204 
    205     if (length >= 0) {
    206         if (length < size) {
    207             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
    208             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    209             return 0;
    210         }
    211 
    212         if (inBytes != outBytes) {
    213             uprv_memcpy(outBytes, inBytes, size);
    214         }
    215 
    216         offset = 0;
    217         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
    218         offset = (int32_t)sizeof(indexes);
    219         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    220         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
    221 
    222         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    223             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
    224         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    225             // nothing to do
    226         } else {
    227             udata_printError(ds, "udict_swap(): unknown trie type!\n");
    228             *pErrorCode = U_UNSUPPORTED_ERROR;
    229             return 0;
    230         }
    231 
    232         // these next two sections are empty in the current format,
    233         // but may be used later.
    234         offset = nextOffset;
    235         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
    236         offset = nextOffset;
    237         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
    238         offset = nextOffset;
    239     }
    240     return headerSize + size;
    241 }
    242 #endif
    243