1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * dictionarydata.h 9 * 10 * created on: 2012may31 11 * created by: Markus W. Scherer & Maxime Serrano 12 */ 13 14 #include "dictionarydata.h" 15 #include "unicode/ucharstrie.h" 16 #include "unicode/bytestrie.h" 17 #include "unicode/udata.h" 18 #include "cmemory.h" 19 20 #if !UCONFIG_NO_BREAK_ITERATION 21 22 U_NAMESPACE_BEGIN 23 24 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; 25 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; 26 const int32_t DictionaryData::TRIE_TYPE_MASK = 7; 27 const int32_t DictionaryData::TRIE_HAS_VALUES = 8; 28 29 const int32_t DictionaryData::TRANSFORM_NONE = 0; 30 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; 31 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; 32 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; 33 34 DictionaryMatcher::~DictionaryMatcher() { 35 } 36 37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { 38 udata_close(file); 39 } 40 41 int32_t UCharsDictionaryMatcher::getType() const { 42 return DictionaryData::TRIE_TYPE_UCHARS; 43 } 44 45 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, 46 int32_t *lengths, int32_t *cpLengths, int32_t *values, 47 int32_t *prefix) const { 48 49 UCharsTrie uct(characters); 50 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); 51 int32_t wordCount = 0; 52 int32_t codePointsMatched = 0; 53 54 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { 55 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); 56 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; 57 codePointsMatched += 1; 58 if (USTRINGTRIE_HAS_VALUE(result)) { 59 if (wordCount < limit) { 60 if (values != NULL) { 61 values[wordCount] = uct.getValue(); 62 } 63 if (lengths != NULL) { 64 lengths[wordCount] = lengthMatched; 65 } 66 if (cpLengths != NULL) { 67 cpLengths[wordCount] = codePointsMatched; 68 } 69 ++wordCount; 70 } 71 if (result == USTRINGTRIE_FINAL_VALUE) { 72 break; 73 } 74 } 75 else if (result == USTRINGTRIE_NO_MATCH) { 76 break; 77 } 78 if (lengthMatched >= maxLength) { 79 break; 80 } 81 } 82 83 if (prefix != NULL) { 84 *prefix = codePointsMatched; 85 } 86 return wordCount; 87 } 88 89 BytesDictionaryMatcher::~BytesDictionaryMatcher() { 90 udata_close(file); 91 } 92 93 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { 94 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { 95 if (c == 0x200D) { 96 return 0xFF; 97 } else if (c == 0x200C) { 98 return 0xFE; 99 } 100 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); 101 if (delta < 0 || 0xFD < delta) { 102 return U_SENTINEL; 103 } 104 return (UChar32)delta; 105 } 106 return c; 107 } 108 109 int32_t BytesDictionaryMatcher::getType() const { 110 return DictionaryData::TRIE_TYPE_BYTES; 111 } 112 113 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, 114 int32_t *lengths, int32_t *cpLengths, int32_t *values, 115 int32_t *prefix) const { 116 BytesTrie bt(characters); 117 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); 118 int32_t wordCount = 0; 119 int32_t codePointsMatched = 0; 120 121 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { 122 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); 123 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; 124 codePointsMatched += 1; 125 if (USTRINGTRIE_HAS_VALUE(result)) { 126 if (wordCount < limit) { 127 if (values != NULL) { 128 values[wordCount] = bt.getValue(); 129 } 130 if (lengths != NULL) { 131 lengths[wordCount] = lengthMatched; 132 } 133 if (cpLengths != NULL) { 134 cpLengths[wordCount] = codePointsMatched; 135 } 136 ++wordCount; 137 } 138 if (result == USTRINGTRIE_FINAL_VALUE) { 139 break; 140 } 141 } 142 else if (result == USTRINGTRIE_NO_MATCH) { 143 break; 144 } 145 if (lengthMatched >= maxLength) { 146 break; 147 } 148 } 149 150 if (prefix != NULL) { 151 *prefix = codePointsMatched; 152 } 153 return wordCount; 154 } 155 156 157 U_NAMESPACE_END 158 159 U_NAMESPACE_USE 160 161 U_CAPI int32_t U_EXPORT2 162 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, 163 void *outData, UErrorCode *pErrorCode) { 164 const UDataInfo *pInfo; 165 int32_t headerSize; 166 const uint8_t *inBytes; 167 uint8_t *outBytes; 168 const int32_t *inIndexes; 169 int32_t indexes[DictionaryData::IX_COUNT]; 170 int32_t i, offset, size; 171 172 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 173 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; 174 pInfo = (const UDataInfo *)((const char *)inData + 4); 175 if (!(pInfo->dataFormat[0] == 0x44 && 176 pInfo->dataFormat[1] == 0x69 && 177 pInfo->dataFormat[2] == 0x63 && 178 pInfo->dataFormat[3] == 0x74 && 179 pInfo->formatVersion[0] == 1)) { 180 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", 181 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); 182 *pErrorCode = U_UNSUPPORTED_ERROR; 183 return 0; 184 } 185 186 inBytes = (const uint8_t *)inData + headerSize; 187 outBytes = (uint8_t *)outData + headerSize; 188 189 inIndexes = (const int32_t *)inBytes; 190 if (length >= 0) { 191 length -= headerSize; 192 if (length < (int32_t)(sizeof(indexes))) { 193 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); 194 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 195 return 0; 196 } 197 } 198 199 for (i = 0; i < DictionaryData::IX_COUNT; i++) { 200 indexes[i] = udata_readInt32(ds, inIndexes[i]); 201 } 202 203 size = indexes[DictionaryData::IX_TOTAL_SIZE]; 204 205 if (length >= 0) { 206 if (length < size) { 207 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); 208 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 209 return 0; 210 } 211 212 if (inBytes != outBytes) { 213 uprv_memcpy(outBytes, inBytes, size); 214 } 215 216 offset = 0; 217 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); 218 offset = (int32_t)sizeof(indexes); 219 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 220 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; 221 222 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 223 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); 224 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 225 // nothing to do 226 } else { 227 udata_printError(ds, "udict_swap(): unknown trie type!\n"); 228 *pErrorCode = U_UNSUPPORTED_ERROR; 229 return 0; 230 } 231 232 // these next two sections are empty in the current format, 233 // but may be used later. 234 offset = nextOffset; 235 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; 236 offset = nextOffset; 237 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; 238 offset = nextOffset; 239 } 240 return headerSize + size; 241 } 242 #endif 243