1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006,2012, International Business Machines Corporation * 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8 #ifndef DICTBE_H 9 #define DICTBE_H 10 11 #include "unicode/utypes.h" 12 #include "unicode/uniset.h" 13 #include "unicode/utext.h" 14 15 #include "brkeng.h" 16 17 U_NAMESPACE_BEGIN 18 19 class DictionaryMatcher; 20 21 /******************************************************************* 22 * DictionaryBreakEngine 23 */ 24 25 /** 26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 27 * dictionary to determine language-specific breaks.</p> 28 * 29 * <p>After it is constructed a DictionaryBreakEngine may be shared between 30 * threads without synchronization.</p> 31 */ 32 class DictionaryBreakEngine : public LanguageBreakEngine { 33 private: 34 /** 35 * The set of characters handled by this engine 36 * @internal 37 */ 38 39 UnicodeSet fSet; 40 41 /** 42 * The set of break types handled by this engine 43 * @internal 44 */ 45 46 uint32_t fTypes; 47 48 /** 49 * <p>Default constructor.</p> 50 * 51 */ 52 DictionaryBreakEngine(); 53 54 public: 55 56 /** 57 * <p>Constructor setting the break types handled.</p> 58 * 59 * @param breakTypes A bitmap of types handled by the engine. 60 */ 61 DictionaryBreakEngine( uint32_t breakTypes ); 62 63 /** 64 * <p>Virtual destructor.</p> 65 */ 66 virtual ~DictionaryBreakEngine(); 67 68 /** 69 * <p>Indicate whether this engine handles a particular character for 70 * a particular kind of break.</p> 71 * 72 * @param c A character which begins a run that the engine might handle 73 * @param breakType The type of text break which the caller wants to determine 74 * @return TRUE if this engine handles the particular character and break 75 * type. 76 */ 77 virtual UBool handles( UChar32 c, int32_t breakType ) const; 78 79 /** 80 * <p>Find any breaks within a run in the supplied text.</p> 81 * 82 * @param text A UText representing the text. The iterator is left at 83 * the end of the run of characters which the engine is capable of handling 84 * that starts from the first (or last) character in the range. 85 * @param startPos The start of the run within the supplied text. 86 * @param endPos The end of the run within the supplied text. 87 * @param reverse Whether the caller is looking for breaks in a reverse 88 * direction. 89 * @param breakType The type of break desired, or -1. 90 * @param foundBreaks An allocated C array of the breaks found, if any 91 * @return The number of breaks found. 92 */ 93 virtual int32_t findBreaks( UText *text, 94 int32_t startPos, 95 int32_t endPos, 96 UBool reverse, 97 int32_t breakType, 98 UStack &foundBreaks ) const; 99 100 protected: 101 102 /** 103 * <p>Set the character set handled by this engine.</p> 104 * 105 * @param set A UnicodeSet of the set of characters handled by the engine 106 */ 107 virtual void setCharacters( const UnicodeSet &set ); 108 109 /** 110 * <p>Set the break types handled by this engine.</p> 111 * 112 * @param breakTypes A bitmap of types handled by the engine. 113 */ 114 // virtual void setBreakTypes( uint32_t breakTypes ); 115 116 /** 117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 118 * 119 * @param text A UText representing the text 120 * @param rangeStart The start of the range of dictionary characters 121 * @param rangeEnd The end of the range of dictionary characters 122 * @param foundBreaks Output of C array of int32_t break positions, or 0 123 * @return The number of breaks found 124 */ 125 virtual int32_t divideUpDictionaryRange( UText *text, 126 int32_t rangeStart, 127 int32_t rangeEnd, 128 UStack &foundBreaks ) const = 0; 129 130 }; 131 132 /******************************************************************* 133 * ThaiBreakEngine 134 */ 135 136 /** 137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 138 * dictionary and heuristics to determine Thai-specific breaks.</p> 139 * 140 * <p>After it is constructed a ThaiBreakEngine may be shared between 141 * threads without synchronization.</p> 142 */ 143 class ThaiBreakEngine : public DictionaryBreakEngine { 144 private: 145 /** 146 * The set of characters handled by this engine 147 * @internal 148 */ 149 150 UnicodeSet fThaiWordSet; 151 UnicodeSet fEndWordSet; 152 UnicodeSet fBeginWordSet; 153 UnicodeSet fSuffixSet; 154 UnicodeSet fMarkSet; 155 DictionaryMatcher *fDictionary; 156 157 public: 158 159 /** 160 * <p>Default constructor.</p> 161 * 162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 163 * engine is deleted. 164 */ 165 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 166 167 /** 168 * <p>Virtual destructor.</p> 169 */ 170 virtual ~ThaiBreakEngine(); 171 172 protected: 173 /** 174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 175 * 176 * @param text A UText representing the text 177 * @param rangeStart The start of the range of dictionary characters 178 * @param rangeEnd The end of the range of dictionary characters 179 * @param foundBreaks Output of C array of int32_t break positions, or 0 180 * @return The number of breaks found 181 */ 182 virtual int32_t divideUpDictionaryRange( UText *text, 183 int32_t rangeStart, 184 int32_t rangeEnd, 185 UStack &foundBreaks ) const; 186 187 }; 188 189 #if !UCONFIG_NO_NORMALIZATION 190 191 /******************************************************************* 192 * CjkBreakEngine 193 */ 194 195 //indicates language/script that the CjkBreakEngine will handle 196 enum LanguageType { 197 kKorean, 198 kChineseJapanese 199 }; 200 201 /** 202 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 203 * dictionary with costs associated with each word and 204 * Viterbi decoding to determine CJK-specific breaks.</p> 205 */ 206 class CjkBreakEngine : public DictionaryBreakEngine { 207 protected: 208 /** 209 * The set of characters handled by this engine 210 * @internal 211 */ 212 UnicodeSet fHangulWordSet; 213 UnicodeSet fHanWordSet; 214 UnicodeSet fKatakanaWordSet; 215 UnicodeSet fHiraganaWordSet; 216 217 DictionaryMatcher *fDictionary; 218 219 public: 220 221 /** 222 * <p>Default constructor.</p> 223 * 224 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 225 * engine is deleted. The DictionaryMatcher must contain costs for each word 226 * in order for the dictionary to work properly. 227 */ 228 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 229 230 /** 231 * <p>Virtual destructor.</p> 232 */ 233 virtual ~CjkBreakEngine(); 234 235 protected: 236 /** 237 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 238 * 239 * @param text A UText representing the text 240 * @param rangeStart The start of the range of dictionary characters 241 * @param rangeEnd The end of the range of dictionary characters 242 * @param foundBreaks Output of C array of int32_t break positions, or 0 243 * @return The number of breaks found 244 */ 245 virtual int32_t divideUpDictionaryRange( UText *text, 246 int32_t rangeStart, 247 int32_t rangeEnd, 248 UStack &foundBreaks ) const; 249 250 }; 251 252 #endif 253 254 /******************************************************************* 255 * KhmerBreakEngine 256 */ 257 258 /** 259 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 260 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 261 * 262 * <p>After it is constructed a KhmerBreakEngine may be shared between 263 * threads without synchronization.</p> 264 */ 265 class KhmerBreakEngine : public DictionaryBreakEngine { 266 private: 267 /** 268 * The set of characters handled by this engine 269 * @internal 270 */ 271 272 UnicodeSet fKhmerWordSet; 273 UnicodeSet fEndWordSet; 274 UnicodeSet fBeginWordSet; 275 UnicodeSet fMarkSet; 276 DictionaryMatcher *fDictionary; 277 278 public: 279 280 /** 281 * <p>Default constructor.</p> 282 * 283 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 284 * engine is deleted. 285 */ 286 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 287 288 /** 289 * <p>Virtual destructor.</p> 290 */ 291 virtual ~KhmerBreakEngine(); 292 293 protected: 294 /** 295 * <p>Divide up a range of known dictionary characters.</p> 296 * 297 * @param text A UText representing the text 298 * @param rangeStart The start of the range of dictionary characters 299 * @param rangeEnd The end of the range of dictionary characters 300 * @param foundBreaks Output of C array of int32_t break positions, or 0 301 * @return The number of breaks found 302 */ 303 virtual int32_t divideUpDictionaryRange( UText *text, 304 int32_t rangeStart, 305 int32_t rangeEnd, 306 UStack &foundBreaks ) const; 307 308 }; 309 310 311 U_NAMESPACE_END 312 313 /* DICTBE_H */ 314 #endif 315