1 /** 2 ********************************************************************************** 3 * Copyright (C) 2006,2007, International Business Machines Corporation and others. 4 * All Rights Reserved. 5 ********************************************************************************** 6 */ 7 8 #ifndef DICTBE_H 9 #define DICTBE_H 10 11 #include "unicode/utypes.h" 12 #include "unicode/uniset.h" 13 #include "unicode/utext.h" 14 15 #include "brkeng.h" 16 17 U_NAMESPACE_BEGIN 18 19 class TrieWordDictionary; 20 21 /******************************************************************* 22 * DictionaryBreakEngine 23 */ 24 25 /** 26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 27 * dictionary to determine language-specific breaks.</p> 28 * 29 * <p>After it is constructed a DictionaryBreakEngine may be shared between 30 * threads without synchronization.</p> 31 */ 32 class DictionaryBreakEngine : public LanguageBreakEngine { 33 private: 34 /** 35 * The set of characters handled by this engine 36 * @internal 37 */ 38 39 UnicodeSet fSet; 40 41 /** 42 * The set of break types handled by this engine 43 * @internal 44 */ 45 46 uint32_t fTypes; 47 48 /** 49 * <p>Default constructor.</p> 50 * 51 */ 52 DictionaryBreakEngine(); 53 54 public: 55 56 /** 57 * <p>Constructor setting the break types handled.</p> 58 * 59 * @param breakTypes A bitmap of types handled by the engine. 60 */ 61 DictionaryBreakEngine( uint32_t breakTypes ); 62 63 /** 64 * <p>Virtual destructor.</p> 65 */ 66 virtual ~DictionaryBreakEngine(); 67 68 /** 69 * <p>Indicate whether this engine handles a particular character for 70 * a particular kind of break.</p> 71 * 72 * @param c A character which begins a run that the engine might handle 73 * @param breakType The type of text break which the caller wants to determine 74 * @return TRUE if this engine handles the particular character and break 75 * type. 76 */ 77 virtual UBool handles( UChar32 c, int32_t breakType ) const; 78 79 /** 80 * <p>Find any breaks within a run in the supplied text.</p> 81 * 82 * @param text A UText representing the text. The iterator is left at 83 * the end of the run of characters which the engine is capable of handling 84 * that starts from the first (or last) character in the range. 85 * @param startPos The start of the run within the supplied text. 86 * @param endPos The end of the run within the supplied text. 87 * @param reverse Whether the caller is looking for breaks in a reverse 88 * direction. 89 * @param breakType The type of break desired, or -1. 90 * @param foundBreaks An allocated C array of the breaks found, if any 91 * @return The number of breaks found. 92 */ 93 virtual int32_t findBreaks( UText *text, 94 int32_t startPos, 95 int32_t endPos, 96 UBool reverse, 97 int32_t breakType, 98 UStack &foundBreaks ) const; 99 100 protected: 101 102 /** 103 * <p>Set the character set handled by this engine.</p> 104 * 105 * @param set A UnicodeSet of the set of characters handled by the engine 106 */ 107 virtual void setCharacters( const UnicodeSet &set ); 108 109 /** 110 * <p>Set the break types handled by this engine.</p> 111 * 112 * @param breakTypes A bitmap of types handled by the engine. 113 */ 114 // virtual void setBreakTypes( uint32_t breakTypes ); 115 116 /** 117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 118 * 119 * @param text A UText representing the text 120 * @param rangeStart The start of the range of dictionary characters 121 * @param rangeEnd The end of the range of dictionary characters 122 * @param foundBreaks Output of C array of int32_t break positions, or 0 123 * @return The number of breaks found 124 */ 125 virtual int32_t divideUpDictionaryRange( UText *text, 126 int32_t rangeStart, 127 int32_t rangeEnd, 128 UStack &foundBreaks ) const = 0; 129 130 }; 131 132 /******************************************************************* 133 * ThaiBreakEngine 134 */ 135 136 /** 137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 138 * TrieWordDictionary and heuristics to determine Thai-specific breaks.</p> 139 * 140 * <p>After it is constructed a ThaiBreakEngine may be shared between 141 * threads without synchronization.</p> 142 */ 143 class ThaiBreakEngine : public DictionaryBreakEngine { 144 private: 145 /** 146 * The set of characters handled by this engine 147 * @internal 148 */ 149 150 UnicodeSet fThaiWordSet; 151 UnicodeSet fEndWordSet; 152 UnicodeSet fBeginWordSet; 153 UnicodeSet fSuffixSet; 154 UnicodeSet fMarkSet; 155 const TrieWordDictionary *fDictionary; 156 157 public: 158 159 /** 160 * <p>Default constructor.</p> 161 * 162 * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the 163 * engine is deleted. 164 */ 165 ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status); 166 167 /** 168 * <p>Virtual destructor.</p> 169 */ 170 virtual ~ThaiBreakEngine(); 171 172 protected: 173 /** 174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 175 * 176 * @param text A UText representing the text 177 * @param rangeStart The start of the range of dictionary characters 178 * @param rangeEnd The end of the range of dictionary characters 179 * @param foundBreaks Output of C array of int32_t break positions, or 0 180 * @return The number of breaks found 181 */ 182 virtual int32_t divideUpDictionaryRange( UText *text, 183 int32_t rangeStart, 184 int32_t rangeEnd, 185 UStack &foundBreaks ) const; 186 187 }; 188 189 /******************************************************************* 190 * CjkBreakEngine 191 */ 192 193 //indicates language/script that the CjkBreakEngine will handle 194 enum LanguageType { 195 kKorean, 196 kChineseJapanese 197 }; 198 199 /** 200 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 201 * TrieWordDictionary with costs associated with each word and 202 * Viterbi decoding to determine CJK-specific breaks.</p> 203 */ 204 class CjkBreakEngine : public DictionaryBreakEngine { 205 protected: 206 /** 207 * The set of characters handled by this engine 208 * @internal 209 */ 210 UnicodeSet fHangulWordSet; 211 UnicodeSet fHanWordSet; 212 UnicodeSet fKatakanaWordSet; 213 UnicodeSet fHiraganaWordSet; 214 215 const TrieWordDictionary *fDictionary; 216 217 public: 218 219 /** 220 * <p>Default constructor.</p> 221 * 222 * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the 223 * engine is deleted. The TrieWordDictionary must contain costs for each word 224 * in order for the dictionary to work properly. 225 */ 226 CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status); 227 228 /** 229 * <p>Virtual destructor.</p> 230 */ 231 virtual ~CjkBreakEngine(); 232 233 protected: 234 /** 235 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 236 * 237 * @param text A UText representing the text 238 * @param rangeStart The start of the range of dictionary characters 239 * @param rangeEnd The end of the range of dictionary characters 240 * @param foundBreaks Output of C array of int32_t break positions, or 0 241 * @return The number of breaks found 242 */ 243 virtual int32_t divideUpDictionaryRange( UText *text, 244 int32_t rangeStart, 245 int32_t rangeEnd, 246 UStack &foundBreaks ) const; 247 248 }; 249 250 U_NAMESPACE_END 251 252 /* DICTBE_H */ 253 #endif 254