1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ************************************************************************************ 5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * 6 * All Rights Reserved. * 7 ************************************************************************************ 8 */ 9 10 #ifndef BRKENG_H 11 #define BRKENG_H 12 13 #include "unicode/utypes.h" 14 #include "unicode/uobject.h" 15 #include "unicode/utext.h" 16 #include "unicode/uscript.h" 17 18 U_NAMESPACE_BEGIN 19 20 class UnicodeSet; 21 class UStack; 22 class DictionaryMatcher; 23 24 /******************************************************************* 25 * LanguageBreakEngine 26 */ 27 28 /** 29 * <p>LanguageBreakEngines implement language-specific knowledge for 30 * finding text boundaries within a run of characters belonging to a 31 * specific set. The boundaries will be of a specific kind, e.g. word, 32 * line, etc.</p> 33 * 34 * <p>LanguageBreakEngines should normally be implemented so as to 35 * be shared between threads without locking.</p> 36 */ 37 class LanguageBreakEngine : public UMemory { 38 public: 39 40 /** 41 * <p>Default constructor.</p> 42 * 43 */ 44 LanguageBreakEngine(); 45 46 /** 47 * <p>Virtual destructor.</p> 48 */ 49 virtual ~LanguageBreakEngine(); 50 51 /** 52 * <p>Indicate whether this engine handles a particular character for 53 * a particular kind of break.</p> 54 * 55 * @param c A character which begins a run that the engine might handle 56 * @param breakType The type of text break which the caller wants to determine 57 * @return TRUE if this engine handles the particular character and break 58 * type. 59 */ 60 virtual UBool handles(UChar32 c, int32_t breakType) const = 0; 61 62 /** 63 * <p>Find any breaks within a run in the supplied text.</p> 64 * 65 * @param text A UText representing the text. The 66 * iterator is left at the end of the run of characters which the engine 67 * is capable of handling. 68 * @param startPos The start of the run within the supplied text. 69 * @param endPos The end of the run within the supplied text. 70 * @param reverse Whether the caller is looking for breaks in a reverse 71 * direction. 72 * @param breakType The type of break desired, or -1. 73 * @param foundBreaks An allocated C array of the breaks found, if any 74 * @return The number of breaks found. 75 */ 76 virtual int32_t findBreaks( UText *text, 77 int32_t startPos, 78 int32_t endPos, 79 UBool reverse, 80 int32_t breakType, 81 UStack &foundBreaks ) const = 0; 82 83 }; 84 85 /******************************************************************* 86 * LanguageBreakFactory 87 */ 88 89 /** 90 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine 91 * that can determine breaks for characters in a specific set, if 92 * such an object can be found.</p> 93 * 94 * <p>If a LanguageBreakFactory is to be shared between threads, 95 * appropriate synchronization must be used; there is none internal 96 * to the factory.</p> 97 * 98 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can 99 * normally be shared between threads without synchronization, unless 100 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> 101 * 102 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine 103 * it returns when it itself is deleted, unless the specific subclass of 104 * LanguageBreakFactory indicates otherwise. Naturally, the factory should 105 * not be deleted until the LanguageBreakEngines it has returned are no 106 * longer needed.</p> 107 */ 108 class LanguageBreakFactory : public UMemory { 109 public: 110 111 /** 112 * <p>Default constructor.</p> 113 * 114 */ 115 LanguageBreakFactory(); 116 117 /** 118 * <p>Virtual destructor.</p> 119 */ 120 virtual ~LanguageBreakFactory(); 121 122 /** 123 * <p>Find and return a LanguageBreakEngine that can find the desired 124 * kind of break for the set of characters to which the supplied 125 * character belongs. It is up to the set of available engines to 126 * determine what the sets of characters are.</p> 127 * 128 * @param c A character that begins a run for which a LanguageBreakEngine is 129 * sought. 130 * @param breakType The kind of text break for which a LanguageBreakEngine is 131 * sought. 132 * @return A LanguageBreakEngine with the desired characteristics, or 0. 133 */ 134 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; 135 136 }; 137 138 /******************************************************************* 139 * UnhandledEngine 140 */ 141 142 /** 143 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that 144 * handles characters that no other LanguageBreakEngine is available to 145 * handle. It is told the character and the type of break; at its 146 * discretion it may handle more than the specified character (e.g., 147 * the entire script to which that character belongs.</p> 148 * 149 * <p>UnhandledEngines may not be shared between threads without 150 * external synchronization.</p> 151 */ 152 153 class UnhandledEngine : public LanguageBreakEngine { 154 private: 155 156 /** 157 * The sets of characters handled, for each break type 158 * @internal 159 */ 160 161 UnicodeSet *fHandled[4]; 162 163 public: 164 165 /** 166 * <p>Default constructor.</p> 167 * 168 */ 169 UnhandledEngine(UErrorCode &status); 170 171 /** 172 * <p>Virtual destructor.</p> 173 */ 174 virtual ~UnhandledEngine(); 175 176 /** 177 * <p>Indicate whether this engine handles a particular character for 178 * a particular kind of break.</p> 179 * 180 * @param c A character which begins a run that the engine might handle 181 * @param breakType The type of text break which the caller wants to determine 182 * @return TRUE if this engine handles the particular character and break 183 * type. 184 */ 185 virtual UBool handles(UChar32 c, int32_t breakType) const; 186 187 /** 188 * <p>Find any breaks within a run in the supplied text.</p> 189 * 190 * @param text A UText representing the text (TODO: UText). The 191 * iterator is left at the end of the run of characters which the engine 192 * is capable of handling. 193 * @param startPos The start of the run within the supplied text. 194 * @param endPos The end of the run within the supplied text. 195 * @param reverse Whether the caller is looking for breaks in a reverse 196 * direction. 197 * @param breakType The type of break desired, or -1. 198 * @param foundBreaks An allocated C array of the breaks found, if any 199 * @return The number of breaks found. 200 */ 201 virtual int32_t findBreaks( UText *text, 202 int32_t startPos, 203 int32_t endPos, 204 UBool reverse, 205 int32_t breakType, 206 UStack &foundBreaks ) const; 207 208 /** 209 * <p>Tell the engine to handle a particular character and break type.</p> 210 * 211 * @param c A character which the engine should handle 212 * @param breakType The type of text break for which the engine should handle c 213 */ 214 virtual void handleCharacter(UChar32 c, int32_t breakType); 215 216 }; 217 218 /******************************************************************* 219 * ICULanguageBreakFactory 220 */ 221 222 /** 223 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for 224 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary 225 * data in the ICU data file.</p> 226 */ 227 class ICULanguageBreakFactory : public LanguageBreakFactory { 228 private: 229 230 /** 231 * The stack of break engines created by this factory 232 * @internal 233 */ 234 235 UStack *fEngines; 236 237 public: 238 239 /** 240 * <p>Standard constructor.</p> 241 * 242 */ 243 ICULanguageBreakFactory(UErrorCode &status); 244 245 /** 246 * <p>Virtual destructor.</p> 247 */ 248 virtual ~ICULanguageBreakFactory(); 249 250 /** 251 * <p>Find and return a LanguageBreakEngine that can find the desired 252 * kind of break for the set of characters to which the supplied 253 * character belongs. It is up to the set of available engines to 254 * determine what the sets of characters are.</p> 255 * 256 * @param c A character that begins a run for which a LanguageBreakEngine is 257 * sought. 258 * @param breakType The kind of text break for which a LanguageBreakEngine is 259 * sought. 260 * @return A LanguageBreakEngine with the desired characteristics, or 0. 261 */ 262 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); 263 264 protected: 265 /** 266 * <p>Create a LanguageBreakEngine for the set of characters to which 267 * the supplied character belongs, for the specified break type.</p> 268 * 269 * @param c A character that begins a run for which a LanguageBreakEngine is 270 * sought. 271 * @param breakType The kind of text break for which a LanguageBreakEngine is 272 * sought. 273 * @return A LanguageBreakEngine with the desired characteristics, or 0. 274 */ 275 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); 276 277 /** 278 * <p>Create a DictionaryMatcher for the specified script and break type.</p> 279 * @param script An ISO 15924 script code that identifies the dictionary to be 280 * created. 281 * @param breakType The kind of text break for which a dictionary is 282 * sought. 283 * @return A DictionaryMatcher with the desired characteristics, or NULL. 284 */ 285 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); 286 }; 287 288 U_NAMESPACE_END 289 290 /* BRKENG_H */ 291 #endif 292