/*
*******************************************************************************
*
*   Copyright (C) 2009-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/

#ifndef __NORMALIZER2_H__
#define __NORMALIZER2_H__

/**
 * \file
 * \brief C++ API: New API for Unicode Normalization.
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm2.h"

U_NAMESPACE_BEGIN

/**
 * Unicode normalization functionality for standard Unicode normalization or
 * for using custom mapping tables.
 * All instances of this class are unmodifiable/immutable.
 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
 *
 * The primary functions are to produce a normalized string and to detect whether
 * a string is already normalized.
 * The most commonly used normalization forms are those defined in
 * http://www.unicode.org/unicode/reports/tr15/
 * However, this API supports additional normalization forms for specialized purposes.
 * For example, NFKC_Casefold is provided via
 * getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
 * and can be used in implementations of UTS #46.
 *
 * Not only are the standard compose and decompose modes supplied,
 * but additional modes are provided as documented in the UNormalization2Mode enum.
 *
 * Some of the functions in this class identify normalization boundaries.
 * At a normalization boundary, the portions of the string
 * before it and starting from it do not interact and can be handled independently.
 *
 * spanQuickCheckYes() stops at a normalization boundary.
 * When the goal is a normalized string, then the text before the boundary
 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
 *
 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
 * a character is guaranteed to be at a normalization boundary,
 * regardless of context.
 * This is used for moving from one normalization boundary to the next
 * or preceding boundary, and for performing iterative normalization.
 *
 * Iterative normalization is useful when only a small portion of a
 * longer string needs to be processed.
 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
 * (to process only the substring for which sort key bytes are computed).
 *
 * The set of normalization boundaries returned by these functions may not be
 * complete: There may be more boundaries that could be returned.
 * Different functions may return different boundaries.
 * @draft ICU 4.4
 */
class U_COMMON_API Normalizer2 : public UObject {
public:
    /**
     * Returns a Normalizer2 instance which uses the specified data file
     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
     * and which composes or decomposes text according to the specified mode.
     * Returns an unmodifiable singleton instance. Do not delete it.
     *
     * Use packageName=NULL for data files that are part of ICU's own data.
     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
     *
     * @param packageName NULL for ICU built-in data, otherwise application data package name
     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
     * @param mode normalization mode (compose or decompose etc.)
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @draft ICU 4.4
     */
    static const Normalizer2 *
    getInstance(const char *packageName,
                const char *name,
                UNormalization2Mode mode,
                UErrorCode &errorCode);

    /**
     * Returns the normalized form of the source string.
     * Convenience overload: normalizes into a local string via
     * normalize(src, dest, errorCode) and returns that string by value.
     * @param src source string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return normalized src
     * @draft ICU 4.4
     */
    UnicodeString
    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
        UnicodeString result;
        // Delegate to the virtual three-argument overload implemented by subclasses.
        normalize(src, result, errorCode);
        return result;
    }
    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string.
     * The source and destination strings must be different objects.
     * @param src source string
     * @param dest destination string; its contents are replaced with normalized src
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @draft ICU 4.4
     */
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const = 0;
    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if the first string was normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, will be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @draft ICU 4.4
     */
    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const = 0;
    /**
     * Appends the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if both the strings were normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, should be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @draft ICU 4.4
     */
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const = 0;

    /**
     * Tests if the string is normalized.
     * Internally, in cases where the quickCheck() method would return "maybe"
     * (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @draft ICU 4.4
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Tests if the string is normalized.
     * For the two COMPOSE modes, the result could be "maybe" in cases that
     * would take a little more work to resolve definitively.
     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
     * combination of quick check + normalization, to avoid
     * re-checking the "yes" prefix.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return UNormalizationCheckResult
     * @draft ICU 4.4
     */
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Returns the end of the normalized substring of the input string.
     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
     * the substring <code>UnicodeString(s, 0, end)</code>
     * will pass the quick check with a "yes" result.
     *
     * The returned end index is usually one or more characters before the
     * "no" or "maybe" character: The end index is at a normalization boundary.
     * (See the class documentation for more about normalization boundaries.)
     *
     * When the goal is a normalized string and most input strings are expected
     * to be normalized already, then call this method,
     * and if it returns a prefix shorter than the input string,
     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return "yes" span end index
     * @draft ICU 4.4
     */
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * preceding characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and starting from this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * @param c character to test
     * @return TRUE if c has a normalization boundary before it
     * @draft ICU 4.4
     */
    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;

    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions up to this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * Note that this operation may be significantly slower than hasBoundaryBefore().
     * @param c character to test
     * @return TRUE if c has a normalization boundary after it
     * @draft ICU 4.4
     */
    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;

    /**
     * Tests if the character is normalization-inert.
     * If true, then the character does not change, nor normalization-interact with
     * preceding or following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * Note that this operation may be significantly slower than hasBoundaryBefore().
     * @param c character to test
     * @return TRUE if c is normalization-inert
     * @draft ICU 4.4
     */
    virtual UBool isInert(UChar32 c) const = 0;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     * @return a UClassID for this class.
     * @draft ICU 4.4
     */
    static UClassID U_EXPORT2 getStaticClassID();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     * @return a UClassID for the actual class.
     * @draft ICU 4.4
     */
    virtual UClassID getDynamicClassID() const = 0;
};

/**
 * Normalization filtered by a UnicodeSet.
 * Normalizes portions of the text contained in the filter set and leaves
 * portions not contained in the filter set unchanged.
 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
 * This class implements all of (and only) the Normalizer2 API.
 * An instance of this class is unmodifiable/immutable but is constructed and
 * must be destructed by the owner.
306 * @draft ICU 4.4 307 */ 308 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 309 public: 310 /** 311 * Constructs a filtered normalizer wrapping any Normalizer2 instance 312 * and a filter set. 313 * Both are aliased and must not be modified or deleted while this object 314 * is used. 315 * The filter set should be frozen; otherwise the performance will suffer greatly. 316 * @param n2 wrapped Normalizer2 instance 317 * @param filterSet UnicodeSet which determines the characters to be normalized 318 * @draft ICU 4.4 319 */ 320 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 321 norm2(n2), set(filterSet) {} 322 323 /** 324 * Writes the normalized form of the source string to the destination string 325 * (replacing its contents) and returns the destination string. 326 * The source and destination strings must be different objects. 327 * @param src source string 328 * @param dest destination string; its contents is replaced with normalized src 329 * @param errorCode Standard ICU error code. Its input value must 330 * pass the U_SUCCESS() test, or else the function returns 331 * immediately. Check for U_FAILURE() on output or use with 332 * function chaining. (See User Guide for details.) 333 * @return dest 334 * @draft ICU 4.4 335 */ 336 virtual UnicodeString & 337 normalize(const UnicodeString &src, 338 UnicodeString &dest, 339 UErrorCode &errorCode) const; 340 /** 341 * Appends the normalized form of the second string to the first string 342 * (merging them at the boundary) and returns the first string. 343 * The result is normalized if the first string was normalized. 344 * The first and second strings must be different objects. 345 * @param first string, should be normalized 346 * @param second string, will be normalized 347 * @param errorCode Standard ICU error code. Its input value must 348 * pass the U_SUCCESS() test, or else the function returns 349 * immediately. 
Check for U_FAILURE() on output or use with 350 * function chaining. (See User Guide for details.) 351 * @return first 352 * @draft ICU 4.4 353 */ 354 virtual UnicodeString & 355 normalizeSecondAndAppend(UnicodeString &first, 356 const UnicodeString &second, 357 UErrorCode &errorCode) const; 358 /** 359 * Appends the second string to the first string 360 * (merging them at the boundary) and returns the first string. 361 * The result is normalized if both the strings were normalized. 362 * The first and second strings must be different objects. 363 * @param first string, should be normalized 364 * @param second string, should be normalized 365 * @param errorCode Standard ICU error code. Its input value must 366 * pass the U_SUCCESS() test, or else the function returns 367 * immediately. Check for U_FAILURE() on output or use with 368 * function chaining. (See User Guide for details.) 369 * @return first 370 * @draft ICU 4.4 371 */ 372 virtual UnicodeString & 373 append(UnicodeString &first, 374 const UnicodeString &second, 375 UErrorCode &errorCode) const; 376 377 /** 378 * Tests if the string is normalized. 379 * For details see the Normalizer2 base class documentation. 380 * @param s input string 381 * @param errorCode Standard ICU error code. Its input value must 382 * pass the U_SUCCESS() test, or else the function returns 383 * immediately. Check for U_FAILURE() on output or use with 384 * function chaining. (See User Guide for details.) 385 * @return TRUE if s is normalized 386 * @draft ICU 4.4 387 */ 388 virtual UBool 389 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const; 390 /** 391 * Tests if the string is normalized. 392 * For details see the Normalizer2 base class documentation. 393 * @param s input string 394 * @param errorCode Standard ICU error code. Its input value must 395 * pass the U_SUCCESS() test, or else the function returns 396 * immediately. Check for U_FAILURE() on output or use with 397 * function chaining. 
(See User Guide for details.) 398 * @return UNormalizationCheckResult 399 * @draft ICU 4.4 400 */ 401 virtual UNormalizationCheckResult 402 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const; 403 /** 404 * Returns the end of the normalized substring of the input string. 405 * For details see the Normalizer2 base class documentation. 406 * @param s input string 407 * @param errorCode Standard ICU error code. Its input value must 408 * pass the U_SUCCESS() test, or else the function returns 409 * immediately. Check for U_FAILURE() on output or use with 410 * function chaining. (See User Guide for details.) 411 * @return "yes" span end index 412 * @draft ICU 4.4 413 */ 414 virtual int32_t 415 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const; 416 417 /** 418 * Tests if the character always has a normalization boundary before it, 419 * regardless of context. 420 * For details see the Normalizer2 base class documentation. 421 * @param c character to test 422 * @return TRUE if c has a normalization boundary before it 423 * @draft ICU 4.4 424 */ 425 virtual UBool hasBoundaryBefore(UChar32 c) const; 426 427 /** 428 * Tests if the character always has a normalization boundary after it, 429 * regardless of context. 430 * For details see the Normalizer2 base class documentation. 431 * @param c character to test 432 * @return TRUE if c has a normalization boundary after it 433 * @draft ICU 4.4 434 */ 435 virtual UBool hasBoundaryAfter(UChar32 c) const; 436 437 /** 438 * Tests if the character is normalization-inert. 439 * For details see the Normalizer2 base class documentation. 440 * @param c character to test 441 * @return TRUE if c is normalization-inert 442 * @draft ICU 4.4 443 */ 444 virtual UBool isInert(UChar32 c) const; 445 446 /** 447 * ICU "poor man's RTTI", returns a UClassID for this class. 448 * @returns a UClassID for this class. 
449 * @draft ICU 4.4 450 */ 451 static UClassID U_EXPORT2 getStaticClassID(); 452 453 /** 454 * ICU "poor man's RTTI", returns a UClassID for the actual class. 455 * @return a UClassID for the actual class. 456 * @draft ICU 4.4 457 */ 458 virtual UClassID getDynamicClassID() const; 459 private: 460 UnicodeString & 461 normalize(const UnicodeString &src, 462 UnicodeString &dest, 463 USetSpanCondition spanCondition, 464 UErrorCode &errorCode) const; 465 466 UnicodeString & 467 normalizeSecondAndAppend(UnicodeString &first, 468 const UnicodeString &second, 469 UBool doNormalize, 470 UErrorCode &errorCode) const; 471 472 const Normalizer2 &norm2; 473 const UnicodeSet &set; 474 }; 475 476 U_NAMESPACE_END 477 478 #endif // !UCONFIG_NO_NORMALIZATION 479 #endif // __NORMALIZER2_H__ 480