/*
*******************************************************************************
*
*   Copyright (C) 2011 International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*/

#ifndef INDEXCHARS_H
#define INDEXCHARS_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/locid.h"

/**
 * \file
 * \brief C++ API: Index Characters
 */


U_CDECL_BEGIN

/**
 * Constants for Alphabetic Index Label Types.
 * The form of these enum constants anticipates having a plain C API
 * for Alphabetic Indexes that will also use them.
 * @draft ICU 4.8
 */
typedef enum UAlphabeticIndexLabelType {
    /**
     *  Normal Label, typically the starting letter of the names
     *  in the bucket with this label.
     * @draft ICU 4.8
     */
    U_ALPHAINDEX_NORMAL    = 0,

    /**
     * Underflow Label.  The bucket with this label contains names
     * in scripts that sort before any of the bucket labels in this index.
     * @draft ICU 4.8
     */
    U_ALPHAINDEX_UNDERFLOW = 1,

    /**
     * Inflow Label.  The bucket with this label contains names
     * in scripts that sort between two of the bucket labels in this index.
     * Inflow labels are created when an index contains normal labels for
     * multiple scripts, and skips other scripts that sort between some of the
     * included scripts.
     * @draft ICU 4.8
     */
    U_ALPHAINDEX_INFLOW    = 2,

    /**
     * Overflow Label.  The bucket with this label contains names in scripts
     * that sort after all of the bucket labels in this index.
     * @draft ICU 4.8
     */
    U_ALPHAINDEX_OVERFLOW  = 3
} UAlphabeticIndexLabelType;


// Forward declaration only; AlphabeticIndex holds a UHashtable pointer (alreadyIn_).
struct UHashtable;
U_CDECL_END

U_NAMESPACE_BEGIN

// Forward Declarations

class Collator;
class RuleBasedCollator;
class StringEnumeration;
class UnicodeSet;
class UVector;



/**
 * class AlphabeticIndex supports the creation of a UI index appropriate for a given language, such as:
 *
 * <pre>
 *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  \\u00C6 \\u00D8 \\u00C5 ...</b>
 *
 *  <b>A</b>
 *     Addison
 *     Albertson
 *     Azensky
 *  <b>B</b>
 *     Baker
 *  ...
 * </pre>
 *
 * The class can generate a list of labels for use as a UI "index", that is, a list of
 * clickable characters (or character sequences) that allow the user to see a segment
 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
 * the target list, where everything in the bucket is greater than or equal to the character
 * (according to the locale's collation). Strings can be added to the index;
 * they will be in sorted order in the right bucket.
 * <p>
 * The class also supports having buckets for strings before the first (underflow),
 * after the last (overflow), and between scripts (inflow). For example, if the index
 * is constructed with labels for Russian and English, Greek characters would fall
 * into an inflow bucket between the other two scripts.
 * <p>
 * The AlphabeticIndex class is not intended for public subclassing.
 * <p>
 * <i>Example</i>
 * <p>
 * The "show..." methods below are just to illustrate usage.
 *
 * <pre>
 * // Create a simple index.  "Item" is assumed to be an application
 * // defined type that the application's UI and other processing knows about,
 * // and that has a name.
 *
 * UErrorCode status = U_ZERO_ERROR;
 * AlphabeticIndex *index = new AlphabeticIndex(desiredLocale, status);
 * index->addLabels(additionalLocale, status);
 * for (Item *item in some source of Items ) {
 *     index->addRecord(item->name(), item, status);
 * }
 * ...
 * // Show index at top. We could skip or gray out empty buckets
 *
 * while (index->nextBucket(status)) {
 *     if (showAll || index->getBucketRecordCount() != 0) {
 *         showLabelAtTop(UI, index->getBucketLabel());
 *     }
 * }
 * ...
 * // Show the buckets with their contents, skipping empty buckets
 *
 * index->resetBucketIterator(status);
 * while (index->nextBucket(status)) {
 *     if (index->getBucketRecordCount() != 0) {
 *         showLabelInList(UI, index->getBucketLabel());
 *         while (index->nextRecord(status)) {
 *             showIndexedItem(UI, static_cast<const Item *>(index->getRecordData()));
 *         }
 *     }
 * }
 * </pre>
 *
 * The caller can build different UIs using this class.
 * For example, an index character could be omitted or grayed-out
 * if its bucket is empty. Small buckets could also be combined based on size, such as:
 *
 * <pre>
 * <b>... A-F G-N O-Z ...</b>
 * </pre>
 *
 * <p>
 * <b>Notes:</b>
 * <ul>
 * <li>Additional collation parameters can be passed in as part of the locale name.
 * For example, German plus numeric
 * sorting would be "de@kn-true".
 * </ul>
 *
 * @draft ICU 4.8 This API might change or be removed in a future release.
 */


class U_I18N_API AlphabeticIndex: public UObject {

  public:

    /**
     * Construct an AlphabeticIndex object for the specified locale.  If the locale's
     * data does not include index characters, a set of them will be
     * synthesized based on the locale's exemplar characters.  The locale
     * determines the sorting order for both the index characters and the
     * user item names appearing under each Index character.
     *
     * @param locale the desired locale.
     * @param status Error code, will be set with the reason if the construction
     *               of the AlphabeticIndex object fails.
     * @draft ICU 4.8
     */
    AlphabeticIndex(const Locale &locale, UErrorCode &status);



    /**
     * Add Labels to this Index.  The labels are additions to those
     * that are already in the index; they do not replace the existing
     * ones.
     * @param additions The additional characters to add to the index, such as A-Z.
     * @param status Error code, will be set with the reason if the
     *               operation fails.
     * @return this, for chaining
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &addLabels(const UnicodeSet &additions, UErrorCode &status);

    /**
     * Add the index characters from a Locale to the index.  The labels
     * are added to those that are already in the index; they do not replace the
     * existing index characters.  The collation order for this index is not
     * changed; it remains that of the locale that was originally specified
     * when creating this Index.
     *
     * @param locale The locale whose index characters are to be added.
     * @param status Error code, will be set with the reason if the
     *               operation fails.
     * @return this, for chaining
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &addLabels(const Locale &locale, UErrorCode &status);

    /**
     * Destructor
     * @draft ICU 4.8
     */
    virtual ~AlphabeticIndex();


    /**
     * Get the Collator that establishes the ordering of the items in this index.
     * Ownership of the collator remains with the AlphabeticIndex instance.
     *
     * The returned collator is a reference to the internal collator used by this
     * index.  It may be safely used to compare the names of items or to get
     * sort keys for names.  However if any settings need to be changed,
     * or other non-const methods called, a cloned copy must be made first.
     *
     * @return The collator
     * @draft ICU 4.8
     */
    virtual const RuleBasedCollator &getCollator() const;


    /**
     * Get the default label used for abbreviated buckets <i>between</i> other index characters.
     * For example, consider the labels when Latin and Greek are used:
     *     X Y Z ... &#x0391; &#x0392; &#x0393;.
     *
     * @return inflow label
     * @draft ICU 4.8
     */
    virtual const UnicodeString &getInflowLabel() const;

    /**
     * Set the default label used for abbreviated buckets <i>between</i> other index characters.
     * An inflow label will be automatically inserted if two otherwise-adjacent label characters
     * are from different scripts, e.g. Latin and Cyrillic, and a third script, e.g. Greek,
     * sorts between the two.  The default inflow character is an ellipsis (...)
     *
     * @param inflowLabel the new Inflow label.
     * @param status Error code, will be set with the reason if the operation fails.
     * @return this
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &setInflowLabel(const UnicodeString &inflowLabel, UErrorCode &status);



    /**
     * Get the special label used for items that sort after the last normal label,
     * and that would not otherwise have an appropriate label.
     *
     * @return the overflow label
     * @draft ICU 4.8
     */
    virtual const UnicodeString &getOverflowLabel() const;


    /**
     * Set the label used for items that sort after the last normal label,
     * and that would not otherwise have an appropriate label.
     *
     * @param overflowLabel the new overflow label.
     * @param status Error code, will be set with the reason if the operation fails.
     * @return this
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &setOverflowLabel(const UnicodeString &overflowLabel, UErrorCode &status);

    /**
     * Get the special label used for items that sort before the first normal label,
     * and that would not otherwise have an appropriate label.
     *
     * @return underflow label
     * @draft ICU 4.8
     */
    virtual const UnicodeString &getUnderflowLabel() const;

    /**
     * Set the label used for items that sort before the first normal label,
     * and that would not otherwise have an appropriate label.
     *
     * @param underflowLabel the new underflow label.
     * @param status Error code, will be set with the reason if the operation fails.
     * @return this
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &setUnderflowLabel(const UnicodeString &underflowLabel, UErrorCode &status);


    /**
     * Get the limit on the number of labels permitted in the index.
     * The number does not include over, under and inflow labels.
     *
     * @return maxLabelCount maximum number of labels.
     * @draft ICU 4.8
     */
    virtual int32_t getMaxLabelCount() const;

    /**
     * Set a limit on the number of labels permitted in the index.
     * The number does not include over, under and inflow labels.
     * Currently, if the number is exceeded, then every
     * nth item is removed to bring the count down.
     * A more sophisticated mechanism may be available in the future.
     *
     * @param maxLabelCount the maximum number of labels.
     * @param status error code
     * @return This, for chaining
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status);


    /**
     * Get the Unicode character (or tailored string) that defines an overflow bucket;
     * that is anything greater than or equal to that string should go in that bucket,
     * instead of with the last character. Normally that is the first character of the script
     * after lowerLimit. Thus in X Y Z ... <i>Devanagari-ka</i>, the overflow character for Z
     * would be the <i>Greek-alpha</i>.
     *
     * @param lowerLimit   The character below the overflow (or inflow) bucket
     * @param status error code
     * @return string that defines top of the overflow bucket for lowerLimit, or an empty string if there is none
     * @internal
     */
    virtual const UnicodeString &getOverflowComparisonString(const UnicodeString &lowerLimit,
                                                             UErrorCode &status);


    /**
     * Add a record to the index.  Each record will be associated with an index Bucket
     * based on the record's name.  The list of records for each bucket will be sorted
     * based on the collation ordering of the names in the index's locale.
     * Records with duplicate names are permitted; they will be kept in the order
     * that they were added.
     *
     * @param name The display name for the Record.  The Record will be placed in
     *             a bucket based on this name.
     * @param data An optional pointer to user data associated with this
     *             item.  When iterating the contents of a bucket, both the
     *             data pointer and the name will be available for each Record.
     * @param status  Error code, will be set with the reason if the operation fails.
     * @return        This, for chaining.
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &addRecord(const UnicodeString &name, const void *data, UErrorCode &status);

    /**
     * Remove all Records from the Index.  The set of Buckets, which define the headings under
     * which records are classified, is not altered.
     *
     * @param status  Error code, will be set with the reason if the operation fails.
     * @return        This, for chaining.
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &clearRecords(UErrorCode &status);


    /**  Get the number of labels in this index.
     *      Note: may trigger lazy index construction.
     *
     * @param status  Error code, will be set with the reason if the operation fails.
     * @return        The number of labels in this index, including any under, over or
     *                in-flow labels.
     * @draft ICU 4.8
     */
    virtual int32_t getBucketCount(UErrorCode &status);


    /**  Get the total number of Records in this index, that is, the number
     *   of <name, data> pairs added.
     *
     * @param status  Error code, will be set with the reason if the operation fails.
     * @return        The number of records in this index, that is, the total number
     *                of (name, data) items added with addRecord().
     * @draft ICU 4.8
     */
    virtual int32_t getRecordCount(UErrorCode &status);



    /**
     *   Given the name of a record, return the zero-based index of the Bucket
     *   in which the item should appear.  The name need not be in the index.
     *   A Record will not be added to the index by this function.
     *   Bucket numbers are zero-based, in Bucket iteration order.
     *
     * @param itemName  The name whose bucket position in the index is to be determined.
     * @param status  Error code, will be set with the reason if the operation fails.
     * @return The bucket number for this name.
     * @draft ICU 4.8
     *
     */
    virtual int32_t getBucketIndex(const UnicodeString &itemName, UErrorCode &status);


    /**
     *   Get the zero based index of the current Bucket from an iteration
     *   over the Buckets of this index.  Return -1 if no iteration is in process.
     *   @return  the index of the current Bucket
     *   @draft ICU 4.8
     */
    virtual int32_t getBucketIndex() const;


    /**
     *   Advance the iteration over the Buckets of this index.  Return FALSE if
     *   there are no more Buckets.
     *
     *   @param status  Error code, will be set with the reason if the operation fails.
     *   U_ENUM_OUT_OF_SYNC_ERROR will be reported if the index is modified while
     *   an enumeration of its contents is in process.
     *
     *   @return TRUE if success, FALSE if at end of iteration
     *   @draft ICU 4.8
     */
    virtual UBool nextBucket(UErrorCode &status);

    /**
     *   Return the name of the Label of the current bucket from an iteration over the buckets.
     *   If the iteration is before the first Bucket (nextBucket() has not been called),
     *   or after the last, return an empty string.
     *
     *   @return the bucket label.
     *   @draft ICU 4.8
     */
    virtual const UnicodeString &getBucketLabel() const;

    /**
     *  Return the type of the label for the current Bucket (selected by the
     *  iteration over Buckets.)
     *
     * @return the label type.
     * @draft ICU 4.8
     */
    virtual UAlphabeticIndexLabelType getBucketLabelType() const;

    /**
     * Get the number of <name, data> Records in the current Bucket.
     * If the current bucket iteration position is before the first label or after the
     * last, return 0.
     *
     *  @return the number of Records.
     *  @draft ICU 4.8
     */
    virtual int32_t getBucketRecordCount() const;


    /**
     *  Reset the Bucket iteration for this index.  The next call to nextBucket()
     *  will restart the iteration at the first label.
     *
     * @param status  Error code, will be set with the reason if the operation fails.
     * @return        this, for chaining.
     * @draft ICU 4.8
     */
    virtual AlphabeticIndex &resetBucketIterator(UErrorCode &status);

    /**
     * Advance to the next record in the current Bucket.
     * When nextBucket() is called, Record iteration is reset to just before the
     * first Record in the new Bucket.
     *
     *   @param status  Error code, will be set with the reason if the operation fails.
     *   U_ENUM_OUT_OF_SYNC_ERROR will be reported if the index is modified while
     *   an enumeration of its contents is in process.
     *   @return TRUE if successful, FALSE when the iteration advances past the last item.
     *   @draft ICU 4.8
     */
    virtual UBool nextRecord(UErrorCode &status);

    /**
     * Get the name of the current Record.
     * Return an empty string if the Record iteration position is before first
     * or after the last.
     *
     *  @return The name of the current index item.
     *  @draft ICU 4.8
     */
    virtual const UnicodeString &getRecordName() const;


    /**
     * Return the data pointer of the Record currently being iterated over.
     * Return NULL if the current iteration position is before the first item in this Bucket,
     * or after the last.
     *
     *  @return The current Record's data pointer.
     *  @draft ICU 4.8
     */
    virtual const void *getRecordData() const;


    /**
     * Reset the Record iterator position to before the first Record in the current Bucket.
     *
     *  @return This, for chaining.
     *  @draft ICU 4.8
     */
    virtual AlphabeticIndex &resetRecordIterator();

  private:
    // No ICU "poor man's RTTI" for this class nor its subclasses.
    virtual UClassID getDynamicClassID() const;

    /**
     * No Copy constructor.
     * @internal
     */
    AlphabeticIndex(const AlphabeticIndex &other);

    /**
     * No assignment.  Declared private; the inline body copies nothing
     * and simply returns *this.
     */
    AlphabeticIndex &operator =(const AlphabeticIndex & /*other*/) { return *this;};

    /**
     * No Equality operators.
     * @internal
     */
    virtual UBool operator==(const AlphabeticIndex& other) const;

    /**
     * Inequality operator.  Declared private, like operator==.
     * @internal
     */
    virtual UBool operator!=(const AlphabeticIndex& other) const;

    // Common initialization, for use from all constructors.
    void init(UErrorCode &status);

    // Initialize & destruct static constants used by this class.
    static void staticInit(UErrorCode &status);

    // Pinyin stuff.  If the input name is Chinese, add the Pinyin prefix to the dest string.
    void hackName(UnicodeString &dest, const UnicodeString &name, const Collator *coll);
    void initPinyinBounds(const Collator *coll, UErrorCode &status);

  public:
    /**
     *   Delete all shared (static) data associated with an AlphabeticIndex.
     *   Internal function, not intended for direct use.
     *   @internal
     */
    static void staticCleanup();
  private:

    // Add index characters from the specified locale to the dest set.
    // Does not remove any previous contents from dest.
    static void getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status);

    UVector *firstStringsInScript(UErrorCode &status);

    static UnicodeString separated(const UnicodeString &item);

    static UnicodeSet *getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status);

    void buildIndex(UErrorCode &status);
    void buildBucketList(UErrorCode &status);
    void bucketRecords(UErrorCode &status);


  public:

    // The following internal items are declared public only to allow access from
    // implementation code written in plain C.  They are not intended for
    // public use.

    /**
     * A record, or item, in the index.
     * @internal
     */
    struct Record: public UMemory {
        AlphabeticIndex     *alphaIndex_;
        const UnicodeString  name_;
        UnicodeString        sortingName_;  // Usually the same as name_; different for Pinyin.
        const void           *data_;
        int32_t              serialNumber_;  // Defines sorting order for names that compare equal.
        Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data);
        ~Record();
    };

    /**
     * Holds all user records before they are distributed into buckets.
     * Type of contents is (Record *)
     * @internal
     */
    UVector  *inputRecords_;

    /**
     * A Bucket holds an index label and references to everything belonging to that label.
     * For implementation use only.  Declared public because pure C implementation code needs access.
     * @internal
     */
    struct Bucket: public UMemory {
        UnicodeString     label_;
        UnicodeString     lowerBoundary_;
        UAlphabeticIndexLabelType labelType_;
        UVector           *records_;  // Records are owned by inputRecords_ vector.

        Bucket(const UnicodeString &label,   // Parameter strings are copied.
               const UnicodeString &lowerBoundary,
               UAlphabeticIndexLabelType type, UErrorCode &status);
        ~Bucket();
    };

  public:

    /**
     * Language Types.  For internal ICU use only.
     * @internal
     */
    enum ELangType {
        /** @internal */
        kNormal,
        /** @internal */
        kSimplified,
        /** @internal */
        kTraditional
    };

    /**
     * Get the Language Type for this Index.  Based on the locale.
     * @internal
     */
    static ELangType  langTypeFromLocale(const Locale &loc);


  private:

    // Holds the contents of this index, buckets of user items.
    // UVector elements are of type (Bucket *)
    UVector *bucketList_;

    int32_t  labelsIterIndex_;      // Index of next item to return.
    int32_t  itemsIterIndex_;
    Bucket   *currentBucket_;       // While an iteration of the index is underway,
                                    //   point to the bucket for the current label.
                                    // NULL when no iteration underway.

    UBool    indexBuildRequired_;   //  Caller has made changes to the index that
                                    //  require rebuilding & bucketing before the
                                    //  contents can be iterated.

    int32_t    maxLabelCount_;      // Limit on # of labels permitted in the index.

    UHashtable *alreadyIn_;         // Key=UnicodeString, value=UnicodeSet

    UnicodeSet *initialLabels_;     // Initial (unprocessed) set of Labels.  Union
                                    //   of those explicitly set by the user plus
                                    //   those from locales.  Raw values, before
                                    //   crunching into bucket labels.

    UVector    *labels_;            // List of Labels, after processing, sorting.
                                    //   Contents are (UnicodeString *)

    UnicodeSet *noDistinctSorting_; // As the set of labels is built, strings may
                                    // be discarded from the exemplars. This contains
                                    // some of the discards, and is
                                    // intended for debugging.

    UnicodeSet *notAlphabetic_;     // As the set of labels is built, strings may
                                    // be discarded from the exemplars. This contains
                                    // some of the discards, and is
                                    // intended for debugging.


    UVector    *firstScriptCharacters_;  // The first character from each script,
                                         //   in collation order.

    Locale    locale_;
    Collator  *collator_;
    Collator  *collatorPrimaryOnly_;

    UnicodeString  inflowLabel_;
    UnicodeString  overflowLabel_;
    UnicodeString  underflowLabel_;
    UnicodeString  overflowComparisonString_;

    ELangType      langType_;        // The language type, simplified Chinese, Traditional Chinese,
                                     //  or not Chinese (Normal).  Part of the Pinyin support

    typedef const UChar PinyinLookup[24][3];
    static PinyinLookup   HACK_PINYIN_LOOKUP_SHORT;
    static PinyinLookup   HACK_PINYIN_LOOKUP_LONG;

    // These will be lazily set to the short or long tables based on which
    //   Chinese collation has been configured into the ICU library.
    static PinyinLookup   *HACK_PINYIN_LOOKUP;
    static const UChar    *PINYIN_LOWER_BOUNDS;



    int32_t    recordCounter_;      // Counts Records created.  For minting record serial numbers.

    // Constants.  Lazily initialized the first time an AlphabeticIndex object is created.

    static UnicodeSet *ALPHABETIC;
    static UnicodeSet *CORE_LATIN;
    static UnicodeSet *ETHIOPIC;
    static UnicodeSet *HANGUL;
    static UnicodeSet *IGNORE_SCRIPTS;
    static UnicodeSet *TO_TRY;
    static UnicodeSet *UNIHAN;
    static const UnicodeString *EMPTY_STRING;

};

U_NAMESPACE_END
#endif
