Home | History | Annotate | Download | only in unicode
      1 /*
      2 ********************************************************************************
      3 *   Copyright (C) 1997-2013, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 ********************************************************************************
      6 *
      7 * File brkiter.h
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   02/18/97    aliu        Added typedef for TextCount.  Made DONE const.
     13 *   05/07/97    aliu        Fixed DLL declaration.
     14 *   07/09/97    jfitz       Renamed BreakIterator and interface synced with JDK
     15 *   08/11/98    helena      Sync-up JDK1.2.
     16 *   01/13/2000  helena      Added UErrorCode parameter to createXXXInstance methods.
     17 ********************************************************************************
     18 */
     19 
     20 #ifndef BRKITER_H
     21 #define BRKITER_H
     22 
     23 #include "unicode/utypes.h"
     24 
     25 /**
     26  * \file
     27  * \brief C++ API: Break Iterator.
     28  */
     29 
     30 #if UCONFIG_NO_BREAK_ITERATION
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 /*
     35  * Allow the declaration of APIs with pointers to BreakIterator
     36  * even when break iteration is removed from the build.
     37  */
     38 class BreakIterator;
     39 
     40 U_NAMESPACE_END
     41 
     42 #else
     43 
     44 #include "unicode/uobject.h"
     45 #include "unicode/unistr.h"
     46 #include "unicode/chariter.h"
     47 #include "unicode/locid.h"
     48 #include "unicode/ubrk.h"
     49 #include "unicode/strenum.h"
     50 #include "unicode/utext.h"
     51 #include "unicode/umisc.h"
     52 
     53 U_NAMESPACE_BEGIN
     54 
     55 /**
     56  * The BreakIterator class implements methods for finding the location
     57  * of boundaries in text. BreakIterator is an abstract base class.
     58  * Instances of BreakIterator maintain a current position and scan over
     59  * text returning the index of characters where boundaries occur.
     60  * <p>
     61  * Line boundary analysis determines where a text string can be broken
     62  * when line-wrapping. The mechanism correctly handles punctuation and
     63  * hyphenated words.
     64  * <p>
     65  * Sentence boundary analysis allows selection with correct
     66  * interpretation of periods within numbers and abbreviations, and
     67  * trailing punctuation marks such as quotation marks and parentheses.
     68  * <p>
     69  * Word boundary analysis is used by search and replace functions, as
     70  * well as within text editing applications that allow the user to
     71  * select words with a double click. Word selection provides correct
     72  * interpretation of punctuation marks within and following
     73  * words. Characters that are not part of a word, such as symbols or
     74  * punctuation marks, have word-breaks on both sides.
     75  * <p>
     76  * Character boundary analysis allows users to interact with
     77  * characters as they expect to, for example, when moving the cursor
     78  * through a text string. Character boundary analysis provides correct
     79  * navigation through character strings, regardless of how the
     80  * character is stored.  For example, an accented character might be
     81  * stored as a base character and a diacritical mark. What users
     82  * consider to be a character can differ between languages.
     83  * <p>
     84  * The text boundary positions are found according to the rules
     85  * described in Unicode Standard Annex #29, Text Boundaries, and
     86  * Unicode Standard Annex #14, Line Breaking Properties.  These
     87  * are available at http://www.unicode.org/reports/tr14/ and
     88  * http://www.unicode.org/reports/tr29/.
     89  * <p>
     90  * In addition to the C++ API defined in this header file, a
     91  * plain C API with equivalent functionality is defined in the
     92  * file ubrk.h
     93  * <p>
     94  * Code snippets illustrating the use of the Break Iterator APIs
     95  * are available in the ICU User Guide,
     96  * http://icu-project.org/userguide/boundaryAnalysis.html
     97  * and in the sample program icu/source/samples/break/break.cpp
     98  *
     99  */
    100 class U_COMMON_API BreakIterator : public UObject {
    101 public:
    102     /**
    103      *  destructor
    104      *  @stable ICU 2.0
    105      */
    106     virtual ~BreakIterator();
    107 
    108     /**
    109      * Return true if another object is semantically equal to this
    110      * one. The other object should be an instance of the same subclass of
    111      * BreakIterator. Objects of different subclasses are considered
    112      * unequal.
    113      * <P>
    114      * Return true if this BreakIterator is at the same position in the
    115      * same text, and is the same class and type (word, line, etc.) of
    116      * BreakIterator, as the argument.  Text is considered the same if
    117      * it contains the same characters, it need not be the same
    118      * object, and styles are not considered.
    119      * @stable ICU 2.0
    120      */
    121     virtual UBool operator==(const BreakIterator&) const = 0;
    122 
    123     /**
    124      * Inequality test, defined as the logical negation of operator==.
    125      * @param rhs the BreakIterator to compare against this one
    126      * @return whatever operator== reports for rhs, negated
    127      * @stable ICU 2.0
    128      */
    129     UBool operator!=(const BreakIterator& rhs) const {
                const UBool isEqual = this->operator==(rhs);
                return !isEqual;
            }
    130 
    131     /**
    132      * Return a polymorphic copy of this object.  This is an abstract
    133      * method which subclasses implement.
    134      * @stable ICU 2.0
    135      */
    136     virtual BreakIterator* clone(void) const = 0;
    137 
    138     /**
    139      * Return a polymorphic class ID for this object. Different subclasses
    140      * will return distinct unequal values.
    141      * @stable ICU 2.0
    142      */
    143     virtual UClassID getDynamicClassID(void) const = 0;
    144 
    145     /**
    146      * Return a CharacterIterator over the text being analyzed.
    147      * @stable ICU 2.0
    148      */
    149     virtual CharacterIterator& getText(void) const = 0;
    150 
    151 
    152     /**
    153       *  Get a UText for the text being analyzed.
    154       *  The returned UText is a shallow clone of the UText used internally
    155       *  by the break iterator implementation.  It can safely be used to
    156       *  access the text without impacting any break iterator operations,
    157       *  but the underlying text itself must not be altered.
    158       *
    159       * @param fillIn A UText to be filled in.  If NULL, a new UText will be
    160       *           allocated to hold the result.
    161       * @param status receives any error codes.
    162       * @return   The current UText for this break iterator.  If an input
    163       *           UText was provided, it will always be returned.
    164       * @stable ICU 3.4
    165       */
    166      virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
    167 
    168     /**
    169      * Change the text over which this operates. The text boundary is
    170      * reset to the start.
    171      * @param text The UnicodeString used to change the text.
    172      * @stable ICU 2.0
    173      */
    174     virtual void  setText(const UnicodeString &text) = 0;
    175 
    176     /**
    177      * Reset the break iterator to operate over the text represented by
    178      * the UText.  The iterator position is reset to the start.
    179      *
    180      * This function makes a shallow clone of the supplied UText.  This means
    181      * that the caller is free to immediately close or otherwise reuse the
    182      * UText that was passed as a parameter, but that the underlying text itself
    183      * must not be altered while being referenced by the break iterator.
    184      *
    185      * All index positions returned by break iterator functions are
    186      * native indices from the UText. For example, when breaking UTF-8
    187      * encoded text, the break positions returned by next(), previous(), etc.
    188      * will be UTF-8 string indices, not UTF-16 positions.
    189      *
    190      * @param text The UText used to change the text.
    191      * @param status receives any error codes.
    192      * @stable ICU 3.4
    193      */
    194     virtual void  setText(UText *text, UErrorCode &status) = 0;
    195 
    196     /**
    197      * Change the text over which this operates. The text boundary is
    198      * reset to the start.
    199      * Note that setText(UText *) provides similar functionality to this function,
    200      * and is more efficient.
    201      * @param it The CharacterIterator used to change the text.
    202      * @stable ICU 2.0
    203      */
    204     virtual void  adoptText(CharacterIterator* it) = 0;
    205 
    206     enum {
    207         /**
    208          * DONE is returned by previous() and next() after all valid
    209          * boundaries have been returned.
    210          * @stable ICU 2.0
    211          */
    212         DONE = (int32_t)-1
    213     };
    214 
    215     /**
    216      * Set the iterator position to the index of the first character in the text being scanned.
    217      * @return The index of the first character in the text being scanned.
    218      * @stable ICU 2.0
    219      */
    220     virtual int32_t first(void) = 0;
    221 
    222     /**
    223      * Set the iterator position to the index immediately BEYOND the last character in the text being scanned.
    224      * @return The index immediately BEYOND the last character in the text being scanned.
    225      * @stable ICU 2.0
    226      */
    227     virtual int32_t last(void) = 0;
    228 
    229     /**
    230      * Set the iterator position to the boundary preceding the current boundary.
    231      * @return The character index of the previous text boundary or DONE if all
    232      * boundaries have been returned.
    233      * @stable ICU 2.0
    234      */
    235     virtual int32_t previous(void) = 0;
    236 
    237     /**
    238      * Advance the iterator to the boundary following the current boundary.
    239      * @return The character index of the next text boundary or DONE if all
    240      * boundaries have been returned.
    241      * @stable ICU 2.0
    242      */
    243     virtual int32_t next(void) = 0;
    244 
    245     /**
    246      * Return the character index of the current iterator position within the text.
    247      * @return The boundary most recently returned.
    248      * @stable ICU 2.0
    249      */
    250     virtual int32_t current(void) const = 0;
    251 
    252     /**
    253      * Advance the iterator to the first boundary following the specified offset.
    254      * The value returned is always greater than the offset or
    255      * the value BreakIterator::DONE.
    256      * @param offset the offset to begin scanning.
    257      * @return The first boundary after the specified offset.
    258      * @stable ICU 2.0
    259      */
    260     virtual int32_t following(int32_t offset) = 0;
    261 
    262     /**
    263      * Set the iterator position to the first boundary preceding the specified offset.
    264      * The value returned is always smaller than the offset or
    265      * the value BreakIterator::DONE.
    266      * @param offset the offset to begin scanning.
    267      * @return The first boundary before the specified offset.
    268      * @stable ICU 2.0
    269      */
    270     virtual int32_t preceding(int32_t offset) = 0;
    271 
    272     /**
    273      * Return true if the specified position is a boundary position.
    274      * As a side effect, the current position of the iterator is set
    275      * to the first boundary position at or following the specified offset.
    276      * @param offset the offset to check.
    277      * @return True if "offset" is a boundary position.
    278      * @stable ICU 2.0
    279      */
    280     virtual UBool isBoundary(int32_t offset) = 0;
    281 
    282     /**
    283      * Set the iterator position to the nth boundary from the current boundary
    284      * @param n the number of boundaries to move by.  A value of 0
    285      * does nothing.  Negative values move to previous boundaries
    286      * and positive values move to later boundaries.
    287      * @return The new iterator position, or
    288      * DONE if there are fewer than |n| boundaries in the specified direction.
    289      * @stable ICU 2.0
    290      */
    291     virtual int32_t next(int32_t n) = 0;
    292 
    293     /**
    294      * Create BreakIterator for word-breaks using the given locale.
    295      * Returns an instance of a BreakIterator implementing word breaks.
    296      * WordBreak is useful for word selection (ex. double click)
    297      * @param where the locale.
    298      * @param status the error code
    299      * @return A BreakIterator for word-breaks.  The UErrorCode& status
    300      * parameter is used to return status information to the user.
    301      * To check whether the construction succeeded or not, you should check
    302      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    303      * can check for informational error results which still indicate success.
    304      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    305      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    306      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    307      * used; neither the requested locale nor any of its fall back locales
    308      * could be found.
    309      * The caller owns the returned object and is responsible for deleting it.
    310      * @stable ICU 2.0
    311      */
    312     static BreakIterator* U_EXPORT2
    313     createWordInstance(const Locale& where, UErrorCode& status);
    314 
    315     /**
    316      * Create BreakIterator for line-breaks using specified locale.
    317      * Returns an instance of a BreakIterator implementing line breaks. Line
    318      * breaks are logically possible line breaks, actual line breaks are
    319      * usually determined based on display width.
    320      * LineBreak is useful for word wrapping text.
    321      * @param where the locale.
    322      * @param status The error code.
    323      * @return A BreakIterator for line-breaks.  The UErrorCode& status
    324      * parameter is used to return status information to the user.
    325      * To check whether the construction succeeded or not, you should check
    326      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    327      * can check for informational error results which still indicate success.
    328      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    329      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    330      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    331      * used; neither the requested locale nor any of its fall back locales
    332      * could be found.
    333      * The caller owns the returned object and is responsible for deleting it.
    334      * @stable ICU 2.0
    335      */
    336     static BreakIterator* U_EXPORT2
    337     createLineInstance(const Locale& where, UErrorCode& status);
    338 
    339     /**
    340      * Create BreakIterator for character-breaks using specified locale
    341      * Returns an instance of a BreakIterator implementing character breaks.
    342      * Character breaks are boundaries of combining character sequences.
    343      * @param where the locale.
    344      * @param status The error code.
    345      * @return A BreakIterator for character-breaks.  The UErrorCode& status
    346      * parameter is used to return status information to the user.
    347      * To check whether the construction succeeded or not, you should check
    348      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    349      * can check for informational error results which still indicate success.
    350      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    351      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    352      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    353      * used; neither the requested locale nor any of its fall back locales
    354      * could be found.
    355      * The caller owns the returned object and is responsible for deleting it.
    356      * @stable ICU 2.0
    357      */
    358     static BreakIterator* U_EXPORT2
    359     createCharacterInstance(const Locale& where, UErrorCode& status);
    360 
    361     /**
    362      * Create BreakIterator for sentence-breaks using specified locale
    363      * Returns an instance of a BreakIterator implementing sentence breaks.
    364      * @param where the locale.
    365      * @param status The error code.
    366      * @return A BreakIterator for sentence-breaks.  The UErrorCode& status
    367      * parameter is used to return status information to the user.
    368      * To check whether the construction succeeded or not, you should check
    369      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    370      * can check for informational error results which still indicate success.
    371      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    372      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    373      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    374      * used; neither the requested locale nor any of its fall back locales
    375      * could be found.
    376      * The caller owns the returned object and is responsible for deleting it.
    377      * @stable ICU 2.0
    378      */
    379     static BreakIterator* U_EXPORT2
    380     createSentenceInstance(const Locale& where, UErrorCode& status);
    381 
    382     /**
    383      * Create BreakIterator for title-casing breaks using the specified locale
    384      * Returns an instance of a BreakIterator implementing title breaks.
    385      * The iterator returned locates title boundaries as described for
    386      * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
    387      * please use the word boundary iterator; see createWordInstance().
    388      *
    389      * @param where the locale.
    390      * @param status The error code.
    391      * @return A BreakIterator for title-breaks.  The UErrorCode& status
    392      * parameter is used to return status information to the user.
    393      * To check whether the construction succeeded or not, you should check
    394      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    395      * can check for informational error results which still indicate success.
    396      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    397      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    398      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    399      * used; neither the requested locale nor any of its fall back locales
    400      * could be found.
    401      * The caller owns the returned object and is responsible for deleting it.
    402      * @stable ICU 2.1
    403      */
    404     static BreakIterator* U_EXPORT2
    405     createTitleInstance(const Locale& where, UErrorCode& status);
    406 
    407     /**
    408      * Get the set of Locales for which TextBoundaries are installed.
    409      * <p><b>Note:</b> this will not return locales added through the register
    410      * call. To see the registered locales too, use the getAvailableLocales
    411      * function that returns a StringEnumeration object </p>
    412      * @param count the output parameter of number of elements in the locale list
    413      * @return available locales
    414      * @stable ICU 2.0
    415      */
    416     static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
    417 
    418     /**
    419      * Get name of the object for the desired Locale, in the desired language.
    420      * @param objectLocale must be from getAvailableLocales.
    421      * @param displayLocale specifies the desired locale for output.
    422      * @param name the fill-in parameter of the return value
    423      * Uses best match.
    424      * @return user-displayable name
    425      * @stable ICU 2.0
    426      */
    427     static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
    428                                          const Locale& displayLocale,
    429                                          UnicodeString& name);
    430 
    431     /**
    432      * Get name of the object for the desired Locale, in the language of the
    433      * default locale.
    434      * @param objectLocale must be from getAvailableLocales.
    435      * @param name the fill-in parameter of the return value
    436      * @return user-displayable name
    437      * @stable ICU 2.0
    438      */
    439     static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
    440                                          UnicodeString& name);
    441 
    442     /**
    443      * Thread safe client-buffer-based cloning operation
    444      *    Do NOT call delete on a safeclone, since 'new' is not used to create it.
    445      * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
    446      * If buffer is not large enough, new memory will be allocated.
    447      * @param BufferSize reference to size of allocated space.
    448      * If BufferSize == 0, a sufficient size for use in cloning will
    449      * be returned ('pre-flighting')
    450      * If BufferSize is not enough for a stack-based safe clone,
    451      * new memory will be allocated.
    452      * @param status to indicate whether the operation went on smoothly or there were errors
    453      *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
    454      *  necessary.
    455      * @return pointer to the new clone
    456      *
    457      * @stable ICU 2.0
    458      */
    459     virtual BreakIterator *  createBufferClone(void *stackBuffer,
    460                                                int32_t &BufferSize,
    461                                                UErrorCode &status) = 0;
    462 
    463     /**
    464      *   Determine whether the BreakIterator was created in user memory by
    465      *   createBufferClone(), and thus should not be deleted.  Such objects
    466      *   must be closed by an explicit call to the destructor (not delete).
    467      *  @stable ICU 2.0
    468      */
    469     inline UBool isBufferClone(void);
    470 
    471 #if !UCONFIG_NO_SERVICE
    472     /**
    473      * Register a new break iterator of the indicated kind, to use in the given locale.
    474      * The break iterator will be adopted.  Clones of the iterator will be returned
    475      * if a request for a break iterator of the given kind matches or falls back to
    476      * this locale.
    477      * @param toAdopt the BreakIterator instance to be adopted
    478      * @param locale the Locale for which this instance is to be registered
    479      * @param kind the type of iterator for which this instance is to be registered
    480      * @param status the in/out status code, no special meanings are assigned
    481      * @return a registry key that can be used to unregister this instance
    482      * @stable ICU 2.4
    483      */
    484     static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
    485                                         const Locale& locale,
    486                                         UBreakIteratorType kind,
    487                                         UErrorCode& status);
    488 
    489     /**
    490      * Unregister a previously-registered BreakIterator using the key returned from the
    491      * register call.  Key becomes invalid after a successful call and should not be used again.
    492      * The BreakIterator corresponding to the key will be deleted.
    493      * @param key the registry key returned by a previous call to registerInstance
    494      * @param status the in/out status code, no special meanings are assigned
    495      * @return TRUE if the iterator for the key was successfully unregistered
    496      * @stable ICU 2.4
    497      */
    498     static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
    499 
    500     /**
    501      * Return a StringEnumeration over the locales available at the time of the call,
    502      * including registered locales.
    503      * @return a StringEnumeration over the locales available at the time of the call
    504      * @stable ICU 2.4
    505      */
    506     static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
    507 #endif
    508 
    509     /**
    510      * Returns the locale for this break iterator. Two flavors are available: valid and
    511      * actual locale.
    512      * @stable ICU 2.8
    513      */
    514     Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
    515 
    516 #ifndef U_HIDE_INTERNAL_API
    517     /** Get the locale for this break iterator object. You can choose between valid and actual locale.
    518      *  @param type type of the locale we're looking for (valid or actual)
    519      *  @param status error code for the operation
    520      *  @return the locale
    521      *  @internal
    522      */
    523     const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
    524 #endif  /* U_HIDE_INTERNAL_API */
    525 
    526     /**
    527      *  Set the subject text string upon which the break iterator is operating
    528      *  without changing any other aspect of the matching state.
    529      *  The new and previous text strings must have the same content.
    530      *
    531      *  This function is intended for use in environments where ICU is operating on
    532      *  strings that may move around in memory.  It provides a mechanism for notifying
    533      *  ICU that the string has been relocated, and providing a new UText to access the
    534      *  string in its new position.
    535      *
    536      *  Note that the break iterator implementation never copies the underlying text
    537      *  of a string being processed, but always operates directly on the original text
    538      *  provided by the user. Refreshing simply drops the references to the old text
    539      *  and replaces them with references to the new.
    540      *
    541      *  Caution:  this function is normally used only by very specialized,
    542      *  system-level code.  One example use case is with garbage collection that moves
    543      *  the text in memory.
    544      *
    545      * @param input      The new (moved) text string.
    546      * @param status     Receives errors detected by this function.
    547      * @return           *this
    548      *
    549      * @stable ICU 49
    550      */
    551     virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
    552 
    553  private:
    554     static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
    555     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
    556     static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
    557 
    558     friend class ICUBreakIteratorFactory;
    559     friend class ICUBreakIteratorService;
    560 
    561 protected:
    562     // Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
    563     // or else the compiler will create public ones.
    564     /** @internal */
    565     BreakIterator();
    566     /** @internal */
    567     UBool fBufferClone;
    568     /**
             * Copy constructor, for use by subclasses.
             * The new object always starts with fBufferClone cleared. The locale IDs
             * held in actualLocale and validLocale are copied from the source so that
             * they are not left uninitialized in the copy.
             * @param other the iterator being copied
             * @internal
             */
    569     BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {
                // Bounded string copies, written out by hand so that this header
                // needs no additional #include. Each copy stops at the source's
                // terminator (or at capacity - 1) and always NUL-terminates.
                int32_t i = 0;
                for (; i < ULOC_FULLNAME_CAPACITY - 1 && other.actualLocale[i] != 0; ++i) {
                    actualLocale[i] = other.actualLocale[i];
                }
                actualLocale[i] = 0;
                for (i = 0; i < ULOC_FULLNAME_CAPACITY - 1 && other.validLocale[i] != 0; ++i) {
                    validLocale[i] = other.validLocale[i];
                }
                validLocale[i] = 0;
            }
    570 
    571 private:
    572 
    573     /** @internal */
    574     char actualLocale[ULOC_FULLNAME_CAPACITY];
    575     char validLocale[ULOC_FULLNAME_CAPACITY];
    576 
    577     /**
    578      * The assignment operator has no real implementation.
    579      * It's provided to make the compiler happy. Do not call.
    580      */
    581     BreakIterator& operator=(const BreakIterator&);
    582 };
    583 
    584 inline UBool BreakIterator::isBufferClone()
    585 {
                // Plain accessor: report the flag stored in fBufferClone.
    586     const UBool bufferCloneFlag = fBufferClone;
                return bufferCloneFlag;
    587 }
    588 
    589 U_NAMESPACE_END
    590 
    591 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    592 
    593 #endif // BRKITER_H
    594 //eof
    595 
    596