Home | History | Annotate | Download | only in unicode
      1 /*
      2 ********************************************************************************
      3 *   Copyright (C) 1997-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 ********************************************************************************
      6 *
      7 * File brkiter.h
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   02/18/97    aliu        Added typedef for TextCount.  Made DONE const.
     13 *   05/07/97    aliu        Fixed DLL declaration.
     14 *   07/09/97    jfitz       Renamed BreakIterator and interface synced with JDK
     15 *   08/11/98    helena      Sync-up JDK1.2.
     16 *   01/13/2000  helena      Added UErrorCode parameter to createXXXInstance methods.
     17 ********************************************************************************
     18 */
     19 
     20 #ifndef BRKITER_H
     21 #define BRKITER_H
     22 
     23 #include "unicode/utypes.h"
     24 
     25 /**
     26  * \file
     27  * \brief C++ API: Break Iterator.
     28  */
     29 
     30 #if UCONFIG_NO_BREAK_ITERATION
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 /*
     35  * Forward declaration only: allows the declaration of APIs with pointers
     36  * to BreakIterator even when break iteration is removed from the build
         * (this branch is compiled when UCONFIG_NO_BREAK_ITERATION is non-zero).
     37  */
     38 class BreakIterator;
     39 
     40 U_NAMESPACE_END
     41 
     42 #else
     43 
     44 #include "unicode/uobject.h"
     45 #include "unicode/unistr.h"
     46 #include "unicode/chariter.h"
     47 #include "unicode/locid.h"
     48 #include "unicode/ubrk.h"
     49 #include "unicode/strenum.h"
     50 #include "unicode/utext.h"
     51 #include "unicode/umisc.h"
     52 
     53 U_NAMESPACE_BEGIN
     54 
     55 /**
     56  * The BreakIterator class implements methods for finding the location
     57  * of boundaries in text. BreakIterator is an abstract base class.
     58  * Instances of BreakIterator maintain a current position and scan over
     59  * text returning the index of characters where boundaries occur.
     60  * <p>
     61  * Line boundary analysis determines where a text string can be broken
     62  * when line-wrapping. The mechanism correctly handles punctuation and
     63  * hyphenated words.
     64  * <p>
     65  * Sentence boundary analysis allows selection with correct
     66  * interpretation of periods within numbers and abbreviations, and
     67  * trailing punctuation marks such as quotation marks and parentheses.
     68  * <p>
     69  * Word boundary analysis is used by search and replace functions, as
     70  * well as within text editing applications that allow the user to
     71  * select words with a double click. Word selection provides correct
     72  * interpretation of punctuation marks within and following
     73  * words. Characters that are not part of a word, such as symbols or
     74  * punctuation marks, have word-breaks on both sides.
     75  * <p>
     76  * Character boundary analysis allows users to interact with
     77  * characters as they expect to, for example, when moving the cursor
     78  * through a text string. Character boundary analysis provides correct
     79  * navigation through character strings, regardless of how the
     80  * character is stored.  For example, an accented character might be
     81  * stored as a base character and a diacritical mark. What users
     82  * consider to be a character can differ between languages.
     83  * <p>
     84  * The text boundary positions are found according to the rules
     85  * described in Unicode Standard Annex #29, Text Boundaries, and
     86  * Unicode Standard Annex #14, Line Breaking Properties.  These
     87  * are available at http://www.unicode.org/reports/tr14/ and
     88  * http://www.unicode.org/reports/tr29/.
     89  * <p>
     90  * In addition to the C++ API defined in this header file, a
     91  * plain C API with equivalent functionality is defined in the
     92  * file ubrk.h
     93  * <p>
     94  * Code snippets illustrating the use of the Break Iterator APIs
     95  * are available in the ICU User Guide,
     96  * http://icu-project.org/userguide/boundaryAnalysis.html
     97  * and in the sample program icu/source/samples/break/break.cpp
     98  *
     99  */
    100 class U_COMMON_API BreakIterator : public UObject {
    101 public:
    102     /**
    103      *  Destructor.
    104      *  @stable ICU 2.0
    105      */
    106     virtual ~BreakIterator();
    107 
    108     /**
    109      * Return true if another object is semantically equal to this
    110      * one. The other object should be an instance of the same subclass of
    111      * BreakIterator. Objects of different subclasses are considered
    112      * unequal.
    113      * <P>
    114      * Return true if this BreakIterator is at the same position in the
    115      * same text, and is the same class and type (word, line, etc.) of
    116      * BreakIterator, as the argument.  Text is considered the same if
    117      * it contains the same characters, it need not be the same
    118      * object, and styles are not considered.
    119      * @stable ICU 2.0
    120      */
    121     virtual UBool operator==(const BreakIterator&) const = 0;
    122 
    123     /**
    124      * Returns the complement of the result of operator==
    125      * @param rhs The BreakIterator to be compared for inequality
    126      * @return the complement of the result of operator==
    127      * @stable ICU 2.0
    128      */
    129     UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
    130 
    131     /**
    132      * Return a polymorphic copy of this object.  This is an abstract
    133      * method which subclasses implement.
    134      * @stable ICU 2.0
    135      */
    136     virtual BreakIterator* clone(void) const = 0;
    137 
    138     /**
    139      * Return a polymorphic class ID for this object. Different subclasses
    140      * will return distinct unequal values.
    141      * @stable ICU 2.0
    142      */
    143     virtual UClassID getDynamicClassID(void) const = 0;
    144 
    145     /**
    146      * Return a CharacterIterator over the text being analyzed.
    147      * @stable ICU 2.0
    148      */
    149     virtual CharacterIterator& getText(void) const = 0;
    150 
    151 
    152     /**
    153       *  Get a UText for the text being analyzed.
    154       *  The returned UText is a shallow clone of the UText used internally
    155       *  by the break iterator implementation.  It can safely be used to
    156       *  access the text without impacting any break iterator operations,
    157       *  but the underlying text itself must not be altered.
    158       *
    159       * @param fillIn A UText to be filled in.  If NULL, a new UText will be
    160       *           allocated to hold the result.
    161       * @param status receives any error codes.
    162       * @return   The current UText for this break iterator.  If an input
    163       *           UText was provided, it will always be returned.
    164       * @stable ICU 3.4
    165       */
    166      virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
    167 
    168     /**
    169      * Change the text over which this operates. The text boundary is
    170      * reset to the start.
    171      * @param text The UnicodeString used to change the text.
    172      * @stable ICU 2.0
    173      */
    174     virtual void  setText(const UnicodeString &text) = 0;
    175 
    176     /**
    177      * Reset the break iterator to operate over the text represented by
    178      * the UText.  The iterator position is reset to the start.
    179      *
    180      * This function makes a shallow clone of the supplied UText.  This means
    181      * that the caller is free to immediately close or otherwise reuse the
    182      * UText that was passed as a parameter, but that the underlying text itself
    183      * must not be altered while being referenced by the break iterator.
    184      *
    185      * @param text The UText used to change the text.
    186      * @param status receives any error codes.
    187      * @stable ICU 3.4
    188      */
    189     virtual void  setText(UText *text, UErrorCode &status) = 0;
    190 
    191     /**
    192      * Change the text over which this operates. The text boundary is
    193      * reset to the start.
    194      * Note that setText(UText *) provides similar functionality to this function,
    195      * and is more efficient.
    196      * @param it The CharacterIterator used to change the text.
    197      * @stable ICU 2.0
    198      */
    199     virtual void  adoptText(CharacterIterator* it) = 0;
    200 
    201     enum {
    202         /**
    203          * DONE is returned by previous() and next() after all valid
    204          * boundaries have been returned.
    205          * @stable ICU 2.0
    206          */
    207         DONE = (int32_t)-1
    208     };
    209 
    210     /**
    211      * Return the index of the first character in the text being scanned.
    212      * @stable ICU 2.0
    213      */
    214     virtual int32_t first(void) = 0;
    215 
    216     /**
    217      * Return the index immediately BEYOND the last character in the text being scanned.
    218      * @stable ICU 2.0
    219      */
    220     virtual int32_t last(void) = 0;
    221 
    222     /**
    223      * Return the boundary preceding the current boundary.
    224      * @return The character index of the previous text boundary or DONE if all
    225      * boundaries have been returned.
    226      * @stable ICU 2.0
    227      */
    228     virtual int32_t previous(void) = 0;
    229 
    230     /**
    231      * Return the boundary following the current boundary.
    232      * @return The character index of the next text boundary or DONE if all
    233      * boundaries have been returned.
    234      * @stable ICU 2.0
    235      */
    236     virtual int32_t next(void) = 0;
    237 
    238     /**
    239      * Return character index of the current iterator position within the text.
    240      * @return The boundary most recently returned.
    241      * @stable ICU 2.0
    242      */
    243     virtual int32_t current(void) const = 0;
    244 
    245     /**
    246      * Return the first boundary following the specified offset.
    247      * The value returned is always greater than the offset or
    248      * the value BreakIterator::DONE
    249      * @param offset the offset to begin scanning.
    250      * @return The first boundary after the specified offset.
    251      * @stable ICU 2.0
    252      */
    253     virtual int32_t following(int32_t offset) = 0;
    254 
    255     /**
    256      * Return the first boundary preceding the specified offset.
    257      * The value returned is always smaller than the offset or
    258      * the value BreakIterator::DONE
    259      * @param offset the offset to begin scanning.
    260      * @return The first boundary before the specified offset.
    261      * @stable ICU 2.0
    262      */
    263     virtual int32_t preceding(int32_t offset) = 0;
    264 
    265     /**
    266      * Return true if the specified position is a boundary position.
    267      * As a side effect, the current position of the iterator is set
    268      * to the first boundary position at or following the specified offset.
    269      * @param offset the offset to check.
    270      * @return True if "offset" is a boundary position.
    271      * @stable ICU 2.0
    272      */
    273     virtual UBool isBoundary(int32_t offset) = 0;
    274 
    275     /**
    276      * Return the nth boundary from the current boundary
    277      * @param n which boundary to return.  A value of 0
    278      * does nothing.  Negative values move to previous boundaries
    279      * and positive values move to later boundaries.
    280      * @return The index of the nth boundary from the current position, or
    281      * DONE if there are fewer than |n| boundaries in the specified direction.
    282      * @stable ICU 2.0
    283      */
    284     virtual int32_t next(int32_t n) = 0;
    285 
    286     /**
    287      * Create BreakIterator for word-breaks using the given locale.
    288      * Returns an instance of a BreakIterator implementing word breaks.
    289      * WordBreak is useful for word selection (ex. double click)
    290      * @param where the locale.
    291      * @param status the error code
    292      * @return A BreakIterator for word-breaks.  The UErrorCode& status
    293      * parameter is used to return status information to the user.
    294      * To check whether the construction succeeded or not, you should check
    295      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    296      * can check for informational error results which still indicate success.
    297      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    298      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    299      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    300      * used; neither the requested locale nor any of its fall back locales
    301      * could be found.
    302      * The caller owns the returned object and is responsible for deleting it.
    303      * @stable ICU 2.0
    304      */
    305     static BreakIterator* U_EXPORT2
    306     createWordInstance(const Locale& where, UErrorCode& status);
    307 
    308     /**
    309      * Create BreakIterator for line-breaks using specified locale.
    310      * Returns an instance of a BreakIterator implementing line breaks. Line
    311      * breaks are logically possible line breaks, actual line breaks are
    312      * usually determined based on display width.
    313      * LineBreak is useful for word wrapping text.
    314      * @param where the locale.
    315      * @param status The error code.
    316      * @return A BreakIterator for line-breaks.  The UErrorCode& status
    317      * parameter is used to return status information to the user.
    318      * To check whether the construction succeeded or not, you should check
    319      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    320      * can check for informational error results which still indicate success.
    321      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    322      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    323      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    324      * used; neither the requested locale nor any of its fall back locales
    325      * could be found.
    326      * The caller owns the returned object and is responsible for deleting it.
    327      * @stable ICU 2.0
    328      */
    329     static BreakIterator* U_EXPORT2
    330     createLineInstance(const Locale& where, UErrorCode& status);
    331 
    332     /**
    333      * Create BreakIterator for character-breaks using specified locale
    334      * Returns an instance of a BreakIterator implementing character breaks.
    335      * Character breaks are boundaries of combining character sequences.
    336      * @param where the locale.
    337      * @param status The error code.
    338      * @return A BreakIterator for character-breaks.  The UErrorCode& status
    339      * parameter is used to return status information to the user.
    340      * To check whether the construction succeeded or not, you should check
    341      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    342      * can check for informational error results which still indicate success.
    343      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    344      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    345      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    346      * used; neither the requested locale nor any of its fall back locales
    347      * could be found.
    348      * The caller owns the returned object and is responsible for deleting it.
    349      * @stable ICU 2.0
    350      */
    351     static BreakIterator* U_EXPORT2
    352     createCharacterInstance(const Locale& where, UErrorCode& status);
    353 
    354     /**
    355      * Create BreakIterator for sentence-breaks using specified locale
    356      * Returns an instance of a BreakIterator implementing sentence breaks.
    357      * @param where the locale.
    358      * @param status The error code.
    359      * @return A BreakIterator for sentence-breaks.  The UErrorCode& status
    360      * parameter is used to return status information to the user.
    361      * To check whether the construction succeeded or not, you should check
    362      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    363      * can check for informational error results which still indicate success.
    364      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    365      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    366      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    367      * used; neither the requested locale nor any of its fall back locales
    368      * could be found.
    369      * The caller owns the returned object and is responsible for deleting it.
    370      * @stable ICU 2.0
    371      */
    372     static BreakIterator* U_EXPORT2
    373     createSentenceInstance(const Locale& where, UErrorCode& status);
    374 
    375     /**
    376      * Create BreakIterator for title-casing breaks using the specified locale
    377      * Returns an instance of a BreakIterator implementing title breaks.
    378      * The iterator returned locates title boundaries as described for
    379      * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
    380      * please use a word boundary iterator. {@link #createWordInstance }
    381      *
    382      * @param where the locale.
    383      * @param status The error code.
    384      * @return A BreakIterator for title-breaks.  The UErrorCode& status
    385      * parameter is used to return status information to the user.
    386      * To check whether the construction succeeded or not, you should check
    387      * the value of U_SUCCESS(status).  If you wish more detailed information, you
    388      * can check for informational error results which still indicate success.
    389      * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
    390      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
    391      * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
    392      * used; neither the requested locale nor any of its fall back locales
    393      * could be found.
    394      * The caller owns the returned object and is responsible for deleting it.
    395      * @stable ICU 2.1
    396      */
    397     static BreakIterator* U_EXPORT2
    398     createTitleInstance(const Locale& where, UErrorCode& status);
    399 
    400     /**
    401      * Get the set of Locales for which TextBoundaries are installed.
    402      * <p><b>Note:</b> this will not return locales added through the register
    403      * call. To see the registered locales too, use the getAvailableLocales
    404      * function that returns a StringEnumeration object </p>
    405      * @param count the output parameter of number of elements in the locale list
    406      * @return available locales
    407      * @stable ICU 2.0
    408      */
    409     static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
    410 
    411     /**
    412      * Get name of the object for the desired Locale, in the desired language.
    413      * @param objectLocale must be from getAvailableLocales.
    414      * @param displayLocale specifies the desired locale for output.
    415      * @param name the fill-in parameter of the return value
    416      * Uses best match.
    417      * @return user-displayable name
    418      * @stable ICU 2.0
    419      */
    420     static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
    421                                          const Locale& displayLocale,
    422                                          UnicodeString& name);
    423 
    424     /**
    425      * Get name of the object for the desired Locale, in the language of the
    426      * default locale.
    427      * @param objectLocale must be from getAvailableLocales
    428      * @param name the fill-in parameter of the return value
    429      * @return user-displayable name
    430      * @stable ICU 2.0
    431      */
    432     static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
    433                                          UnicodeString& name);
    434 
    435     /**
    436      * Thread safe client-buffer-based cloning operation
    437      *    Do NOT call delete on a safeclone, since 'new' is not used to create it.
    438      * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
    439      * If buffer is not large enough, new memory will be allocated.
    440      * @param BufferSize reference to size of allocated space.
    441      * If BufferSize == 0, a sufficient size for use in cloning will
    442      * be returned ('pre-flighting')
    443      * If BufferSize is not enough for a stack-based safe clone,
    444      * new memory will be allocated.
    445      * @param status to indicate whether the operation went on smoothly or there were errors
    446      *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
    447      *  necessary.
    448      * @return pointer to the new clone
    449      *
    450      * @stable ICU 2.0
    451      */
    452     virtual BreakIterator *  createBufferClone(void *stackBuffer,
    453                                                int32_t &BufferSize,
    454                                                UErrorCode &status) = 0;
    455 
    456     /**
    457      *   Determine whether the BreakIterator was created in user memory by
    458      *   createBufferClone(), and thus should not be deleted.  Such objects
    459      *   must be closed by an explicit call to the destructor (not delete).
    460      *  @stable ICU 2.0
    461      */
    462     inline UBool isBufferClone(void);
    463 
    464 #if !UCONFIG_NO_SERVICE
    465     /**
    466      * Register a new break iterator of the indicated kind, to use in the given locale.
    467      * The break iterator will be adopted.  Clones of the iterator will be returned
    468      * if a request for a break iterator of the given kind matches or falls back to
    469      * this locale.
    470      * @param toAdopt the BreakIterator instance to be adopted
    471      * @param locale the Locale for which this instance is to be registered
    472      * @param kind the type of iterator for which this instance is to be registered
    473      * @param status the in/out status code, no special meanings are assigned
    474      * @return a registry key that can be used to unregister this instance
    475      * @stable ICU 2.4
    476      */
    477     static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
    478                                         const Locale& locale,
    479                                         UBreakIteratorType kind,
    480                                         UErrorCode& status);
    481 
    482     /**
    483      * Unregister a previously-registered BreakIterator using the key returned from the
    484      * register call.  Key becomes invalid after a successful call and should not be used again.
    485      * The BreakIterator corresponding to the key will be deleted.
    486      * @param key the registry key returned by a previous call to registerInstance
    487      * @param status the in/out status code, no special meanings are assigned
    488      * @return TRUE if the iterator for the key was successfully unregistered
    489      * @stable ICU 2.4
    490      */
    491     static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
    492 
    493     /**
    494      * Return a StringEnumeration over the locales available at the time of the call,
    495      * including registered locales.
    496      * @return a StringEnumeration over the locales available at the time of the call
    497      * @stable ICU 2.4
    498      */
    499     static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
    500 #endif
    501 
    502     /**
    503      * Returns the locale for this break iterator. Two flavors are available: valid and
    504      * actual locale.
             * @param type type of the locale we're looking for (valid or actual)
             * @param status error code for the operation
             * @return the locale
    505      * @stable ICU 2.8
    506      */
    507     Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
    508 
    509     /** Get the locale for this break iterator object. You can choose between valid and actual locale.
    510      *  @param type type of the locale we're looking for (valid or actual)
    511      *  @param status error code for the operation
    512      *  @return the locale
    513      *  @internal
    514      */
    515     const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
    516 
    517  private:
    518     static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
    519     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
    520     static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
    521 
    522     friend class ICUBreakIteratorFactory;
    523     friend class ICUBreakIteratorService;
    524 
    525 protected:
    526     /** @internal */
    527     BreakIterator();
    528     /** @internal Flag reported by isBufferClone(). */
    529     UBool fBufferClone;
    530     /**
             * Copy constructor for use by subclasses.  The new object is never
             * marked as a buffer clone (fBufferClone starts out FALSE).  The
             * actualLocale and validLocale buffers are copied from the source;
             * previously they were left uninitialized in the copy.
             * @param other the iterator to copy from
             * @internal
             */
    531     BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {
                // Arrays cannot be copied in the member-initializer list, so copy
                // both equally sized locale buffers element by element.
                for (int32_t i = 0; i < ULOC_FULLNAME_CAPACITY; ++i) {
                    actualLocale[i] = other.actualLocale[i];
                    validLocale[i] = other.validLocale[i];
                }
            }
    532 
    533 private:
    534 
    535     /** @internal Locale ID buffers; presumably what getLocale()/getLocaleID() report -- TODO confirm in brkiter.cpp. */
    536     char actualLocale[ULOC_FULLNAME_CAPACITY];
    537     char validLocale[ULOC_FULLNAME_CAPACITY];
    538 
    539     /**
    540      * The assignment operator has no real implementation.
    541      * It's provided to make the compiler happy. Do not call.
    542      */
    543     BreakIterator& operator=(const BreakIterator&);
    544 };
    545 
        /* Returns the fBufferClone flag stored in this object. */
    546 inline UBool BreakIterator::isBufferClone()
    547 {
    548     return fBufferClone;
    549 }
    550 
    551 U_NAMESPACE_END
    552 
    553 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    554 
    555 #endif // BRKITER_H
    556 //eof
    557 
    558