Home | History | Annotate | Download | only in unicode
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1996-2015, International Business Machines Corporation and others.
      4 * All Rights Reserved.
      5 ******************************************************************************
      6 */
      7 
      8 #ifndef UBRK_H
      9 #define UBRK_H
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uloc.h"
     13 #include "unicode/utext.h"
     14 #include "unicode/localpointer.h"
     15 
     16 /**
     17  * A text-break iterator.
     18  *  For usage in C programs.
     19  */
     20 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
     21 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
     22     /**
     23      *  Opaque type representing an ICU Break iterator object.
     24      *  @stable ICU 2.0
     25      */
     26     typedef struct UBreakIterator UBreakIterator;
     27 #endif
     28 
     29 #if !UCONFIG_NO_BREAK_ITERATION
     30 
     31 #include "unicode/parseerr.h"
     32 
     33 /**
     34  * \file
     35  * \brief C API: BreakIterator
     36  *
     37  * <h2> BreakIterator C API </h2>
     38  *
     39  * The BreakIterator C API defines methods for finding the location
     40  * of boundaries in text. A UBreakIterator maintains a
     41  * current position and scans over text, returning the index of characters
     42  * where boundaries occur.
     43  * <p>
     44  * Line boundary analysis determines where a text string can be broken
     45  * when line-wrapping. The mechanism correctly handles punctuation and
     46  * hyphenated words.
     47  * <p>
     48  * Note: The locale keyword "lb" can be used to modify line break
     49  * behavior according to the CSS level 3 line-break options, see
     50  * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
     51  * "ja@lb=strict", "zh@lb=loose".
     52  * <p>
     53  * Sentence boundary analysis allows selection with correct
     54  * interpretation of periods within numbers and abbreviations, and
     55  * trailing punctuation marks such as quotation marks and parentheses.
     56  * <p>
     57  * Note: The locale keyword "ss" can be used to enable use of
     58  * segmentation suppression data (preventing breaks in English after
     59  * abbreviations such as "Mr." or "Est.", for example), as follows:
     60  * "en@ss=standard".
     61  * <p>
     62  * Word boundary analysis is used by search and replace functions, as
     63  * well as within text editing applications that allow the user to
     64  * select words with a double click. Word selection provides correct
     65  * interpretation of punctuation marks within and following
     66  * words. Characters that are not part of a word, such as symbols or
     67  * punctuation marks, have word-breaks on both sides.
     68  * <p>
     69  * Character boundary analysis identifies the boundaries of
     70  * "Extended Grapheme Clusters", which are groupings of codepoints
     71  * that should be treated as character-like units for many text operations.
     72  * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
     73  * http://www.unicode.org/reports/tr29/ for additional information
     74  * on grapheme clusters and guidelines on their use.
     75  * <p>
     76  * Title boundary analysis locates all positions,
     77  * typically starts of words, that should be set to Title Case
     78  * when title casing the text.
     79  * <p>
     80  * The text boundary positions are found according to the rules
     81  * described in Unicode Standard Annex #29, Unicode Text Segmentation, and
     82  * Unicode Standard Annex #14, Line Breaking Properties.  These
     83  * are available at http://www.unicode.org/reports/tr29/ and
     84  * http://www.unicode.org/reports/tr14/.
     85  * <p>
     86  * In addition to the plain C API defined in this header file, an
     87  * object oriented C++ API with equivalent functionality is defined in the
     88  * file brkiter.h.
     89  * <p>
     90  * Code snippets illustrating the use of the Break Iterator APIs
     91  * are available in the ICU User Guide,
     92  * http://icu-project.org/userguide/boundaryAnalysis.html
     93  * and in the sample program icu/source/samples/break/break.cpp
     94  */
     95 
     96 /** The possible types of text boundaries; passed as the type argument of ubrk_open().  @stable ICU 2.0 */
     97 typedef enum UBreakIteratorType {
     98   /** Character breaks  @stable ICU 2.0 */
     99   UBRK_CHARACTER = 0,
    100   /** Word breaks @stable ICU 2.0 */
    101   UBRK_WORD = 1,
    102   /** Line breaks @stable ICU 2.0 */
    103   UBRK_LINE = 2,
    104   /** Sentence breaks @stable ICU 2.0 */
    105   UBRK_SENTENCE = 3,
    106 
    107 #ifndef U_HIDE_DEPRECATED_API
    108   /**
    109    * Title Case breaks.
    110    * The iterator created using this type locates title boundaries as described for
    111    * Unicode 3.2 only. For title boundary iteration with Unicode 4.0 and above,
    112    * please use the word boundary iterator (UBRK_WORD).
    113    *
    114    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
    115    */
    116   UBRK_TITLE = 4,
    117 #endif /* U_HIDE_DEPRECATED_API */
    118   UBRK_COUNT = 5 /**< One more than the highest type value; stays 5 even when UBRK_TITLE is hidden. */
    119 } UBreakIteratorType;
    120 
    121 /** Value returned by ubrk_next(), ubrk_previous(), etc. to indicate that all text boundaries have been returned.
    122  *  @stable ICU 2.0
    123  */
    124 #define UBRK_DONE ((int32_t) -1)
    125 
    126 
    127 /**
    128  *  Enum constants for the word break tags returned by
    129  *  ubrk_getRuleStatus().  A range of values is defined for each category of
    130  *  word, to allow for further subdivisions of a category in future releases.
    131  *  Applications should check for tag values falling within the range
    132  *  (lower limit <= tag < upper limit), rather than for single individual values.
    133  *  @stable ICU 2.2
    134 */
    135 typedef enum UWordBreak {
    136     /** Tag value for "words" that do not fit into any of the other categories.
    137      *  Includes spaces and most punctuation. */
    138     UBRK_WORD_NONE           = 0,
    139     /** Upper bound (exclusive) for tags for uncategorized words. */
    140     UBRK_WORD_NONE_LIMIT     = 100,
    141     /** Tag value for words that appear to be numbers, lower limit.    */
    142     UBRK_WORD_NUMBER         = 100,
    143     /** Tag value for words that appear to be numbers, upper limit (exclusive).    */
    144     UBRK_WORD_NUMBER_LIMIT   = 200,
    145     /** Tag value for words that contain letters, excluding
    146      *  hiragana, katakana or ideographic characters, lower limit.    */
    147     UBRK_WORD_LETTER         = 200,
    148     /** Tag value for words containing letters, upper limit (exclusive)  */
    149     UBRK_WORD_LETTER_LIMIT   = 300,
    150     /** Tag value for words containing kana characters, lower limit */
    151     UBRK_WORD_KANA           = 300,
    152     /** Tag value for words containing kana characters, upper limit (exclusive) */
    153     UBRK_WORD_KANA_LIMIT     = 400,
    154     /** Tag value for words containing ideographic characters, lower limit */
    155     UBRK_WORD_IDEO           = 400,
    156     /** Tag value for words containing ideographic characters, upper limit */
    157     UBRK_WORD_IDEO_LIMIT     = 500
    158 } UWordBreak;
    159 
    160 /**
    161  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
    162  *  A range of values is defined for each category of
    163  *  line break, to allow for further subdivisions of a category in future releases.
    164  *  Applications should check for tag values falling within the range, rather
    165  *  than for single individual values.
    166  *  @stable ICU 2.8
    167 */
    168 typedef enum ULineBreakTag {
    169     /** Tag value for soft line breaks, positions at which a line break
    170       *  is acceptable but not required                */
    171     UBRK_LINE_SOFT            = 0,
    172     /** Upper bound (exclusive) for soft line breaks.  */
    173     UBRK_LINE_SOFT_LIMIT      = 100,
    174     /** Tag value for a hard, or mandatory line break  */
    175     UBRK_LINE_HARD            = 100,
    176     /** Upper bound for hard line breaks.              */
    177     UBRK_LINE_HARD_LIMIT      = 200
    178 } ULineBreakTag;
    179 
    180 
    181 
    182 /**
    183  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
    184  *  A range of values is defined for each category of
    185  *  sentence, to allow for further subdivisions of a category in future releases.
    186  *  Applications should check for tag values falling within the range, rather
    187  *  than for single individual values.
    188  *  @stable ICU 2.8
    189 */
    190 typedef enum USentenceBreakTag {
    191     /** Tag value for sentences ending with a sentence terminator
    192       * ('.', '?', '!', etc.) character, possibly followed by a
    193       * hard separator (CR, LF, PS, etc.)
    194       */
    195     UBRK_SENTENCE_TERM       = 0,
    196     /** Upper bound (exclusive) for tags for sentences ended by sentence terminators. */
    197     UBRK_SENTENCE_TERM_LIMIT = 100,
    198     /** Tag value for sentences that do not contain an ending
    199       * sentence terminator ('.', '?', '!', etc.) character, but
    200       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
    201       */
    202     UBRK_SENTENCE_SEP        = 100,
    203     /** Upper bound for tags for sentences ended by a separator.              */
    204     UBRK_SENTENCE_SEP_LIMIT  = 200
    205 
    206 } USentenceBreakTag;
    207 
    208 
    209 /**
    210  * Open a new UBreakIterator for locating text boundaries for a specified locale.
    211  * A UBreakIterator may be used for detecting character, line, word,
    212  * and sentence breaks in text.
    213  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
    214  * UBRK_LINE, UBRK_SENTENCE
    215  * @param locale The locale specifying the text-breaking conventions. Note that
    216  * locale keys such as "lb" and "ss" may be used to modify text break behavior,
    217  * see general discussion of BreakIterator C API.
    218  * @param text The text to be iterated over.
    219  * @param textLength The number of characters in text, or -1 if null-terminated.
    220  * @param status A UErrorCode to receive any errors.
    221  * @return A UBreakIterator for the specified locale.
    222  * @see ubrk_openRules
    223  * @stable ICU 2.0
    224  */
    225 U_STABLE UBreakIterator* U_EXPORT2
    226 ubrk_open(UBreakIteratorType type,
    227       const char *locale,
    228       const UChar *text,
    229       int32_t textLength,
    230       UErrorCode *status);
    231 
    232 /**
    233  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
    234  * The rule syntax is ... (TBD)
    235  * @param rules A set of rules specifying the text breaking conventions.
    236  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
    237  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
    238  *        used to specify the text to be iterated.
    239  * @param textLength The number of characters in text, or -1 if null-terminated.
    240  * @param parseErr   Receives position and context information for any syntax errors
    241  *                   detected while parsing the rules.
    242  * @param status A UErrorCode to receive any errors.
    243  * @return A UBreakIterator for the specified rules.
    244  * @see ubrk_open
    245  * @stable ICU 2.2
    246  */
    247 U_STABLE UBreakIterator* U_EXPORT2
    248 ubrk_openRules(const UChar     *rules,
    249                int32_t         rulesLength,
    250                const UChar     *text,
    251                int32_t          textLength,
    252                UParseError     *parseErr,
    253                UErrorCode      *status);
    254 
    255 /**
    256  * Thread safe cloning operation
    257  * @param bi iterator to be cloned
    258  * @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
    259  *  user allocated space for the new clone. If NULL new memory will be allocated.
    260  *  If buffer is not large enough, new memory will be allocated.
    261  *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.
    262  * @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
    263  *  pointer to size of allocated space.
    264  *  If *pBufferSize == 0, a sufficient size for use in cloning will
    265  *  be returned ('pre-flighting')
    266  *  If *pBufferSize is not enough for a stack-based safe clone,
    267  *  new memory will be allocated.
    268  * @param status to indicate whether the operation went on smoothly or there were errors
    269  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
    270  * @return pointer to the new clone
    271  * @stable ICU 2.0
    272  */
    273 U_STABLE UBreakIterator * U_EXPORT2
    274 ubrk_safeClone(
    275           const UBreakIterator *bi,
    276           void *stackBuffer,
    277           int32_t *pBufferSize,
    278           UErrorCode *status);
    279 
    280 #ifndef U_HIDE_DEPRECATED_API
    281 
    282 /**
    283   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
    284   * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
    285   */
    286 #define U_BRK_SAFECLONE_BUFFERSIZE 1
    287 
    288 #endif /* U_HIDE_DEPRECATED_API */
    289 
    290 /**
    291 * Close a UBreakIterator.
    292 * Once closed, a UBreakIterator may no longer be used.
    293 * @param bi The break iterator to close.
    294  * @stable ICU 2.0
    295 */
    296 U_STABLE void U_EXPORT2
    297 ubrk_close(UBreakIterator *bi);
    298 
    299 #if U_SHOW_CPLUSPLUS_API
    300 
    301 U_NAMESPACE_BEGIN
    302 
    303 /**
    304  * \class LocalUBreakIteratorPointer
    305  * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
    306  * For most methods see the LocalPointerBase base class.
    307  *
    308  * @see LocalPointerBase
    309  * @see LocalPointer
    310  * @stable ICU 4.4
    311  */
    312 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
    313 
    314 U_NAMESPACE_END
    315 
    316 #endif
    317 
    318 /**
    319  * Sets an existing iterator to point to a new piece of text
    320  * @param bi The iterator to use
    321  * @param text The text to be set
    322  * @param textLength The length of the text
    323  * @param status The error code
    324  * @stable ICU 2.0
    325  */
    326 U_STABLE void U_EXPORT2
    327 ubrk_setText(UBreakIterator* bi,
    328              const UChar*    text,
    329              int32_t         textLength,
    330              UErrorCode*     status);
    331 
    332 
    333 /**
    334  * Sets an existing iterator to point to a new piece of text.
    335  *
    336  * All index positions returned by break iterator functions are
    337  * native indices from the UText. For example, when breaking UTF-8
    338  * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
    339  * will be UTF-8 string indices, not UTF-16 positions.
    340  *
    341  * @param bi The iterator to use
    342  * @param text The text to be set.
    343  *             This function makes a shallow clone of the supplied UText.  This means
    344  *             that the caller is free to immediately close or otherwise reuse the
    345  *             UText that was passed as a parameter, but that the underlying text itself
    346  *             must not be altered while being referenced by the break iterator.
    347  * @param status The error code
    348  * @stable ICU 3.4
    349  */
    350 U_STABLE void U_EXPORT2
    351 ubrk_setUText(UBreakIterator* bi,
    352              UText*          text,
    353              UErrorCode*     status);
    354 
    355 
    356 
    357 /**
    358  * Determine the most recently-returned text boundary.
    359  *
    360  * @param bi The break iterator to use.
    361  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
    362  * \ref ubrk_first, or \ref ubrk_last.
    363  * @stable ICU 2.0
    364  */
    365 U_STABLE int32_t U_EXPORT2
    366 ubrk_current(const UBreakIterator *bi);
    367 
    368 /**
    369  * Advance the iterator to the boundary following the current boundary.
    370  *
    371  * @param bi The break iterator to use.
    372  * @return The character index of the next text boundary, or UBRK_DONE
    373  * if all text boundaries have been returned.
    374  * @see ubrk_previous
    375  * @stable ICU 2.0
    376  */
    377 U_STABLE int32_t U_EXPORT2
    378 ubrk_next(UBreakIterator *bi);
    379 
    380 /**
    381  * Set the iterator position to the boundary preceding the current boundary.
    382  *
    383  * @param bi The break iterator to use.
    384  * @return The character index of the preceding text boundary, or UBRK_DONE
    385  * if all text boundaries have been returned.
    386  * @see ubrk_next
    387  * @stable ICU 2.0
    388  */
    389 U_STABLE int32_t U_EXPORT2
    390 ubrk_previous(UBreakIterator *bi);
    391 
    392 /**
    393  * Set the iterator position to zero, the start of the text being scanned.
    394  * @param bi The break iterator to use.
    395  * @return The new iterator position (zero).
    396  * @see ubrk_last
    397  * @stable ICU 2.0
    398  */
    399 U_STABLE int32_t U_EXPORT2
    400 ubrk_first(UBreakIterator *bi);
    401 
    402 /**
    403  * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
    404  * This is not the same as the last character.
    405  * @param bi The break iterator to use.
    406  * @return The character offset immediately <EM>beyond</EM> the last character in the
    407  * text being scanned.
    408  * @see ubrk_first
    409  * @stable ICU 2.0
    410  */
    411 U_STABLE int32_t U_EXPORT2
    412 ubrk_last(UBreakIterator *bi);
    413 
    414 /**
    415  * Set the iterator position to the first boundary preceding the specified offset.
    416  * The new position is always smaller than offset, or UBRK_DONE.
    417  * @param bi The break iterator to use.
    418  * @param offset The offset to begin scanning.
    419  * @return The text boundary preceding offset, or UBRK_DONE.
    420  * @see ubrk_following
    421  * @stable ICU 2.0
    422  */
    423 U_STABLE int32_t U_EXPORT2
    424 ubrk_preceding(UBreakIterator *bi,
    425            int32_t offset);
    426 
    427 /**
    428  * Advance the iterator to the first boundary following the specified offset.
    429  * The value returned is always greater than offset, or UBRK_DONE.
    430  * @param bi The break iterator to use.
    431  * @param offset The offset to begin scanning.
    432  * @return The text boundary following offset, or UBRK_DONE.
    433  * @see ubrk_preceding
    434  * @stable ICU 2.0
    435  */
    436 U_STABLE int32_t U_EXPORT2
    437 ubrk_following(UBreakIterator *bi,
    438            int32_t offset);
    439 
    440 /**
    441 * Get a locale for which text breaking information is available.
    442 * A UBreakIterator in a locale returned by this function will perform the correct
    443 * text breaking for the locale.
    444 * @param index The index of the desired locale.
    445 * @return A locale for which text breaking information is available, or 0 if none.
    446 * @see ubrk_countAvailable
    447 * @stable ICU 2.0
    448 */
    449 U_STABLE const char* U_EXPORT2
    450 ubrk_getAvailable(int32_t index);
    451 
    452 /**
    453 * Determine how many locales have text breaking information available.
    454 * This function is most useful for determining the loop ending condition for
    455 * calls to \ref ubrk_getAvailable.
    456 * @return The number of locales for which text breaking information is available.
    457 * @see ubrk_getAvailable
    458 * @stable ICU 2.0
    459 */
    460 U_STABLE int32_t U_EXPORT2
    461 ubrk_countAvailable(void);
    462 
    463 
    464 /**
    465 * Returns true if the specified position is a boundary position.  As a side
    466 * effect, leaves the iterator pointing to the first boundary position at
    467 * or after "offset".
    468 * @param bi The break iterator to use.
    469 * @param offset the offset to check.
    470 * @return True if "offset" is a boundary position.
    471 * @stable ICU 2.0
    472 */
    473 U_STABLE  UBool U_EXPORT2
    474 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
    475 
    476 /**
    477  * Return the status from the break rule that determined the most recently
    478  * returned break position.  The values appear in the rule source
    479  * within brackets, {123}, for example.  For rules that do not specify a
    480  * status, a default value of 0 is returned.
    481  * <p>
    482  * For word break iterators, the possible values are defined in enum UWordBreak.
    483  * @stable ICU 2.2
    484  */
    485 U_STABLE  int32_t U_EXPORT2
    486 ubrk_getRuleStatus(UBreakIterator *bi);
    487 
    488 /**
    489  * Get the statuses from the break rules that determined the most recently
    490  * returned break position.  The values appear in the rule source
    491  * within brackets, {123}, for example.  The default status value for rules
    492  * that do not explicitly provide one is zero.
    493  * <p>
    494  * For word break iterators, the possible values are defined in enum UWordBreak.
    495  * @param bi        The break iterator to use
    496  * @param fillInVec an array to be filled in with the status values.
    497  * @param capacity  the length of the supplied vector.  A length of zero causes
    498  *                  the function to return the number of status values, in the
    499  *                  normal way, without attempting to store any values.
    500  * @param status    receives error codes.
    501  * @return          The number of rule status values from rules that determined
    502  *                  the most recent boundary returned by the break iterator.
    503  * @stable ICU 3.0
    504  */
    505 U_STABLE  int32_t U_EXPORT2
    506 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
    507 
    508 /**
    509  * Return the locale of the break iterator. You can choose between the valid and
    510  * the actual locale.
    511  * @param bi break iterator
    512  * @param type locale type (valid or actual)
    513  * @param status error code
    514  * @return locale string
    515  * @stable ICU 2.8
    516  */
    517 U_STABLE const char* U_EXPORT2
    518 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
    519 
    520 /**
    521   *  Set the subject text string upon which the break iterator is operating
    522   *  without changing any other aspect of the state.
    523   *  The new and previous text strings must have the same content.
    524   *
    525   *  This function is intended for use in environments where ICU is operating on
    526   *  strings that may move around in memory.  It provides a mechanism for notifying
    527   *  ICU that the string has been relocated, and providing a new UText to access the
    528   *  string in its new position.
    529   *
    530   *  Note that the break iterator never copies the underlying text
    531   *  of a string being processed, but always operates directly on the original text
    532   *  provided by the user. Refreshing simply drops the references to the old text
    533   *  and replaces them with references to the new.
    534   *
    535   *  Caution:  this function is normally used only by very specialized
    536   *            system-level code.   One example use case is with garbage collection
    537   *            that moves the text in memory.
    538   *
    539   * @param bi         The break iterator.
    540   * @param text       The new (moved) text string.
    541   * @param status     Receives errors detected by this function.
    542   *
    543   * @stable ICU 49
    544   */
    545 U_STABLE void U_EXPORT2
    546 ubrk_refreshUText(UBreakIterator *bi,
    547                        UText          *text,
    548                        UErrorCode     *status);
    549 
    550 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    551 
    552 #endif
    553