Home | History | Annotate | Download | only in unicode
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1996-2013, International Business Machines Corporation and others.
      4 * All Rights Reserved.
      5 ******************************************************************************
      6 */
      7 
      8 #ifndef UBRK_H
      9 #define UBRK_H
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uloc.h"
     13 #include "unicode/utext.h"
     14 #include "unicode/localpointer.h"
     15 
     16 /**
     17  * A text-break iterator, for use in C programs.
     18  * The typedef below is skipped when UBRK_TYPEDEF_UBREAK_ITERATOR is already defined.
     19  */
     20 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
     21 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
     22     /**
     23      *  Opaque type representing an ICU break iterator object (only the struct tag is declared here).
     24      *  @stable ICU 2.0
     25      */
     26     typedef struct UBreakIterator UBreakIterator;
     27 #endif
     28 
     29 #if !UCONFIG_NO_BREAK_ITERATION
     30 
     31 #include "unicode/parseerr.h"
     32 
     33 /**
     34  * \file
     35  * \brief C API: BreakIterator
     36  *
     37  * <h2> BreakIterator C API </h2>
     38  *
     39  * The BreakIterator C API defines functions for finding the location
     40  * of boundaries in text. A UBreakIterator maintains a
     41  * current position and scans over text, returning the index of characters
     42  * where boundaries occur.
     43  * <p>
     44  * Line boundary analysis determines where a text string can be broken
     45  * when line-wrapping. The mechanism correctly handles punctuation and
     46  * hyphenated words.
     47  * <p>
     48  * Sentence boundary analysis allows selection with correct
     49  * interpretation of periods within numbers and abbreviations, and
     50  * trailing punctuation marks such as quotation marks and parentheses.
     51  * <p>
     52  * Word boundary analysis is used by search and replace functions, as
     53  * well as within text editing applications that allow the user to
     54  * select words with a double click. Word selection provides correct
     55  * interpretation of punctuation marks within and following
     56  * words. Characters that are not part of a word, such as symbols or
     57  * punctuation marks, have word-breaks on both sides.
     58  * <p>
     59  * Character boundary analysis identifies the boundaries of
     60  * "Extended Grapheme Clusters", which are groupings of codepoints
     61  * that should be treated as character-like units for many text operations.
     62  * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
     63  * http://www.unicode.org/reports/tr29/ for additional information
     64  * on grapheme clusters and guidelines on their use.
     65  * <p>
     66  * Title boundary analysis locates all positions,
     67  * typically starts of words, that should be set to Title Case
     68  * when title casing the text.
     69  * <p>
     70  * The text boundary positions are found according to the rules
     71  * described in Unicode Standard Annex #29, Unicode Text Segmentation, and
     72  * Unicode Standard Annex #14, Line Breaking Properties.  These
     73  * are available at http://www.unicode.org/reports/tr29/ and
     74  * http://www.unicode.org/reports/tr14/ respectively.
     75  * <p>
     76  * In addition to the plain C API defined in this header file, an
     77  * object oriented C++ API with equivalent functionality is defined in the
     78  * file brkiter.h.
     79  * <p>
     80  * Code snippets illustrating the use of the Break Iterator APIs
     81  * are available in the ICU User Guide,
     82  * http://icu-project.org/userguide/boundaryAnalysis.html
     83  * and in the sample program icu/source/samples/break/break.cpp
     84  */
     85 
     86 /** The possible types of text boundaries, used to select the kind of iterator opened by ubrk_open().  @stable ICU 2.0 */
     87 typedef enum UBreakIteratorType {
     88   /** Character breaks  @stable ICU 2.0 */
     89   UBRK_CHARACTER = 0,
     90   /** Word breaks @stable ICU 2.0 */
     91   UBRK_WORD = 1,
     92   /** Line breaks @stable ICU 2.0 */
     93   UBRK_LINE = 2,
     94   /** Sentence breaks @stable ICU 2.0 */
     95   UBRK_SENTENCE = 3,
     96 
     97 #ifndef U_HIDE_DEPRECATED_API
     98   /**
     99    * Title Case breaks.
    100    * The iterator created using this type locates title boundaries as described for
    101    * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
    102    * please use the word boundary iterator.
    103    *
    104    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
    105    */
    106   UBRK_TITLE = 4,
    107 #endif /* U_HIDE_DEPRECATED_API */
    108   UBRK_COUNT = 5 /**< One more than the highest type value; stays 5 even when UBRK_TITLE is hidden. */
    109 } UBreakIteratorType;
    110 
    111 /** Value returned by the iteration functions (e.g. ubrk_next(), ubrk_previous())
    112  *  to indicate that all text boundaries have been returned.  @stable ICU 2.0
    113  */
    114 #define UBRK_DONE ((int32_t) -1)
    115 
    116 
    117 /**
    118  *  Enum constants for the word break tags returned by
    119  *  ubrk_getRuleStatus().  A range of values is defined for each category of
    120  *  word, to allow for further subdivisions of a category in future releases.
    121  *  Applications should check for tag values falling within the range, rather
    122  *  than for single individual values.  Each _LIMIT value coincides with the lower bound of the category that follows it, if any.
    123  *  @stable ICU 2.2
    124 */
    125 typedef enum UWordBreak {
    126     /** Tag value for "words" that do not fit into any of the other categories.
    127      *  Includes spaces and most punctuation. */
    128     UBRK_WORD_NONE           = 0,
    129     /** Upper bound for tags for uncategorized words. */
    130     UBRK_WORD_NONE_LIMIT     = 100,
    131     /** Tag value for words that appear to be numbers, lower limit.    */
    132     UBRK_WORD_NUMBER         = 100,
    133     /** Tag value for words that appear to be numbers, upper limit.    */
    134     UBRK_WORD_NUMBER_LIMIT   = 200,
    135     /** Tag value for words that contain letters, excluding
    136      *  hiragana, katakana or ideographic characters, lower limit.    */
    137     UBRK_WORD_LETTER         = 200,
    138     /** Tag value for words containing letters, upper limit  */
    139     UBRK_WORD_LETTER_LIMIT   = 300,
    140     /** Tag value for words containing kana characters, lower limit */
    141     UBRK_WORD_KANA           = 300,
    142     /** Tag value for words containing kana characters, upper limit */
    143     UBRK_WORD_KANA_LIMIT     = 400,
    144     /** Tag value for words containing ideographic characters, lower limit */
    145     UBRK_WORD_IDEO           = 400,
    146     /** Tag value for words containing ideographic characters, upper limit */
    147     UBRK_WORD_IDEO_LIMIT     = 500
    148 } UWordBreak;
    149 
    150 /**
    151  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
    152  *  A range of values is defined for each category of
    153  *  line break, to allow for further subdivisions of a category in future releases.
    154  *  Applications should check for tag values falling within the range, rather
    155  *  than for single individual values.
    156  *  @stable ICU 2.8
    157 */
    158 typedef enum ULineBreakTag {
    159     /** Tag value for soft line breaks, positions at which a line break
    160       *  is acceptable but not required                */
    161     UBRK_LINE_SOFT            = 0,
    162     /** Upper bound for soft line breaks.              */
    163     UBRK_LINE_SOFT_LIMIT      = 100,
    164     /** Tag value for a hard, or mandatory line break  */
    165     UBRK_LINE_HARD            = 100,
    166     /** Upper bound for hard line breaks.              */
    167     UBRK_LINE_HARD_LIMIT      = 200
    168 } ULineBreakTag;
    169 
    170 
    171 
    172 /**
    173  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
    174  *  A range of values is defined for each category of
    175  *  sentence, to allow for further subdivisions of a category in future releases.
    176  *  Applications should check for tag values falling within the range, rather
    177  *  than for single individual values.
    178  *  @stable ICU 2.8
    179 */
    180 typedef enum USentenceBreakTag {
    181     /** Tag value for sentences ending with a sentence terminator
    182       * ('.', '?', '!', etc.) character, possibly followed by a
    183       * hard separator (CR, LF, PS, etc.)
    184       */
    185     UBRK_SENTENCE_TERM       = 0,
    186     /** Upper bound for tags for sentences ended by sentence terminators.    */
    187     UBRK_SENTENCE_TERM_LIMIT = 100,
    188     /** Tag value for sentences that do not contain an ending
    189       * sentence terminator ('.', '?', '!', etc.) character, but
    190       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
    191       */
    192     UBRK_SENTENCE_SEP        = 100,
    193     /** Upper bound for tags for sentences ended by a separator.              */
    194     UBRK_SENTENCE_SEP_LIMIT  = 200
    195     /* No further sentence break tags are defined. */
    196 } USentenceBreakTag;
    197 
    198 
    199 /**
    200  * Open a new UBreakIterator for locating text boundaries for a specified locale.
    201  * A UBreakIterator may be used for detecting character, line, word,
    202  * and sentence breaks in text.
    203  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
    204  * UBRK_LINE, UBRK_SENTENCE (see UBreakIteratorType; UBRK_TITLE is deprecated).
    205  * @param locale The locale specifying the text-breaking conventions.
    206  * @param text The text to be iterated over.
    207  * @param textLength The number of characters in text, or -1 if null-terminated.
    208  * @param status A UErrorCode to receive any errors.
    209  * @return A UBreakIterator for the specified locale.
    210  * @see ubrk_openRules, ubrk_close
    211  * @stable ICU 2.0
    212  */
    213 U_STABLE UBreakIterator* U_EXPORT2
    214 ubrk_open(UBreakIteratorType type,
    215       const char *locale,
    216       const UChar *text,
    217       int32_t textLength,
    218       UErrorCode *status);
    219 
    220 /**
    221  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
    222  * The rule syntax is not described in this header.
    223  * @param rules A set of rules specifying the text breaking conventions.
    224  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
    225  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
    226  *        used to specify the text to be iterated.
    227  * @param textLength The number of characters in text, or -1 if null-terminated.
    228  * @param parseErr   Receives position and context information for any syntax errors
    229  *                   detected while parsing the rules.
    230  * @param status A UErrorCode to receive any errors.
    231  * @return A UBreakIterator for the specified rules.
    232  * @see ubrk_open, ubrk_close
    233  * @stable ICU 2.2
    234  */
    235 U_STABLE UBreakIterator* U_EXPORT2
    236 ubrk_openRules(const UChar     *rules,
    237                int32_t         rulesLength,
    238                const UChar     *text,
    239                int32_t          textLength,
    240                UParseError     *parseErr,
    241                UErrorCode      *status);
    242 
    243 /**
    244  * Thread-safe cloning operation.
    245  * @param bi iterator to be cloned
    246  * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
    247  *  If buffer is not large enough, new memory will be allocated.
    248  *  Clients can use U_BRK_SAFECLONE_BUFFERSIZE as the buffer size. This will probably be enough to avoid memory allocations.
    249  * @param pBufferSize pointer to size of allocated space.
    250  *  If *pBufferSize == 0, a sufficient size for use in cloning will
    251  *  be returned ('pre-flighting')
    252  *  If *pBufferSize is not enough for a stack-based safe clone,
    253  *  new memory will be allocated.
    254  * @param status to indicate whether the operation went on smoothly or there were errors
    255  *  An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were necessary.
    256  * @return pointer to the new clone
    257  * @stable ICU 2.0
    258  */
    259 U_STABLE UBreakIterator * U_EXPORT2
    260 ubrk_safeClone(
    261           const UBreakIterator *bi,
    262           void *stackBuffer,
    263           int32_t *pBufferSize,
    264           UErrorCode *status);
    265 
    266 /**
    267   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
    268   * @stable ICU 2.0
    269   */
    270 #define U_BRK_SAFECLONE_BUFFERSIZE 528
    271 
    272 /**
    273  * Close a UBreakIterator.
    274  * Once closed, a UBreakIterator may no longer be used.
    275  * @param bi The break iterator to close.
    276  * @stable ICU 2.0
    277  */
    278 U_STABLE void U_EXPORT2
    279 ubrk_close(UBreakIterator *bi);
    280 
    281 #if U_SHOW_CPLUSPLUS_API
    282 
    283 U_NAMESPACE_BEGIN
    284 
    285 /**
    286  * \class LocalUBreakIteratorPointer
    287  * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
    288  * For most methods see the LocalPointerBase base class.
    289  * Only declared when U_SHOW_CPLUSPLUS_API is enabled.
    290  * @see LocalPointerBase
    291  * @see LocalPointer
    292  * @stable ICU 4.4
    293  */
    294 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
    295 
    296 U_NAMESPACE_END
    297 
    298 #endif /* U_SHOW_CPLUSPLUS_API */
    299 
    300 /**
    301  * Sets an existing iterator to point to a new piece of UChar text; see ubrk_setUText() for UText input.
    302  * @param bi The iterator to use.
    303  * @param text The text to be set.
    304  * @param textLength The length of the text.
    305  * @param status A UErrorCode to receive any errors.
    306  * @stable ICU 2.0
    307  */
    308 U_STABLE void U_EXPORT2
    309 ubrk_setText(UBreakIterator* bi,
    310              const UChar*    text,
    311              int32_t         textLength,
    312              UErrorCode*     status);
    313 
    314 
    315 /**
    316  * Sets an existing iterator to point to a new piece of text, supplied as a UText.
    317  *
    318  * All index positions returned by break iterator functions are
    319  * native indices from the UText. For example, when breaking UTF-8
    320  * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
    321  * will be UTF-8 string indices, not UTF-16 positions.
    322  *
    323  * @param bi The iterator to use.
    324  * @param text The text to be set.
    325  *             This function makes a shallow clone of the supplied UText.  This means
    326  *             that the caller is free to immediately close or otherwise reuse the
    327  *             UText that was passed as a parameter, but that the underlying text itself
    328  *             must not be altered while being referenced by the break iterator.
    329  * @param status A UErrorCode to receive any errors.
    330  * @stable ICU 3.4
    331  */
    332 U_STABLE void U_EXPORT2
    333 ubrk_setUText(UBreakIterator* bi,
    334              UText*          text,
    335              UErrorCode*     status);
    336 
    337 
    338 
    339 /**
    340  * Determine the most recently returned text boundary.
    341  * When the text was set with ubrk_setUText(), the index is a native UText index.
    342  * @param bi The break iterator to use.
    343  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
    344  * \ref ubrk_first, or \ref ubrk_last.
    345  * @stable ICU 2.0
    346  */
    347 U_STABLE int32_t U_EXPORT2
    348 ubrk_current(const UBreakIterator *bi);
    349 
    350 /**
    351  * Advance the iterator to the boundary following the current boundary.
    352  *
    353  * @param bi The break iterator to use.
    354  * @return The character index of the next text boundary, or UBRK_DONE
    355  * if all text boundaries have been returned.
    356  * @see ubrk_previous, ubrk_following
    357  * @stable ICU 2.0
    358  */
    359 U_STABLE int32_t U_EXPORT2
    360 ubrk_next(UBreakIterator *bi);
    361 
    362 /**
    363  * Set the iterator position to the boundary preceding the current boundary.
    364  *
    365  * @param bi The break iterator to use.
    366  * @return The character index of the preceding text boundary, or UBRK_DONE
    367  * if all text boundaries have been returned.
    368  * @see ubrk_next, ubrk_preceding
    369  * @stable ICU 2.0
    370  */
    371 U_STABLE int32_t U_EXPORT2
    372 ubrk_previous(UBreakIterator *bi);
    373 
    374 /**
    375  * Set the iterator position to the index of the first character in the text being scanned.
    376  * This is not always the same as index 0 of the text.
    377  * @param bi The break iterator to use.
    378  * @return The character index of the first character in the text being scanned; ubrk_current() reports the same index afterwards.
    379  * @see ubrk_last
    380  * @stable ICU 2.0
    381  */
    382 U_STABLE int32_t U_EXPORT2
    383 ubrk_first(UBreakIterator *bi);
    384 
    385 /**
    386  * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
    387  * This is not the same as the index of the last character.
    388  * @param bi The break iterator to use.
    389  * @return The character offset immediately <EM>beyond</EM> the last character in the
    390  * text being scanned.
    391  * @see ubrk_first
    392  * @stable ICU 2.0
    393  */
    394 U_STABLE int32_t U_EXPORT2
    395 ubrk_last(UBreakIterator *bi);
    396 
    397 /**
    398  * Set the iterator position to the first boundary preceding the specified offset.
    399  * The new position is always smaller than offset, or is UBRK_DONE.
    400  * @param bi The break iterator to use.
    401  * @param offset The offset at which to begin scanning.
    402  * @return The text boundary preceding offset, or UBRK_DONE.
    403  * @see ubrk_following
    404  * @stable ICU 2.0
    405  */
    406 U_STABLE int32_t U_EXPORT2
    407 ubrk_preceding(UBreakIterator *bi,
    408            int32_t offset);
    409 
    410 /**
    411  * Advance the iterator to the first boundary following the specified offset.
    412  * The value returned is always greater than offset, or is UBRK_DONE.
    413  * @param bi The break iterator to use.
    414  * @param offset The offset at which to begin scanning.
    415  * @return The text boundary following offset, or UBRK_DONE.
    416  * @see ubrk_preceding
    417  * @stable ICU 2.0
    418  */
    419 U_STABLE int32_t U_EXPORT2
    420 ubrk_following(UBreakIterator *bi,
    421            int32_t offset);
    422 
    423 /**
    424 * Get a locale for which text breaking information is available.
    425 * A UBreakIterator in a locale returned by this function will perform the correct
    426 * text breaking for the locale.
    427 * @param index The index of the desired locale.
    428 * @return A locale for which text breaking information is available, or 0 if none.
    429 * @see ubrk_countAvailable
    430 * @stable ICU 2.0
    431 */
    432 U_STABLE const char* U_EXPORT2
    433 ubrk_getAvailable(int32_t index);
    434 
    435 /**
    436 * Determine how many locales have text breaking information available.
    437 * This function is most useful for determining the loop ending condition for
    438 * calls to \ref ubrk_getAvailable.
    439 * @return The number of locales for which text breaking information is available.
    440 * @see ubrk_getAvailable
    441 * @stable ICU 2.0
    442 */
    443 U_STABLE int32_t U_EXPORT2
    444 ubrk_countAvailable(void);
    445 
    446 
    447 /**
    448 * Returns true if the specified position is a boundary position.  As a side
    449 * effect, leaves the iterator pointing to the first boundary position at
    450 * or after "offset".
    451 * @param bi The break iterator to use.
    452 * @param offset The offset to check.
    453 * @return True if "offset" is a boundary position.
    454 * @stable ICU 2.0
    455 */
    456 U_STABLE  UBool U_EXPORT2
    457 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
    458 
    459 /**
    460  * Return the status from the break rule that determined the most recently
    461  * returned break position.  The values appear in the rule source within brackets,
    462  * {123}, for example.  For rules that do not specify a status, a default value of 0 is returned.
    463  * For word break iterators, the possible values are defined in enum UWordBreak; see also ULineBreakTag and USentenceBreakTag.
    464  * @param bi The break iterator to use.
    465  * @return The status value from the rule that determined the most recently returned break position.
    466  * @stable ICU 2.2
    467  */
    468 U_STABLE  int32_t U_EXPORT2
    469 ubrk_getRuleStatus(UBreakIterator *bi);
    470 
    471 /**
    472  * Get the statuses from the break rules that determined the most recently
    473  * returned break position.  The values appear in the rule source
    474  * within brackets, {123}, for example.  The default status value for rules
    475  * that do not explicitly provide one is zero.
    476  * <p>
    477  * For word break iterators, the possible values are defined in enum UWordBreak.
    478  * @param bi        The break iterator to use
    479  * @param fillInVec an array to be filled in with the status values.
    480  * @param capacity  the length of the supplied vector.  A length of zero causes
    481  *                  the function to return the number of status values, in the
    482  *                  normal way, without attempting to store any values.
    483  * @param status    receives error codes.
    484  * @return          The number of rule status values from rules that determined
    485  *                  the most recent boundary returned by the break iterator.
    486  * @stable ICU 3.0
    487  */
    488 U_STABLE  int32_t U_EXPORT2
    489 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
    490 
    491 /**
    492  * Return the locale of the break iterator. You can choose between the valid and
    493  * the actual locale.
    494  * @param bi The break iterator to query.
    495  * @param type The locale type (valid or actual), given as a ULocDataLocaleType.
    496  * @param status A UErrorCode to receive any errors.
    497  * @return The locale string.
    498  * @stable ICU 2.8
    499  */
    500 U_STABLE const char* U_EXPORT2
    501 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
    502 
    503 /**
    504   *  Set the subject text string upon which the break iterator is operating
    505   *  without changing any other aspect of the state.
    506   *  The new and previous text strings must have the same content.
    507   *
    508   *  This function is intended for use in environments where ICU is operating on
    509   *  strings that may move around in memory.  It provides a mechanism for notifying
    510   *  ICU that the string has been relocated, and providing a new UText to access the
    511   *  string in its new position.
    512   *
    513   *  Note that the break iterator never copies the underlying text
    514   *  of a string being processed, but always operates directly on the original text
    515   *  provided by the user. Refreshing simply drops the references to the old text
    516   *  and replaces them with references to the new.
    517   *
    518   *  Caution:  this function is normally used only by very specialized
    519   *            system-level code.   One example use case is with garbage collection
    520   *            that moves the text in memory.
    521   *
    522   * @param bi         The break iterator.
    523   * @param text       The new (moved) text string.
    524   * @param status     Receives errors detected by this function.
    525   * @see ubrk_setUText
    526   * @stable ICU 49
    527   */
    528 U_STABLE void U_EXPORT2
    529 ubrk_refreshUText(UBreakIterator *bi,
    530                        UText          *text,
    531                        UErrorCode     *status);
    532 
    533 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    534 
    535 #endif
    536