Home | History | Annotate | Download | only in unicode
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1996-2009, International Business Machines Corporation and others.
      4 * All Rights Reserved.
      5 ******************************************************************************
      6 */
      7 
      8 #ifndef UBRK_H
      9 #define UBRK_H
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uloc.h"
     13 #include "unicode/utext.h"
     14 
     15 /**
     16  * A text-break iterator, for use in C programs.
     17  * The typedef below is guarded by UBRK_TYPEDEF_UBREAK_ITERATOR so that it is emitted only once.
     18  */
     19 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
     20 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
     21     /**
     22      *  Opaque type representing an ICU Break iterator object.
     23      *  @stable ICU 2.0
     24      */
     25     typedef void UBreakIterator;
     26 #endif
     27 
     28 #if !UCONFIG_NO_BREAK_ITERATION
     29 
     30 #include "unicode/parseerr.h"
     31 
     32 /**
     33  * \file
     34  * \brief C API: BreakIterator
     35  *
     36  * <h2> BreakIterator C API </h2>
     37  *
     38  * The BreakIterator C API defines methods for finding the location
     39  * of boundaries in text. A UBreakIterator maintains a
     40  * current position and scans over text, returning the index of characters
     41  * where boundaries occur.
     42  * <p>
     43  * Line boundary analysis determines where a text string can be broken
     44  * when line-wrapping. The mechanism correctly handles punctuation and
     45  * hyphenated words.
     46  * <p>
     47  * Sentence boundary analysis allows selection with correct
     48  * interpretation of periods within numbers and abbreviations, and
     49  * trailing punctuation marks such as quotation marks and parentheses.
     50  * <p>
     51  * Word boundary analysis is used by search and replace functions, as
     52  * well as within text editing applications that allow the user to
     53  * select words with a double click. Word selection provides correct
     54  * interpretation of punctuation marks within and following
     55  * words. Characters that are not part of a word, such as symbols or
     56  * punctuation marks, have word-breaks on both sides.
     57  * <p>
     58  * Character boundary analysis identifies the boundaries of
     59  * "Extended Grapheme Clusters", which are groupings of codepoints
     60  * that should be treated as character-like units for many text operations.
     61  * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
     62  * http://www.unicode.org/reports/tr29/ for additional information
     63  * on grapheme clusters and guidelines on their use.
     64  * <p>
     65  * Title boundary analysis locates all positions,
     66  * typically starts of words, that should be set to Title Case
     67  * when title casing the text.
     68  * <p>
     69  * The text boundary positions are found according to the rules
     70  * described in Unicode Standard Annex #29, Text Boundaries, and
     71  * Unicode Standard Annex #14, Line Breaking Properties.  These
     72  * are available at http://www.unicode.org/reports/tr14/ and
     73  * http://www.unicode.org/reports/tr29/.
     74  * <p>
     75  * In addition to the plain C API defined in this header file, an
     76  * object oriented C++ API with equivalent functionality is defined in the
     77  * file brkiter.h.
     78  * <p>
     79  * Code snippets illustrating the use of the Break Iterator APIs
     80  * are available in the ICU User Guide,
     81  * http://icu-project.org/userguide/boundaryAnalysis.html
     82  * and in the sample program icu/source/samples/break/break.cpp
     83  */
     84 
     85 /** The possible types of text boundaries.  @stable ICU 2.0 */
     86 typedef enum UBreakIteratorType {
     87   /** Character breaks  @stable ICU 2.0 */
     88   UBRK_CHARACTER = 0,
     89   /** Word breaks @stable ICU 2.0 */
     90   UBRK_WORD = 1,
     91   /** Line breaks @stable ICU 2.0 */
     92   UBRK_LINE = 2,
     93   /** Sentence breaks @stable ICU 2.0 */
     94   UBRK_SENTENCE = 3,
     95 
     96 #ifndef U_HIDE_DEPRECATED_API
     97   /**
     98    * Title Case breaks.
     99    * The iterator created using this type locates title boundaries as described for
    100    * Unicode 3.2 only. For title boundary iteration with Unicode 4.0 and above,
    101    * please use the word break iterator (UBRK_WORD).
    102    *
    103    * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
    104    */
    105   UBRK_TITLE = 4,
    106 #endif /* U_HIDE_DEPRECATED_API */
    107   UBRK_COUNT = 5 /**< One more than the highest break type value; remains 5 even when UBRK_TITLE is hidden. */
    108 } UBreakIteratorType;
    109 
    110 /** Value returned by functions such as ubrk_next() and ubrk_previous() to indicate that all text boundaries have been returned.
    111  *  @stable ICU 2.0
    112  */
    113 #define UBRK_DONE ((int32_t) -1)
    114 
    115 
    116 /**
    117  *  Enum constants for the word break tags returned by
    118  *  ubrk_getRuleStatus().  A range of values is defined for each category of
    119  *  word, to allow for further subdivisions of a category in future releases.
    120  *  Applications should check for tag values falling within the range, rather
    121  *  than for single individual values.
    122  *  @stable ICU 2.2
    123 */
    124 typedef enum UWordBreak {
    125     /** Tag value for "words" that do not fit into any of the other categories.
    126      *  Includes spaces and most punctuation. */
    127     UBRK_WORD_NONE           = 0,
    128     /** Upper bound for tags for uncategorized words. */
    129     UBRK_WORD_NONE_LIMIT     = 100,
    130     /** Tag value for words that appear to be numbers, lower limit.    */
    131     UBRK_WORD_NUMBER         = 100,
    132     /** Tag value for words that appear to be numbers, upper limit.    */
    133     UBRK_WORD_NUMBER_LIMIT   = 200,
    134     /** Tag value for words that contain letters, excluding
    135      *  hiragana, katakana or ideographic characters, lower limit.    */
    136     UBRK_WORD_LETTER         = 200,
    137     /** Tag value for words containing letters, upper limit.  */
    138     UBRK_WORD_LETTER_LIMIT   = 300,
    139     /** Tag value for words containing kana characters, lower limit. */
    140     UBRK_WORD_KANA           = 300,
    141     /** Tag value for words containing kana characters, upper limit. */
    142     UBRK_WORD_KANA_LIMIT     = 400,
    143     /** Tag value for words containing ideographic characters, lower limit. */
    144     UBRK_WORD_IDEO           = 400,
    145     /** Tag value for words containing ideographic characters, upper limit. */
    146     UBRK_WORD_IDEO_LIMIT     = 500
    147 } UWordBreak;
    148 
    149 /**
    150  *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
    151  *  A range of values is defined for each category of
    152  *  line break, to allow for further subdivisions of a category in future releases.
    153  *  Applications should check for tag values falling within the range, rather
    154  *  than for single individual values.
    155  *  @stable ICU 2.8
    156 */
    157 typedef enum ULineBreakTag {
    158     /** Tag value for soft line breaks, positions at which a line break
    159       *  is acceptable but not required.               */
    160     UBRK_LINE_SOFT            = 0,
    161     /** Upper bound for soft line breaks.              */
    162     UBRK_LINE_SOFT_LIMIT      = 100,
    163     /** Tag value for a hard, or mandatory, line break. */
    164     UBRK_LINE_HARD            = 100,
    165     /** Upper bound for hard line breaks.              */
    166     UBRK_LINE_HARD_LIMIT      = 200
    167 } ULineBreakTag;
    168 
    169 
    170 
    171 /**
    172  *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
    173  *  A range of values is defined for each category of
    174  *  sentence, to allow for further subdivisions of a category in future releases.
    175  *  Applications should check for tag values falling within the range, rather
    176  *  than for single individual values.
    177  *  @stable ICU 2.8
    178 */
    179 typedef enum USentenceBreakTag {
    180     /** Tag value for sentences ending with a sentence terminator
    181       * ('.', '?', '!', etc.) character, possibly followed by a
    182       * hard separator (CR, LF, PS, etc.)
    183       */
    184     UBRK_SENTENCE_TERM       = 0,
    185     /** Upper bound for tags for sentences ended by sentence terminators.    */
    186     UBRK_SENTENCE_TERM_LIMIT = 100,
    187     /** Tag value for sentences that do not contain an ending
    188       * sentence terminator ('.', '?', '!', etc.) character, but
    189       * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
    190       */
    191     UBRK_SENTENCE_SEP        = 100,
    192     /** Upper bound for tags for sentences ended by a separator.              */
    193     UBRK_SENTENCE_SEP_LIMIT  = 200
    194 
    195 } USentenceBreakTag;
    196 
    197 
    198 /**
    199  * Open a new UBreakIterator for locating text boundaries for a specified locale.
    200  * A UBreakIterator may be used for detecting character, line, word,
    201  * and sentence breaks in text.
    202  * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
    203  * UBRK_LINE, UBRK_SENTENCE.
    204  * @param locale The locale specifying the text-breaking conventions.
    205  * @param text The text to be iterated over.
    206  * @param textLength The number of characters in text, or -1 if null-terminated.
    207  * @param status A UErrorCode to receive any errors.
    208  * @return A UBreakIterator for the specified locale.
    209  * @see ubrk_openRules, ubrk_close
    210  * @stable ICU 2.0
    211  */
    212 U_STABLE UBreakIterator* U_EXPORT2
    213 ubrk_open(UBreakIteratorType type,
    214       const char *locale,
    215       const UChar *text,
    216       int32_t textLength,
    217       UErrorCode *status);
    218 
    219 /**
    220  * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
    221  * The rule syntax is not described in this header (TBD).
    222  * @param rules A set of rules specifying the text breaking conventions.
    223  * @param rulesLength The number of characters in rules, or -1 if null-terminated.
    224  * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
    225  *        used to specify the text to be iterated.
    226  * @param textLength The number of characters in text, or -1 if null-terminated.
    227  * @param parseErr   Receives position and context information for any syntax errors
    228  *                   detected while parsing the rules.
    229  * @param status A UErrorCode to receive any errors.
    230  * @return A UBreakIterator for the specified rules.
    231  * @see ubrk_open, ubrk_setText
    232  * @stable ICU 2.2
    233  */
    234 U_STABLE UBreakIterator* U_EXPORT2
    235 ubrk_openRules(const UChar     *rules,
    236                int32_t         rulesLength,
    237                const UChar     *text,
    238                int32_t          textLength,
    239                UParseError     *parseErr,
    240                UErrorCode      *status);
    241 
    242 /**
    243  * Thread-safe cloning operation.
    244  * @param bi The iterator to be cloned.
    245  * @param stackBuffer User-allocated space for the new clone. If NULL, new memory will be allocated.
    246  *  If the buffer is not large enough, new memory will be allocated.
    247  *  Clients can use a buffer of U_BRK_SAFECLONE_BUFFERSIZE bytes. This will probably be enough to avoid memory allocations.
    248  * @param pBufferSize Pointer to the size of the allocated space.
    249  *  If *pBufferSize == 0, a sufficient size for use in cloning will
    250  *  be returned ('pre-flighting').
    251  *  If *pBufferSize is not enough for a stack-based safe clone,
    252  *  new memory will be allocated.
    253  * @param status Indicates whether the operation succeeded or there were errors.
    254  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
    255  * @return A pointer to the new clone.
    256  * @stable ICU 2.0
    257  */
    258 U_STABLE UBreakIterator * U_EXPORT2
    259 ubrk_safeClone(
    260           const UBreakIterator *bi,
    261           void *stackBuffer,
    262           int32_t *pBufferSize,
    263           UErrorCode *status);
    264 
    265 /**
    266   * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
    267   * @stable ICU 2.0
    268   */
    269 #define U_BRK_SAFECLONE_BUFFERSIZE 512
    270 
    271 /**
    272  * Close a UBreakIterator.
    273  * Once closed, a UBreakIterator may no longer be used.
    274  * @param bi The break iterator to close.
    275  * @stable ICU 2.0
    276  */
    277 U_STABLE void U_EXPORT2
    278 ubrk_close(UBreakIterator *bi);
    279 
    280 /**
    281  * Sets an existing iterator to point to a new piece of text.
    282  * @param bi The iterator to use.
    283  * @param text The text to be set.
    284  * @param textLength The length of the text.
    285  * @param status The error code.
    286  * @stable ICU 2.0
    287  */
    288 U_STABLE void U_EXPORT2
    289 ubrk_setText(UBreakIterator* bi,
    290              const UChar*    text,
    291              int32_t         textLength,
    292              UErrorCode*     status);
    293 
    294 
    295 /**
    296  * Sets an existing iterator to point to a new piece of text, supplied as a UText.
    297  * @param bi The iterator to use.
    298  * @param text The text to be set.
    299  *             This function makes a shallow clone of the supplied UText.  This means
    300  *             that the caller is free to immediately close or otherwise reuse the
    301  *             UText that was passed as a parameter, but that the underlying text itself
    302  *             must not be altered while being referenced by the break iterator.
    303  * @param status The error code.
    304  * @stable ICU 3.4
    305  */
    306 U_STABLE void U_EXPORT2
    307 ubrk_setUText(UBreakIterator* bi,
    308              UText*          text,
    309              UErrorCode*     status);
    310 
    311 
    312 
    313 /**
    314  * Determine the most recently returned text boundary.
    315  *
    316  * @param bi The break iterator to use.
    317  * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
    318  * \ref ubrk_first, or \ref ubrk_last.
    319  * @stable ICU 2.0
    320  */
    321 U_STABLE int32_t U_EXPORT2
    322 ubrk_current(const UBreakIterator *bi);
    323 
    324 /**
    325  * Determine the text boundary following the current text boundary.
    326  *
    327  * @param bi The break iterator to use.
    328  * @return The character index of the next text boundary, or UBRK_DONE
    329  * if all text boundaries have been returned.
    330  * @see ubrk_previous, ubrk_current
    331  * @stable ICU 2.0
    332  */
    333 U_STABLE int32_t U_EXPORT2
    334 ubrk_next(UBreakIterator *bi);
    335 
    336 /**
    337  * Determine the text boundary preceding the current text boundary.
    338  *
    339  * @param bi The break iterator to use.
    340  * @return The character index of the preceding text boundary, or UBRK_DONE
    341  * if all text boundaries have been returned.
    342  * @see ubrk_next, ubrk_current
    343  * @stable ICU 2.0
    344  */
    345 U_STABLE int32_t U_EXPORT2
    346 ubrk_previous(UBreakIterator *bi);
    347 
    348 /**
    349  * Determine the index of the first character in the text being scanned.
    350  * This is not always the same as index 0 of the text.
    351  * @param bi The break iterator to use.
    352  * @return The character index of the first character in the text being scanned.
    353  * @see ubrk_last, ubrk_current
    354  * @stable ICU 2.0
    355  */
    356 U_STABLE int32_t U_EXPORT2
    357 ubrk_first(UBreakIterator *bi);
    358 
    359 /**
    360  * Determine the index immediately <EM>beyond</EM> the last character in the text being
    361  * scanned.
    362  * This is not the same as the index of the last character.
    363  * @param bi The break iterator to use.
    364  * @return The character offset immediately <EM>beyond</EM> the last character in the
    365  * text being scanned.
    366  * @see ubrk_first, ubrk_current
    367  * @stable ICU 2.0
    368  */
    369 U_STABLE int32_t U_EXPORT2
    370 ubrk_last(UBreakIterator *bi);
    371 
    372 /**
    373  * Determine the text boundary preceding the specified offset.
    374  * The value returned is always smaller than offset, or is UBRK_DONE.
    375  * @param bi The break iterator to use.
    376  * @param offset The offset at which to begin scanning.
    377  * @return The text boundary preceding offset, or UBRK_DONE.
    378  * @see ubrk_following
    379  * @stable ICU 2.0
    380  */
    381 U_STABLE int32_t U_EXPORT2
    382 ubrk_preceding(UBreakIterator *bi,
    383            int32_t offset);
    384 
    385 /**
    386  * Determine the text boundary following the specified offset.
    387  * The value returned is always greater than offset, or is UBRK_DONE.
    388  * @param bi The break iterator to use.
    389  * @param offset The offset at which to begin scanning.
    390  * @return The text boundary following offset, or UBRK_DONE.
    391  * @see ubrk_preceding
    392  * @stable ICU 2.0
    393  */
    394 U_STABLE int32_t U_EXPORT2
    395 ubrk_following(UBreakIterator *bi,
    396            int32_t offset);
    397 
    398 /**
    399  * Get a locale for which text breaking information is available.
    400  * A UBreakIterator in a locale returned by this function will perform the correct
    401  * text breaking for the locale.
    402  * @param index The index of the desired locale.
    403  * @return A locale for which text breaking information is available, or 0 if none.
    404  * @see ubrk_countAvailable
    405  * @stable ICU 2.0
    406  */
    407 U_STABLE const char* U_EXPORT2
    408 ubrk_getAvailable(int32_t index);
    409 
    410 /**
    411  * Determine how many locales have text breaking information available.
    412  * This function is most useful for determining the loop ending condition for
    413  * calls to \ref ubrk_getAvailable.
    414  * @return The number of locales for which text breaking information is available.
    415  * @see ubrk_getAvailable
    416  * @stable ICU 2.0
    417  */
    418 U_STABLE int32_t U_EXPORT2
    419 ubrk_countAvailable(void);
    420 
    421 
    422 /**
    423  * Returns true if the specified position is a boundary position.  As a side
    424  * effect, leaves the iterator pointing to the first boundary position at
    425  * or after "offset".
    426  * @param bi The break iterator to use.
    427  * @param offset The offset to check.
    428  * @return True if "offset" is a boundary position.
    429  * @stable ICU 2.0
    430  */
    431 U_STABLE  UBool U_EXPORT2
    432 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
    433 
    434 /**
    435  * Return the status from the break rule that determined the most recently
    436  * returned break position.  The values appear in the rule source within
    437  * brackets, {123}, for example; rules that do not specify a status yield the default value 0.
    438  * Possible values are defined in the enums UWordBreak, ULineBreakTag and USentenceBreakTag.
    439  * @param bi The break iterator to use.
    440  * @return The status value for the most recently returned break position.
    441  * @stable ICU 2.2
    442  */
    443 U_STABLE  int32_t U_EXPORT2
    444 ubrk_getRuleStatus(UBreakIterator *bi);
    445 
    446 /**
    447  * Get the statuses from the break rules that determined the most recently
    448  * returned break position.  The values appear in the rule source
    449  * within brackets, {123}, for example.  The default status value for rules
    450  * that do not explicitly provide one is zero.
    451  * <p>
    452  * For word break iterators, the possible values are defined in enum UWordBreak.
    453  * @param bi        The break iterator to use.
    454  * @param fillInVec An array to be filled in with the status values.
    455  * @param capacity  The length of the supplied vector.  A length of zero causes
    456  *                  the function to return the number of status values, in the
    457  *                  normal way, without attempting to store any values.
    458  * @param status    Receives error codes.
    459  * @return          The number of rule status values from rules that determined
    460  *                  the most recent boundary returned by the break iterator.
    461  * @stable ICU 3.0
    462  */
    463 U_STABLE  int32_t U_EXPORT2
    464 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
    465 
    466 /**
    467  * Return the locale of the break iterator. You can choose between the valid and
    468  * the actual locale.
    469  * @param bi The break iterator to query.
    470  * @param type The locale type (valid or actual).
    471  * @param status The error code.
    472  * @return The locale string.
    473  * @stable ICU 2.8
    474  */
    475 U_STABLE const char* U_EXPORT2
    476 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
    477 
    478 
    479 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    480 
    481 #endif
    482