Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1997-2005, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   Date        Name        Description
      7 *   06/21/00    aliu        Creation.
      8 *******************************************************************************
      9 */
     10 
     11 #ifndef UTRANS_H
     12 #define UTRANS_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_TRANSLITERATION
     17 
     18 #include "unicode/urep.h"
     19 #include "unicode/parseerr.h"
     20 #include "unicode/uenum.h"
     21 
     22 /********************************************************************
     23  * General Notes
     24  ********************************************************************
     25  */
     26 /**
     27  * \file
     28  * \brief C API: Transliterator
     29  *
     30  * <h2> Transliteration </h2>
     31  * The data structures and functions described in this header provide
     32  * transliteration services.  Transliteration services are implemented
     33  * as C++ classes.  The comments and documentation in this header
     34  * assume the reader is familiar with the C++ headers translit.h and
     35  * associated documentation.
     36  *
     37  * A significant but incomplete subset of the C++ transliteration
     38  * services is available to C code through this header.  In order to
     39  * access more complex transliteration services, refer to the C++
     40  * headers and documentation.
     41  *
     42  * There are two sets of functions for working with transliterator IDs:
     43  *
     44  * An old, deprecated set uses char * IDs, which works for true and pure
     45  * identifiers that these APIs were designed for,
     46  * for example "Cyrillic-Latin".
     47  * It does not work when the ID contains filters ("[:Script=Cyrl:]")
     48  * or even a complete set of rules because then the ID string contains more
     49  * than just "invariant" characters (see utypes.h).
     50  *
     51  * A new set of functions replaces the old ones and uses UChar * IDs,
     52  * paralleling the UnicodeString IDs in the C++ API. (New in ICU 2.8.)
     53  */
     54 
     55 /********************************************************************
     56  * Data Structures
     57  ********************************************************************/
     58 
     59 /**
     60  * An opaque transliterator for use in C.  Open with utrans_openxxx()
     61  * and close with utrans_close() when done.  Equivalent to the C++ class
     62  * Transliterator and its subclasses.
     63  * @see Transliterator
     64  * @stable ICU 2.0
     65  */
     66 typedef void* UTransliterator;
     67 
     68 /**
     69  * Direction constant indicating the direction in a transliterator,
     70  * e.g., the forward or reverse rules of a RuleBasedTransliterator.
     71  * Specified when a transliterator is opened.  An "A-B" transliterator
     72  * transliterates A to B when operating in the forward direction, and
     73  * B to A when operating in the reverse direction.
     74  * @stable ICU 2.0
     75  */
     76 typedef enum UTransDirection {
     77 
     78     /**
     79      * UTRANS_FORWARD means from &lt;source&gt; to &lt;target&gt; for a
     80      * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
     81      * opened using a rule, it means forward direction rules, e.g.,
     82      * "A > B".
     83      */
     84     UTRANS_FORWARD,
     85 
     86     /**
     87      * UTRANS_REVERSE means from &lt;target&gt; to &lt;source&gt; for a
     88      * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
     89      * opened using a rule, it means reverse direction rules, e.g.,
     90      * "A < B".
     91      */
     92     UTRANS_REVERSE
     93 
     94 } UTransDirection;
     95 
     96 /**
     97  * Position structure for utrans_transIncremental() incremental
     98  * transliteration.  This structure defines two substrings of the text
     99  * being transliterated.  The first region, [contextStart,
    100  * contextLimit), defines what characters the transliterator will read
    101  * as context.  The second region, [start, limit), defines what
    102  * characters will actually be transliterated.  The second region
    103  * should be a subset of the first.
    104  *
    105  * <p>After a transliteration operation, some of the indices in this
    106  * structure will be modified.  See the field descriptions for
    107  * details.
    108  *
    109  * <p>contextStart <= start <= limit <= contextLimit
    110  *
    111  * <p>Note: All index values in this structure must be at code point
    112  * boundaries.  That is, none of them may occur between two code units
    113  * of a surrogate pair.  If any index does split a surrogate pair,
    114  * results are unspecified.
    115  *
    116  * @stable ICU 2.0
    117  */
    118 typedef struct UTransPosition {
    119 
    120     /**
    121      * Beginning index, inclusive, of the context to be considered for
    122      * a transliteration operation.  The transliterator will ignore
    123      * anything before this index.  INPUT/OUTPUT parameter: This parameter
    124      * is updated by a transliteration operation to reflect the maximum
    125      * amount of antecontext needed by a transliterator.
    126      * @stable ICU 2.4
    127      */
    128     int32_t contextStart;
    129 
    130     /**
    131      * Ending index, exclusive, of the context to be considered for a
    132      * transliteration operation.  The transliterator will ignore
    133      * anything at or after this index.  INPUT/OUTPUT parameter: This
    134      * parameter is updated to reflect changes in the length of the
    135      * text, but points to the same logical position in the text.
    136      * @stable ICU 2.4
    137      */
    138     int32_t contextLimit;
    139 
    140     /**
    141      * Beginning index, inclusive, of the text to be transliterated.
    142      * INPUT/OUTPUT parameter: This parameter is advanced past
    143      * characters that have already been transliterated by a
    144      * transliteration operation.
    145      * @stable ICU 2.4
    146      */
    147     int32_t start;
    148 
    149     /**
    150      * Ending index, exclusive, of the text to be transliterated.
    151      * INPUT/OUTPUT parameter: This parameter is updated to reflect
    152      * changes in the length of the text, but points to the same
    153      * logical position in the text.
    154      * @stable ICU 2.4
    155      */
    156     int32_t limit;
    157 
    158 } UTransPosition;
    159 
    160 /********************************************************************
    161  * General API
    162  ********************************************************************/
    163 
    164 /**
    165  * Open a custom transliterator, given a custom rules string
    166  * OR
    167  * a system transliterator, given its ID.
    168  * Any non-NULL result from this function should later be closed with
    169  * utrans_close().
    170  *
    171  * @param id a valid transliterator ID
    172  * @param idLength the length of the ID string, or -1 if NUL-terminated
    173  * @param dir the desired direction
    174  * @param rules the transliterator rules.  See the C++ header rbt.h for
    175  *              rules syntax. If NULL then a system transliterator matching
    176  *              the ID is returned.
    177  * @param rulesLength the length of the rules, or -1 if the rules
    178  *                    are NUL-terminated.
    179  * @param parseError a pointer to a UParseError struct to receive the details
    180  *                   of any parsing errors. This parameter may be NULL if no
    181  *                   parsing error details are desired.
    182  * @param pErrorCode a pointer to the UErrorCode
    183  * @return a transliterator pointer that may be passed to other
    184  *         utrans_xxx() functions, or NULL if the open call fails.
    185  * @stable ICU 2.8
    186  */
    187 U_STABLE UTransliterator* U_EXPORT2
    188 utrans_openU(const UChar *id,
    189              int32_t idLength,
    190              UTransDirection dir,
    191              const UChar *rules,
    192              int32_t rulesLength,
    193              UParseError *parseError,
    194              UErrorCode *pErrorCode);
    195 
    196 /**
    197  * Open an inverse of an existing transliterator.  For this to work,
    198  * the inverse must be registered with the system.  For example, if
    199  * the Transliterator "A-B" is opened, and then its inverse is opened,
    200  * the result is the Transliterator "B-A", if such a transliterator is
    201  * registered with the system.  Otherwise the result is NULL and a
    202  * failing UErrorCode is set.  Any non-NULL result from this function
    203  * should later be closed with utrans_close().
    204  *
    205  * @param trans the transliterator to open the inverse of.
    206  * @param status a pointer to the UErrorCode
    207  * @return a pointer to a newly-opened transliterator that is the
    208  * inverse of trans, or NULL if the open call fails.
    209  * @stable ICU 2.0
    210  */
    211 U_STABLE UTransliterator* U_EXPORT2
    212 utrans_openInverse(const UTransliterator* trans,
    213                    UErrorCode* status);
    214 
    215 /**
    216  * Create a copy of a transliterator.  Any non-NULL result from this
    217  * function should later be closed with utrans_close().
    218  *
    219  * @param trans the transliterator to be copied.
    220  * @param status a pointer to the UErrorCode
    221  * @return a transliterator pointer that may be passed to other
    222  * utrans_xxx() functions, or NULL if the clone call fails.
    223  * @stable ICU 2.0
    224  */
    225 U_STABLE UTransliterator* U_EXPORT2
    226 utrans_clone(const UTransliterator* trans,
    227              UErrorCode* status);
    228 
    229 /**
    230  * Close a transliterator.  Any non-NULL pointer returned by
    231  * utrans_openXxx() or utrans_clone() should eventually be closed.
    232  * @param trans the transliterator to be closed.
    233  * @stable ICU 2.0
    234  */
    235 U_STABLE void U_EXPORT2
    236 utrans_close(UTransliterator* trans);
    237 
    238 /**
    239  * Return the programmatic identifier for this transliterator.
    240  * If this identifier is passed to utrans_openU(), it will open
    241  * a transliterator equivalent to this one, if the ID has been
    242  * registered.
    243  *
    244  * @param trans the transliterator to return the ID of.
    245  * @param resultLength pointer to an output variable receiving the length
    246  *        of the ID string; can be NULL
    247  * @return the NUL-terminated ID string. This pointer remains
    248  * valid until utrans_close() is called on this transliterator.
    249  *
    250  * @stable ICU 2.8
    251  */
    252 U_STABLE const UChar * U_EXPORT2
    253 utrans_getUnicodeID(const UTransliterator *trans,
    254                     int32_t *resultLength);
    255 
    256 /**
    257  * Register an open transliterator with the system.  When
    258  * utrans_open() is called with an ID string that is equal to that
    259  * returned by utrans_getID(adoptedTrans,...), then
    260  * utrans_clone(adoptedTrans,...) is returned.
    261  *
    262  * <p>NOTE: After this call the system owns the adoptedTrans and will
    263  * close it.  The user must not call utrans_close() on adoptedTrans.
    264  *
    265  * @param adoptedTrans a transliterator, typically one opened from a
    266  * rules string with utrans_openU(), to be registered with the system.
    267  * @param status a pointer to the UErrorCode
    268  * @stable ICU 2.0
    269  */
    270 U_STABLE void U_EXPORT2
    271 utrans_register(UTransliterator* adoptedTrans,
    272                 UErrorCode* status);
    273 
    274 /**
    275  * Unregister a transliterator from the system.  After this call the
    276  * system will no longer recognize the given ID when passed to
    277  * utrans_open(). If the ID is invalid then nothing is done.
    278  *
    279  * @param id an ID to unregister
    280  * @param idLength the length of id, or -1 if id is zero-terminated
    281  * @stable ICU 2.8
    282  */
    283 U_STABLE void U_EXPORT2
    284 utrans_unregisterID(const UChar* id, int32_t idLength);
    285 
    286 /**
    287  * Set the filter used by a transliterator.  A filter can be used to
    288  * make the transliterator pass certain characters through untouched.
    289  * The filter is expressed using a UnicodeSet pattern.  If the
    290  * filterPattern is NULL or the empty string, then the transliterator
    291  * will be reset to use no filter.
    292  *
    293  * @param trans the transliterator
    294  * @param filterPattern a pattern string, in the form accepted by
    295  * UnicodeSet, specifying which characters to apply the
    296  * transliteration to.  May be NULL or the empty string to indicate no
    297  * filter.
    298  * @param filterPatternLen the length of filterPattern, or -1 if
    299  * filterPattern is zero-terminated
    300  * @param status a pointer to the UErrorCode
    301  * @see UnicodeSet
    302  * @stable ICU 2.0
    303  */
    304 U_STABLE void U_EXPORT2
    305 utrans_setFilter(UTransliterator* trans,
    306                  const UChar* filterPattern,
    307                  int32_t filterPatternLen,
    308                  UErrorCode* status);
    309 
    310 /**
    311  * Return the number of system transliterators.
    312  * It is recommended to use utrans_openIDs() instead.
    313  *
    314  * @return the number of system transliterators.
    315  * @stable ICU 2.0
    316  */
    317 U_STABLE int32_t U_EXPORT2
    318 utrans_countAvailableIDs(void);
    319 
    320 /**
    321  * Return a UEnumeration for the available transliterators.
    322  *
    323  * @param pErrorCode Pointer to the UErrorCode in/out parameter.
    324  * @return UEnumeration for the available transliterators.
    325  *         Close with uenum_close().
    326  *
    327  * @stable ICU 2.8
    328  */
    329 U_STABLE UEnumeration * U_EXPORT2
    330 utrans_openIDs(UErrorCode *pErrorCode);
    331 
    332 /********************************************************************
    333  * Transliteration API
    334  ********************************************************************/
    335 
    336 /**
    337  * Transliterate a segment of a UReplaceable string.  The string is
    338  * passed in as a UReplaceable pointer rep and a UReplaceableCallbacks
    339  * function pointer struct repFunc.  Functions in the repFunc struct
    340  * will be called in order to modify the rep string.
    341  *
    342  * @param trans the transliterator
    343  * @param rep a pointer to the string.  This will be passed to the
    344  * repFunc functions.
    345  * @param repFunc a set of function pointers that will be used to
    346  * modify the string pointed to by rep.
    347  * @param start the beginning index, inclusive; <code>0 <= start <=
    348  * limit</code>.
    349  * @param limit pointer to the ending index, exclusive; <code>start <=
    350  * limit <= repFunc->length(rep)</code>.  Upon return, *limit will
    351  * contain the new limit index.  The text previously occupying
    352  * <code>[start, limit)</code> has been transliterated, possibly to a
    353  * string of a different length, at <code>[start,
    354  * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
    355  * is the value stored in *limit (the function itself returns void).
    356  * @param status a pointer to the UErrorCode
    357  * @stable ICU 2.0
    358  */
    359 U_STABLE void U_EXPORT2
    360 utrans_trans(const UTransliterator* trans,
    361              UReplaceable* rep,
    362              UReplaceableCallbacks* repFunc,
    363              int32_t start,
    364              int32_t* limit,
    365              UErrorCode* status);
    366 
    367 /**
    368  * Transliterate the portion of the UReplaceable text buffer that can
    369  * be transliterated unambiguously.  This method is typically called
    370  * after new text has been inserted, e.g. as a result of a keyboard
    371  * event.  The transliterator will try to transliterate characters of
    372  * <code>rep</code> between <code>index.cursor</code> and
    373  * <code>index.limit</code>.  Characters before
    374  * <code>index.cursor</code> will not be changed.  Here <code>index</code>
    375  * denotes the <code>pos</code> argument and <code>text</code> denotes <code>rep</code>.
    376  * <p>Upon return, values in <code>index</code> will be updated.
    377  * <code>index.start</code> will be advanced to the first
    378  * character that future calls to this method will read.
    379  * <code>index.cursor</code> and <code>index.limit</code> will
    380  * be adjusted to delimit the range of text that future calls to
    381  * this method may change.
    382  *
    383  * <p>Typical usage of this method begins with an initial call
    384  * with <code>index.start</code> and <code>index.limit</code>
    385  * set to indicate the portion of <code>text</code> to be
    386  * transliterated, and <code>index.cursor == index.start</code>.
    387  * Thereafter, <code>index</code> can be used without
    388  * modification in future calls, provided that all changes to
    389  * <code>text</code> are made via this method.
    390  *
    391  * <p>This method assumes that future calls may be made that will
    392  * insert new text into the buffer.  As a result, it only performs
    393  * unambiguous transliterations.  After the last call to this method,
    394  * there may be untransliterated text that is waiting for more input
    395  * to resolve an ambiguity.  In order to perform these pending
    396  * transliterations, clients should call utrans_trans() with a start
    397  * of index.start and a limit of index.end after the last call to this
    398  * method has been made.
    399  *
    400  * @param trans the transliterator
    401  * @param rep a pointer to the string.  This will be passed to the
    402  * repFunc functions.
    403  * @param repFunc a set of function pointers that will be used to
    404  * modify the string pointed to by rep.
    405  * @param pos a struct containing the start and limit indices of the
    406  * text to be read and the text to be transliterated.  NOTE(review): the names cursor and end used above are not UTransPosition fields; confirm the intended fields.
    407  * @param status a pointer to the UErrorCode
    408  * @stable ICU 2.0
    409  */
    410 U_STABLE void U_EXPORT2
    411 utrans_transIncremental(const UTransliterator* trans,
    412                         UReplaceable* rep,
    413                         UReplaceableCallbacks* repFunc,
    414                         UTransPosition* pos,
    415                         UErrorCode* status);
    416 
    417 /**
    418  * Transliterate a segment of a UChar* string.  The string is passed
    419  * in as a UChar* buffer.  The string is modified in place.  If the
    420  * result is longer than textCapacity, it is truncated.  The actual
    421  * length of the result is returned in *textLength, if textLength is
    422  * non-NULL. *textLength may be greater than textCapacity, but only
    423  * textCapacity UChars will be written to *text, including the zero
    424  * terminator.
    425  *
    426  * @param trans the transliterator
    427  * @param text a pointer to a buffer containing the text to be
    428  * transliterated on input and the result text on output.
    429  * @param textLength a pointer to the length of the string in text.
    430  * If the length is -1 then the string is assumed to be
    431  * zero-terminated.  Upon return, the new length is stored in
    432  * *textLength.  If textLength is NULL then the string is assumed to
    433  * be zero-terminated.
    434  * @param textCapacity the capacity of the text buffer, in UChars;
    435  * passed by value, so it is not modified upon return.
    436  * @param start the beginning index, inclusive; <code>0 <= start <=
    437  * limit</code>.
    438  * @param limit pointer to the ending index, exclusive; <code>start <=
    439  * limit <= (length of the string in text)</code>.  Upon return, *limit will
    440  * contain the new limit index.  The text previously occupying
    441  * <code>[start, limit)</code> has been transliterated, possibly to a
    442  * string of a different length, at <code>[start,
    443  * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
    444  * is the value stored in *limit (the function itself returns void).
    445  * @param status a pointer to the UErrorCode
    446  * @stable ICU 2.0
    447  */
    448 U_STABLE void U_EXPORT2
    449 utrans_transUChars(const UTransliterator* trans,
    450                    UChar* text,
    451                    int32_t* textLength,
    452                    int32_t textCapacity,
    453                    int32_t start,
    454                    int32_t* limit,
    455                    UErrorCode* status);
    456 
    457 /**
    458  * Transliterate the portion of the UChar* text buffer that can be
    459  * transliterated unambiguously.  See utrans_transIncremental().  The
    460  * string is passed in as a UChar* buffer.  The string is modified in
    461  * place.  If the result is longer than textCapacity, it is truncated.
    462  * The actual length of the result is returned in *textLength, if
    463  * textLength is non-NULL. *textLength may be greater than
    464  * textCapacity, but only textCapacity UChars will be written to
    465  * *text, including the zero terminator.  See utrans_transIncremental()
    466  * for usage details.
    467  *
    468  * @param trans the transliterator
    469  * @param text a pointer to a buffer containing the text to be
    470  * transliterated on input and the result text on output.
    471  * @param textLength a pointer to the length of the string in text.
    472  * If the length is -1 then the string is assumed to be
    473  * zero-terminated.  Upon return, the new length is stored in
    474  * *textLength.  If textLength is NULL then the string is assumed to
    475  * be zero-terminated.
    476  * @param textCapacity the length of the text buffer
    477  * @param pos a struct containing the start and limit indices of the
    478  * text to be read and the text to be transliterated
    479  * @param status a pointer to the UErrorCode
    480  * @see utrans_transIncremental
    481  * @stable ICU 2.0
    482  */
    483 U_STABLE void U_EXPORT2
    484 utrans_transIncrementalUChars(const UTransliterator* trans,
    485                               UChar* text,
    486                               int32_t* textLength,
    487                               int32_t textCapacity,
    488                               UTransPosition* pos,
    489                               UErrorCode* status);
    490 
    491 /* deprecated API ----------------------------------------------------------- */
    492 
    493 /* see the General Notes at the top of this file for why these functions are deprecated */
    494 
    495 /**
    496  * Deprecated, use utrans_openU() instead.
    497  * Open a custom transliterator, given a custom rules string
    498  * OR
    499  * a system transliterator, given its ID.
    500  * Any non-NULL result from this function should later be closed with
    501  * utrans_close().
    502  *
    503  * @param id a valid ID, as returned by utrans_getAvailableID()
    504  * @param dir the desired direction
    505  * @param rules the transliterator rules.  See the C++ header rbt.h
    506  * for rules syntax. If NULL then a system transliterator matching
    507  * the ID is returned.
    508  * @param rulesLength the length of the rules, or -1 if the rules
    509  * are zero-terminated.
    510  * @param parseError a pointer to a UParseError struct to receive the
    511  * details of any parsing errors. This parameter may be NULL if no
    512  * parsing error details are desired.
    513  * @param status a pointer to the UErrorCode
    514  * @return a transliterator pointer that may be passed to other
    515  * utrans_xxx() functions, or NULL if the open call fails.
    516  * @deprecated ICU 2.8 Use utrans_openU() instead, see utrans.h
    517  */
    518 U_DEPRECATED UTransliterator* U_EXPORT2
    519 utrans_open(const char* id,
    520             UTransDirection dir,
    521             const UChar* rules,         /* may be Null */
    522             int32_t rulesLength,        /* -1 if null-terminated */
    523             UParseError* parseError,    /* may be Null */
    524             UErrorCode* status);
    525 
    526 /**
    527  * Deprecated, use utrans_getUnicodeID() instead.
    528  * Return the programmatic identifier for this transliterator.
    529  * If this identifier is passed to utrans_open(), it will open
    530  * a transliterator equivalent to this one, if the ID has been
    531  * registered.
    532  * @param trans the transliterator to return the ID of.
    533  * @param buf the buffer in which to receive the ID.  This may be
    534  * NULL, in which case no characters are copied.
    535  * @param bufCapacity the capacity of the buffer.  Ignored if buf is
    536  * NULL.
    537  * @return the actual length of the ID, not including
    538  * zero-termination.  This may be greater than bufCapacity.
    539  * @deprecated ICU 2.8 Use utrans_getUnicodeID() instead, see utrans.h
    540  */
    541 U_DEPRECATED int32_t U_EXPORT2
    542 utrans_getID(const UTransliterator* trans,
    543              char* buf,
    544              int32_t bufCapacity);
    545 
    546 /**
    547  * Deprecated, use utrans_unregisterID() instead.
    548  * Unregister a transliterator from the system.  After this call the
    549  * system will no longer recognize the given ID when passed to
    550  * utrans_open().  If the id is invalid then nothing is done.
    551  *
    552  * @param id a zero-terminated ID
    553  * @deprecated ICU 2.8 Use utrans_unregisterID() instead, see utrans.h
    554  */
    555 U_DEPRECATED void U_EXPORT2
    556 utrans_unregister(const char* id);
    557 
    558 /**
    559  * Deprecated, use utrans_openIDs() instead.
    560  * Return the ID of the index-th system transliterator.  The result
    561  * is placed in the given buffer.  If the given buffer is too small,
    562  * the initial substring is copied to buf.  The result in buf is
    563  * always zero-terminated.
    564  *
    565  * @param index the number of the transliterator to return.  Must
    566  * satisfy 0 <= index < utrans_countAvailableIDs().  If index is out
    567  * of range then it is treated as if it were 0.
    568  * @param buf the buffer in which to receive the ID.  This may be
    569  * NULL, in which case no characters are copied.
    570  * @param bufCapacity the capacity of the buffer.  Ignored if buf is
    571  * NULL.
    572  * @return the actual length of the index-th ID, not including
    573  * zero-termination.  This may be greater than bufCapacity.
    574  * @deprecated ICU 2.8 Use utrans_openIDs() instead, see utrans.h
    575  */
    576 U_DEPRECATED int32_t U_EXPORT2
    577 utrans_getAvailableID(int32_t index,
    578                       char* buf,
    579                       int32_t bufCapacity);
    580 
    581 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    582 
    583 #endif
    584