Home | History | Annotate | Download | only in unicode
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ushape.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2000jun29
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #ifndef __USHAPE_H__
     18 #define __USHAPE_H__
     19 
     20 #include "unicode/utypes.h"
     21 
     22 /**
     23  * \file
     24  * \brief C API:  Arabic shaping
     25  *
     26  */
     27 
     28 /**
     29  * Shape Arabic text on a character basis.
     30  *
     31  * <p>This function performs basic operations for "shaping" Arabic text. It is most
     32  * useful for use with legacy data formats and legacy display technology
     33  * (simple terminals). All operations are performed on Unicode characters.</p>
     34  *
     35  * <p>Text-based shaping means that some character code points in the text are
     36  * replaced by others depending on the context. It transforms one kind of text
     37  * into another. In comparison, modern displays for Arabic text select
     38  * appropriate, context-dependent font glyphs for each text element, which means
     39  * that they transform text into a glyph vector.</p>
     40  *
     41  * <p>Text transformations are necessary when modern display technology is not
     42  * available or when text needs to be transformed to or from legacy formats that
     43  * use "shaped" characters. Since the Arabic script is cursive, connecting
     44  * adjacent letters to each other, computers select images for each letter based
     45  * on the surrounding letters. This usually results in four images per Arabic
     46  * letter: initial, middle, final, and isolated forms. In Unicode, on the other
     47  * hand, letters are normally stored abstractly, and a display system is expected
     48  * to select the necessary glyphs. (This makes searching and other text
     49  * processing easier because the same letter has only one code.) It is possible
     50  * to mimic this with text transformations because there are characters in
     51  * Unicode that are rendered as letters with a specific shape
     52  * (or cursive connectivity). They were included for interoperability with
     53  * legacy systems and codepages, and for unsophisticated display systems.</p>
     54  *
     55  * <p>A second kind of text transformation is supported for Arabic digits:
     56  * For compatibility with legacy codepages that only include European digits,
     57  * it is possible to replace one set of digits by another, changing the
     58  * character code points. These operations can be performed for either
     59  * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic
     60  * digits (U+06f0...U+06f9).</p>
     61  *
     62  * <p>Some replacements may result in more or fewer characters (code points).
     63  * By default, this means that the destination buffer may receive text with a
     64  * length different from the source length. Some legacy systems rely on the
     65  * length of the text to be constant. They expect extra spaces to be added
     66  * or consumed either next to the affected character or at the end of the
     67  * text.</p>
     68  *
     69  * <p>For details about the available operations, see the description of the
     70  * <code>U_SHAPE_...</code> options.</p>
     71  *
     72  * @param source The input text.
     73  *
     74  * @param sourceLength The number of UChars in <code>source</code>.
     75  *
     76  * @param dest The destination buffer that will receive the results of the
     77  *             requested operations. It may be <code>NULL</code> only if
     78  *             <code>destSize</code> is 0. The source and destination must not
     79  *             overlap.
     80  *
     81  * @param destSize The size (capacity) of the destination buffer in UChars.
     82  *                 If <code>destSize</code> is 0, then no output is produced,
     83  *                 but the necessary buffer size is returned ("preflighting").
     84  *
     85  * @param options This is a 32-bit set of flags that specify the operations
     86  *                that are performed on the input text. If no error occurs,
     87  *                then the result will always be written to the destination
     88  *                buffer.
     89  *
     90  * @param pErrorCode must be a valid pointer to an error code value,
     91  *        which must not indicate a failure before the function call.
     92  *
     93  * @return The number of UChars written to the destination buffer.
     94  *         If an error occurred, then no output was written, or it may be
     95  *         incomplete. If <code>U_BUFFER_OVERFLOW_ERROR</code> is set, then
     96  *         the return value indicates the necessary destination buffer size.
     97  * @stable ICU 2.0
     98  */
     99 U_STABLE int32_t U_EXPORT2
    100 u_shapeArabic(const UChar *source, int32_t sourceLength,
    101               UChar *dest, int32_t destSize,
    102               uint32_t options,
    103               UErrorCode *pErrorCode);
    104 
    105 /**
    106  * Memory option: allow the result to have a different length than the source.
    107  * Affects: LamAlef options
    108  * @stable ICU 2.0
    109  */
    110 #define U_SHAPE_LENGTH_GROW_SHRINK              0
    111 
    112 /**
    113  * Memory option: allow the result to have a different length than the source.
    114  * Affects: LamAlef options
    115  * This option is an alias to U_SHAPE_LENGTH_GROW_SHRINK
    116  * @stable ICU 4.2
    117  */
    118 #define U_SHAPE_LAMALEF_RESIZE                  0
    119 
    120 /**
    121  * Memory option: the result must have the same length as the source.
    122  * If more room is necessary, then try to consume spaces next to modified characters.
    123  * @stable ICU 2.0
    124  */
    125 #define U_SHAPE_LENGTH_FIXED_SPACES_NEAR        1
    126 
    127 /**
    128  * Memory option: the result must have the same length as the source.
    129  * If more room is necessary, then try to consume spaces next to modified characters.
    130  * Affects: LamAlef options
    131  * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_NEAR
    132  * @stable ICU 4.2
    133  */
    134 #define U_SHAPE_LAMALEF_NEAR                    1
    135 
    136 /**
    137  * Memory option: the result must have the same length as the source.
    138  * If more room is necessary, then try to consume spaces at the end of the text.
    139  * @stable ICU 2.0
    140  */
    141 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_END      2
    142 
    143 /**
    144  * Memory option: the result must have the same length as the source.
    145  * If more room is necessary, then try to consume spaces at the end of the text.
    146  * Affects: LamAlef options
    147  * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_END
    148  * @stable ICU 4.2
    149  */
    150 #define U_SHAPE_LAMALEF_END                     2
    151 
    152 /**
    153  * Memory option: the result must have the same length as the source.
    154  * If more room is necessary, then try to consume spaces at the beginning of the text.
    155  * @stable ICU 2.0
    156  */
    157 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3
    158 
    159 /**
    160  * Memory option: the result must have the same length as the source.
    161  * If more room is necessary, then try to consume spaces at the beginning of the text.
    162  * Affects: LamAlef options
    163  * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
    164  * @stable ICU 4.2
    165  */
    166 #define U_SHAPE_LAMALEF_BEGIN                    3
    167 
    168 
    169 /**
    170  * Memory option: the result must have the same length as the source.
    171  * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end.
    172  *               If there is no space at end, use spaces at beginning of the buffer. If there
    173  *               is no space at beginning of the buffer, use the space near the character (i.e. the
    174  *               space after the LAMALEF character).
    175  *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
    176  *               will be set in pErrorCode
    177  *
    178  * Deshaping Mode: Performs the same function as the U_SHAPE_LAMALEF_END flag.
    179  * Affects: LamAlef options
    180  * @stable ICU 4.2
    181  */
    182 #define U_SHAPE_LAMALEF_AUTO                     0x10000
    183 
    184 /** Bit mask for memory options. @stable ICU 2.0 */
    185 #define U_SHAPE_LENGTH_MASK                      0x10003 /* Changed old value 3 */
    186 
    187 
    188 /**
    189  * Bit mask for LamAlef memory options.
    190  * @stable ICU 4.2
    191  */
    192 #define U_SHAPE_LAMALEF_MASK                     0x10003 /* updated */
    193 
    194 /** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */
    195 #define U_SHAPE_TEXT_DIRECTION_LOGICAL          0
    196 
    197 /**
    198  * Direction indicator:
    199  * the source is in visual RTL order,
    200  * the rightmost displayed character stored first.
    201  * This option is an alias to U_SHAPE_TEXT_DIRECTION_LOGICAL
    202  * @stable ICU 4.2
    203  */
    204 #define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL       0
    205 
    206 /**
    207  * Direction indicator:
    208  * the source is in visual LTR order,
    209  * the leftmost displayed character stored first.
    210  * @stable ICU 2.0
    211  */
    212 #define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR       4
    213 
    214 /** Bit mask for direction indicators. @stable ICU 2.0 */
    215 #define U_SHAPE_TEXT_DIRECTION_MASK             4
    216 
    217 
    218 /** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */
    219 #define U_SHAPE_LETTERS_NOOP                    0
    220 
    221 /** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */
    222 #define U_SHAPE_LETTERS_SHAPE                   8
    223 
    224 /** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */
    225 #define U_SHAPE_LETTERS_UNSHAPE                 0x10
    226 
    227 /**
    228  * Letter shaping option: replace abstract letter characters by "shaped" ones.
    229  * The only difference with U_SHAPE_LETTERS_SHAPE is that Tashkeel letters
    230  * are always "shaped" into the isolated form instead of the medial form
    231  * (selecting code points from the Arabic Presentation Forms-B block).
    232  * @stable ICU 2.0
    233  */
    234 #define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18
    235 
    236 
    237 /** Bit mask for letter shaping options. @stable ICU 2.0 */
    238 #define U_SHAPE_LETTERS_MASK                        0x18
    239 
    240 
    241 /** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */
    242 #define U_SHAPE_DIGITS_NOOP                     0
    243 
    244 /**
    245  * Digit shaping option:
    246  * Replace European digits (U+0030...) by Arabic-Indic digits.
    247  * @stable ICU 2.0
    248  */
    249 #define U_SHAPE_DIGITS_EN2AN                    0x20
    250 
    251 /**
    252  * Digit shaping option:
    253  * Replace Arabic-Indic digits by European digits (U+0030...).
    254  * @stable ICU 2.0
    255  */
    256 #define U_SHAPE_DIGITS_AN2EN                    0x40
    257 
    258 /**
    259  * Digit shaping option:
    260  * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent
    261  * strongly directional character is an Arabic letter
    262  * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
    263  * The direction of "preceding" depends on the direction indicator option.
    264  * For the first characters, the preceding strongly directional character
    265  * (initial state) is assumed to be not an Arabic letter
    266  * (it is <code>U_LEFT_TO_RIGHT</code> [L] or <code>U_RIGHT_TO_LEFT</code> [R]).
    267  * @stable ICU 2.0
    268  */
    269 #define U_SHAPE_DIGITS_ALEN2AN_INIT_LR          0x60
    270 
    271 /**
    272  * Digit shaping option:
    273  * Replace European digits (U+0030...) by Arabic-Indic digits if the most recent
    274  * strongly directional character is an Arabic letter
    275  * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
    276  * The direction of "preceding" depends on the direction indicator option.
    277  * For the first characters, the preceding strongly directional character
    278  * (initial state) is assumed to be an Arabic letter.
    279  * @stable ICU 2.0
    280  */
    281 #define U_SHAPE_DIGITS_ALEN2AN_INIT_AL          0x80
    282 
    283 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
    284 #define U_SHAPE_DIGITS_RESERVED                 0xa0
    285 
    286 /** Bit mask for digit shaping options. @stable ICU 2.0 */
    287 #define U_SHAPE_DIGITS_MASK                     0xe0
    288 
    289 
    290 /** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */
    291 #define U_SHAPE_DIGIT_TYPE_AN                   0
    292 
    293 /** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */
    294 #define U_SHAPE_DIGIT_TYPE_AN_EXTENDED          0x100
    295 
    296 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
    297 #define U_SHAPE_DIGIT_TYPE_RESERVED             0x200
    298 
    299 /** Bit mask for digit type options. @stable ICU 2.0 */
    300 #define U_SHAPE_DIGIT_TYPE_MASK                 0x300 /* I need to change this from 0x3f00 to 0x300 */
    301 
    302 /**
    303  * Tashkeel aggregation option:
    304  * Replaces any combination of U+0651 with one of
    305  * U+064C, U+064D, U+064E, U+064F, U+0650 with
    306  * U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively.
    307  * @stable ICU 3.6
    308  */
    309 #define U_SHAPE_AGGREGATE_TASHKEEL              0x4000
    310 /** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */
    311 #define U_SHAPE_AGGREGATE_TASHKEEL_NOOP         0
    312 /** Bit mask for tashkeel aggregation. @stable ICU 3.6 */
    313 #define U_SHAPE_AGGREGATE_TASHKEEL_MASK         0x4000
    314 
    315 /**
    316  * Presentation form option:
    317  * Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B
    318  * characters with U+06xx characters, before shaping.
    319  * @stable ICU 3.6
    320  */
    321 #define U_SHAPE_PRESERVE_PRESENTATION           0x8000
    322 /** Presentation form option:
    323  * Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with
    324  * their unshaped counterparts in range U+06xx, before shaping.
    325  * @stable ICU 3.6
    326  */
    327 #define U_SHAPE_PRESERVE_PRESENTATION_NOOP      0
    328 /** Bit mask for preserve presentation form. @stable ICU 3.6 */
    329 #define U_SHAPE_PRESERVE_PRESENTATION_MASK      0x8000
    330 
    331 /* Seen Tail option */
    332 /**
    333  * Memory option: the result must have the same length as the source.
    334  * Shaping mode: The SEEN family character will expand into two characters using space near
    335  *               the SEEN family character (i.e. the space after the character).
    336  *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
    337  *               will be set in pErrorCode
    338  *
    339  * De-shaping mode: Any Seen character followed by Tail character will be
    340  *                  replaced by one cell Seen and a space will replace the Tail.
    341  * Affects: Seen options
    342  * @stable ICU 4.2
    343  */
    344 #define U_SHAPE_SEEN_TWOCELL_NEAR     0x200000
    345 
    346 /**
    347  * Bit mask for Seen memory options.
    348  * @stable ICU 4.2
    349  */
    350 #define U_SHAPE_SEEN_MASK             0x700000
    351 
    352 /* YehHamza option */
    353 /**
    354  * Memory option: the result must have the same length as the source.
    355  * Shaping mode: The YEHHAMZA character will expand into two characters using space near it
    356  *              (i.e. the space after the character).
    357  *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
    358  *               will be set in pErrorCode
    359  *
    360  * De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be
    361  *                  replaced by one cell YehHamza and space will replace the Hamza.
    362  * Affects: YehHamza options
    363  * @stable ICU 4.2
    364  */
    365 #define U_SHAPE_YEHHAMZA_TWOCELL_NEAR      0x1000000
    366 
    367 
    368 /**
    369  * Bit mask for YehHamza memory options.
    370  * @stable ICU 4.2
    371  */
    372 #define U_SHAPE_YEHHAMZA_MASK              0x3800000
    373 
    374 /* New Tashkeel options */
    375 /**
    376  * Memory option: the result must have the same length as the source.
    377  * Shaping mode: Tashkeel characters will be replaced by spaces.
    378  *               Spaces will be placed at beginning of the buffer
    379  *
    380  * De-shaping mode: N/A
    381  * Affects: Tashkeel options
    382  * @stable ICU 4.2
    383  */
    384 #define U_SHAPE_TASHKEEL_BEGIN                      0x40000
    385 
    386 /**
    387  * Memory option: the result must have the same length as the source.
    388  * Shaping mode: Tashkeel characters will be replaced by spaces.
    389  *               Spaces will be placed at end of the buffer
    390  *
    391  * De-shaping mode: N/A
    392  * Affects: Tashkeel options
    393  * @stable ICU 4.2
    394  */
    395 #define U_SHAPE_TASHKEEL_END                        0x60000
    396 
    397 /**
    398  * Memory option: allow the result to have a different length than the source.
    399  * Shaping mode: Tashkeel characters will be removed, buffer length will shrink.
    400  * De-shaping mode: N/A
    401  *
    402  * Affects: Tashkeel options
    403  * @stable ICU 4.2
    404  */
    405 #define U_SHAPE_TASHKEEL_RESIZE                     0x80000
    406 
    407 /**
    408  * Memory option: the result must have the same length as the source.
    409  * Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent
    410  *               characters (i.e. shaped on Tatweel) or replaced by space if it is not connected.
    411  *
    412  * De-shaping mode: N/A
    413  * Affects: Tashkeel options
    414  * @stable ICU 4.2
    415  */
    416 #define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL         0xC0000
    417 
    418 /**
    419  * Bit mask for Tashkeel replacement with Space or Tatweel memory options.
    420  * @stable ICU 4.2
    421  */
    422 #define U_SHAPE_TASHKEEL_MASK                       0xE0000
    423 
    424 
    425 /* Space location Control options */
    426 /**
    427  * This option affects the meaning of the BEGIN and END options. If this option is not used, the default
    428  * for BEGIN and END will be as follows:
    429  * The Default (for Visual LTR, Visual RTL and Logical text alike)
    430  *           1. BEGIN always refers to the start address of physical memory.
    431  *           2. END always refers to the end address of physical memory.
    432  *
    433  * If this option is used it will swap the meaning of BEGIN and END only for Visual LTR text.
    434  *
    435  * The effect on BEGIN and END Memory Options will be as follows:
    436  *    A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text
    437  *       (corresponding to the physical memory address end for Visual LTR text, same as END in
    438  *       default behavior).
    439  *    B. BEGIN For Logical text: Same as BEGIN in default behavior.
    440  *    C. END For Visual LTR text: This will be the end (left side) of the visual text (corresponding
    441  *       to the physical memory address beginning for Visual LTR text, same as BEGIN in default behavior).
    442  *    D. END For Logical text: Same as END in default behavior.
    443  * Affects: All LamAlef BEGIN, END and AUTO options.
    444  * @stable ICU 4.2
    445  */
    446 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000
    447 
    448 /**
    449  * Bit mask for swapping BEGIN and END for Visual LTR text
    450  * @stable ICU 4.2
    451  */
    452 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK      0x4000000
    453 
    454 /**
    455  * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73).
    456  * If this option is not specified (Default), old unofficial Unicode TAIL code point is used (i.e. 0x200B)
    457  * De-shaping will not use this option as it will always search for either the new Unicode code point for the
    458  * TAIL (i.e. 0xFE73) or the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the
    459  * Seen-Family letter accordingly.
    460  *
    461  * Shaping Mode: Only shaping.
    462  * De-shaping Mode: N/A.
    463  * Affects: All Seen options
    464  * @draft ICU 4.2
    465  */
    466 #define SHAPE_TAIL_NEW_UNICODE        0x8000000
    467 
    468 /**
    469  * Bit mask for new Unicode Tail option
    470  * @draft ICU 4.2
    471  */
    472 #define SHAPE_TAIL_TYPE_MASK          0x8000000
    473 
    474 #endif
    475