Home | History | Annotate | Download | only in unicode
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ushape.h
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jun29
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __USHAPE_H__
     20 #define __USHAPE_H__
     21 
     22 #include "unicode/utypes.h"
     23 
     24 /**
     25  * \file
     26  * \brief C API:  Arabic shaping
     27  *
     28  */
     29 
     30 /**
     31  * Shape Arabic text on a character basis.
     32  *
     33  * <p>This function performs basic operations for "shaping" Arabic text. It is most
     34  * useful for use with legacy data formats and legacy display technology
     35  * (simple terminals). All operations are performed on Unicode characters.</p>
     36  *
     37  * <p>Text-based shaping means that some character code points in the text are
     38  * replaced by others depending on the context. It transforms one kind of text
     39  * into another. In comparison, modern displays for Arabic text select
     40  * appropriate, context-dependent font glyphs for each text element, which means
     41  * that they transform text into a glyph vector.</p>
     42  *
     43  * <p>Text transformations are necessary when modern display technology is not
     44  * available or when text needs to be transformed to or from legacy formats that
     45  * use "shaped" characters. Since the Arabic script is cursive, connecting
     46  * adjacent letters to each other, computers select images for each letter based
     47  * on the surrounding letters. This usually results in four images per Arabic
     48  * letter: initial, middle, final, and isolated forms. In Unicode, on the other
     49  * hand, letters are normally stored abstract, and a display system is expected
     50  * to select the necessary glyphs. (This makes searching and other text
     51  * processing easier because the same letter has only one code.) It is possible
     52  * to mimic this with text transformations because there are characters in
     53  * Unicode that are rendered as letters with a specific shape
     54  * (or cursive connectivity). They were included for interoperability with
     55  * legacy systems and codepages, and for unsophisticated display systems.</p>
     56  *
     57  * <p>A second kind of text transformation is supported for Arabic digits:
     58  * For compatibility with legacy codepages that only include European digits,
     59  * it is possible to replace one set of digits by another, changing the
     60  * character code points. These operations can be performed for either
     61  * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic
     62  * digits (U+06f0...U+06f9).</p>
     63  *
     64  * <p>Some replacements may result in more or fewer characters (code points).
     65  * By default, this means that the destination buffer may receive text with a
     66  * length different from the source length. Some legacy systems rely on the
     67  * length of the text to be constant. They expect extra spaces to be added
     68  * or consumed either next to the affected character or at the end of the
     69  * text.</p>
     70  *
     71  * <p>For details about the available operations, see the description of the
     72  * <code>U_SHAPE_...</code> options.</p>
     73  *
     74  * @param source The input text.
     75  *
     76  * @param sourceLength The number of UChars in <code>source</code>.
     77  *
     78  * @param dest The destination buffer that will receive the results of the
     79  *             requested operations. It may be <code>NULL</code> only if
     80  *             <code>destSize</code> is 0. The source and destination must not
     81  *             overlap.
     82  *
     83  * @param destSize The size (capacity) of the destination buffer in UChars.
     84  *                 If <code>destSize</code> is 0, then no output is produced,
     85  *                 but the necessary buffer size is returned ("preflighting").
     86  *
     87  * @param options This is a 32-bit set of flags that specify the operations
     88  *                that are performed on the input text. If no error occurs,
     89  *                then the result will always be written to the destination
     90  *                buffer.
     91  *
     92  * @param pErrorCode must be a valid pointer to an error code value,
     93  *        which must not indicate a failure before the function call.
     94  *
     95  * @return The number of UChars written to the destination buffer.
     96  *         If an error occurred, then no output was written, or it may be
     97  *         incomplete. If <code>U_BUFFER_OVERFLOW_ERROR</code> is set, then
     98  *         the return value indicates the necessary destination buffer size.
     99  * @stable ICU 2.0
    100  */
    101 U_STABLE int32_t U_EXPORT2
    102 u_shapeArabic(const UChar *source, int32_t sourceLength,
    103               UChar *dest, int32_t destSize,
    104               uint32_t options,
    105               UErrorCode *pErrorCode);
    106 
    107 /**
    108  * Memory option: allow the result to have a different length than the source.
    109  * Affects: LamAlef options
    110  * @stable ICU 2.0
    111  */
    112 #define U_SHAPE_LENGTH_GROW_SHRINK              0
    113 
    114 /**
    115  * Memory option: allow the result to have a different length than the source.
    116  * Affects: LamAlef options
    117  * This option is an alias to U_SHAPE_LENGTH_GROW_SHRINK
    118  * @stable ICU 4.2
    119  */
    120 #define U_SHAPE_LAMALEF_RESIZE                  0
    121 
    122 /**
    123  * Memory option: the result must have the same length as the source.
    124  * If more room is necessary, then try to consume spaces next to modified characters.
    125  * @stable ICU 2.0
    126  */
    127 #define U_SHAPE_LENGTH_FIXED_SPACES_NEAR        1
    128 
    129 /**
    130  * Memory option: the result must have the same length as the source.
    131  * If more room is necessary, then try to consume spaces next to modified characters.
    132  * Affects: LamAlef options
    133  * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_NEAR
    134  * @stable ICU 4.2
    135  */
    136 #define U_SHAPE_LAMALEF_NEAR                    1
    137 
    138 /**
    139  * Memory option: the result must have the same length as the source.
    140  * If more room is necessary, then try to consume spaces at the end of the text.
    141  * @stable ICU 2.0
    142  */
    143 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_END      2
    144 
    145 /**
    146  * Memory option: the result must have the same length as the source.
    147  * If more room is necessary, then try to consume spaces at the end of the text.
    148  * Affects: LamAlef options
    149  * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_END
    150  * @stable ICU 4.2
    151  */
    152 #define U_SHAPE_LAMALEF_END                     2
    153 
    154 /**
    155  * Memory option: the result must have the same length as the source.
    156  * If more room is necessary, then try to consume spaces at the beginning of the text.
    157  * @stable ICU 2.0
    158  */
    159 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3
    160 
    161 /**
    162  * Memory option: the result must have the same length as the source.
    163  * If more room is necessary, then try to consume spaces at the beginning of the text.
    164  * Affects: LamAlef options
    165  * This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
    166  * @stable ICU 4.2
    167  */
    168 #define U_SHAPE_LAMALEF_BEGIN                    3
    169 
    170 
    171 /**
    172  * Memory option: the result must have the same length as the source.
    173  * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end.
    174  *               If there is no space at end, use spaces at beginning of the buffer. If there
    175  *               is no space at beginning of the buffer, use the near space (i.e. the space
    176  *               after the LAMALEF character).
    177  *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
    178  *               will be set in pErrorCode.
    179  *
    180  * Deshaping Mode: Performs the same function as the flag U_SHAPE_LAMALEF_END.
    181  * Affects: LamAlef options
    182  * @stable ICU 4.2
    183  */
    184 #define U_SHAPE_LAMALEF_AUTO                     0x10000
    185 
    186 /** Bit mask for memory options. @stable ICU 2.0 */
    187 #define U_SHAPE_LENGTH_MASK                      0x10003 /* was 3; now also covers U_SHAPE_LAMALEF_AUTO */
    188 
    189 
    190 /**
    191  * Bit mask for LamAlef memory options.
    192  * @stable ICU 4.2
    193  */
    194 #define U_SHAPE_LAMALEF_MASK                     0x10003 /* covers U_SHAPE_LAMALEF_AUTO as well */
    195 
    196 /** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */
    197 #define U_SHAPE_TEXT_DIRECTION_LOGICAL          0
    198 
    199 /**
    200  * Direction indicator:
    201  * the source is in visual RTL order,
    202  * the rightmost displayed character stored first.
    203  * This option is an alias to U_SHAPE_TEXT_DIRECTION_LOGICAL (both have the value 0).
    204  * @stable ICU 4.2
    205  */
    206 #define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL       0
    207 
    208 /**
    209  * Direction indicator:
    210  * the source is in visual LTR order,
    211  * the leftmost displayed character stored first.
    212  * @stable ICU 2.0
    213  */
    214 #define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR       4
    215 
    216 /** Bit mask for direction indicators. @stable ICU 2.0 */
    217 #define U_SHAPE_TEXT_DIRECTION_MASK             4
    218 
    219 
    220 /** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */
    221 #define U_SHAPE_LETTERS_NOOP                    0
    222 
    223 /** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */
    224 #define U_SHAPE_LETTERS_SHAPE                   8
    225 
    226 /** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */
    227 #define U_SHAPE_LETTERS_UNSHAPE                 0x10
    228 
    229 /**
    230  * Letter shaping option: replace abstract letter characters by "shaped" ones.
    231  * The only difference from U_SHAPE_LETTERS_SHAPE is that Tashkeel letters
    232  * are always "shaped" into the isolated form instead of the medial form
    233  * (selecting code points from the Arabic Presentation Forms-B block).
    234  * @stable ICU 2.0
    235  */
    236 #define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18
    237 
    238 
    239 /** Bit mask for letter shaping options (both letter bits, 0x08|0x10); compare (options & mask) to an option value. @stable ICU 2.0 */
    240 #define U_SHAPE_LETTERS_MASK                        0x18
    241 
    242 
    243 /** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */
    244 #define U_SHAPE_DIGITS_NOOP                     0
    245 
    246 /**
    247  * Digit shaping option:
    248  * Replace European digits (U+0030...U+0039) by Arabic-Indic digits.
    249  * @stable ICU 2.0
    250  */
    251 #define U_SHAPE_DIGITS_EN2AN                    0x20
    252 
    253 /**
    254  * Digit shaping option:
    255  * Replace Arabic-Indic digits by European digits (U+0030...U+0039).
    256  * @stable ICU 2.0
    257  */
    258 #define U_SHAPE_DIGITS_AN2EN                    0x40
    259 
    260 /**
    261  * Digit shaping option:
    262  * Replace European digits (U+0030...U+0039) by Arabic-Indic digits if the most recent
    263  * strongly directional character is an Arabic letter
    264  * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
    265  * The direction of "preceding" depends on the direction indicator option.
    266  * For the first characters, the preceding strongly directional character
    267  * (initial state) is assumed to be not an Arabic letter
    268  * (it is <code>U_LEFT_TO_RIGHT</code> [L] or <code>U_RIGHT_TO_LEFT</code> [R]).
    269  * @stable ICU 2.0
    270  */
    271 #define U_SHAPE_DIGITS_ALEN2AN_INIT_LR          0x60
    272 
    273 /**
    274  * Digit shaping option:
    275  * Replace European digits (U+0030...U+0039) by Arabic-Indic digits if the most recent
    276  * strongly directional character is an Arabic letter
    277  * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
    278  * The direction of "preceding" depends on the direction indicator option.
    279  * For the first characters, the preceding strongly directional character
    280  * (initial state) is assumed to be an Arabic letter.
    281  * @stable ICU 2.0
    282  */
    283 #define U_SHAPE_DIGITS_ALEN2AN_INIT_AL          0x80
    284 
    285 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
    286 #define U_SHAPE_DIGITS_RESERVED                 0xa0
    287 
    288 /** Bit mask for digit shaping options. @stable ICU 2.0 */
    289 #define U_SHAPE_DIGITS_MASK                     0xe0
    290 
    291 
    292 /** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */
    293 #define U_SHAPE_DIGIT_TYPE_AN                   0
    294 
    295 /** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */
    296 #define U_SHAPE_DIGIT_TYPE_AN_EXTENDED          0x100
    297 
    298 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
    299 #define U_SHAPE_DIGIT_TYPE_RESERVED             0x200
    300 
    301 /** Bit mask for digit type options. @stable ICU 2.0 */
    302 #define U_SHAPE_DIGIT_TYPE_MASK                 0x300 /* changed from 0x3f00 to 0x300 */
    303 
    304 /**
    305  * Tashkeel aggregation option:
    306  * Replaces any combination of U+0651 with one of
    307  * U+064C, U+064D, U+064E, U+064F, U+0650 with
    308  * U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 respectively.
    309  * @stable ICU 3.6
    310  */
    311 #define U_SHAPE_AGGREGATE_TASHKEEL              0x4000
    312 /** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */
    313 #define U_SHAPE_AGGREGATE_TASHKEEL_NOOP         0
    314 /** Bit mask for tashkeel aggregation. @stable ICU 3.6 */
    315 #define U_SHAPE_AGGREGATE_TASHKEEL_MASK         0x4000
    316 
    317 /**
    318  * Presentation form option:
    319  * Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B
    320  * characters with U+06xx characters, before shaping.
    321  * @stable ICU 3.6
    322  */
    323 #define U_SHAPE_PRESERVE_PRESENTATION           0x8000
    324 /** Presentation form option:
    325  * Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with
    326  * their unshaped counterparts in the range U+06xx, before shaping.
    327  * @stable ICU 3.6
    328  */
    329 #define U_SHAPE_PRESERVE_PRESENTATION_NOOP      0
    330 /** Bit mask for preserve presentation form. @stable ICU 3.6 */
    331 #define U_SHAPE_PRESERVE_PRESENTATION_MASK      0x8000
    332 
    333 /* Seen Tail option */
    334 /**
    335  * Memory option: the result must have the same length as the source.
    336  * Shaping mode: The SEEN family character will expand into two characters using space near
    337  *               the SEEN family character (i.e. the space after the character).
    338  *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
    339  *               will be set in pErrorCode.
    340  *
    341  * De-shaping mode: Any Seen character followed by Tail character will be
    342  *                  replaced by one cell Seen and a space will replace the Tail.
    343  * Affects: Seen options
    344  * @stable ICU 4.2
    345  */
    346 #define U_SHAPE_SEEN_TWOCELL_NEAR     0x200000
    347 
    348 /**
    349  * Bit mask for Seen memory options.
    350  * @stable ICU 4.2
    351  */
    352 #define U_SHAPE_SEEN_MASK             0x700000
    353 
    354 /* YehHamza option */
    355 /**
    356  * Memory option: the result must have the same length as the source.
    357  * Shaping mode: The YEHHAMZA character will expand into two characters using space near it
    358  *              (i.e. the space after the character).
    359  *               If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
    360  *               will be set in pErrorCode.
    361  *
    362  * De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be
    363  *                  replaced by one cell YehHamza and space will replace the Hamza.
    364  * Affects: YehHamza options
    365  * @stable ICU 4.2
    366  */
    367 #define U_SHAPE_YEHHAMZA_TWOCELL_NEAR      0x1000000
    368 
    369 
    370 /**
    371  * Bit mask for YehHamza memory options.
    372  * @stable ICU 4.2
    373  */
    374 #define U_SHAPE_YEHHAMZA_MASK              0x3800000
    375 
    376 /* New Tashkeel options */
    377 /**
    378  * Memory option: the result must have the same length as the source.
    379  * Shaping mode: Tashkeel characters will be replaced by spaces.
    380  *               Spaces will be placed at beginning of the buffer.
    381  *
    382  * De-shaping mode: N/A
    383  * Affects: Tashkeel options
    384  * @stable ICU 4.2
    385  */
    386 #define U_SHAPE_TASHKEEL_BEGIN                      0x40000
    387 
    388 /**
    389  * Memory option: the result must have the same length as the source.
    390  * Shaping mode: Tashkeel characters will be replaced by spaces.
    391  *               Spaces will be placed at end of the buffer.
    392  *
    393  * De-shaping mode: N/A
    394  * Affects: Tashkeel options
    395  * @stable ICU 4.2
    396  */
    397 #define U_SHAPE_TASHKEEL_END                        0x60000
    398 
    399 /**
    400  * Memory option: allow the result to have a different length than the source.
    401  * Shaping mode: Tashkeel characters will be removed, buffer length will shrink.
    402  * De-shaping mode: N/A
    403  *
    404  * Affects: Tashkeel options
    405  * @stable ICU 4.2
    406  */
    407 #define U_SHAPE_TASHKEEL_RESIZE                     0x80000
    408 
    409 /**
    410  * Memory option: the result must have the same length as the source.
    411  * Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent
    412  *               characters (i.e. shaped on Tatweel) or replaced by space if it is not connected.
    413  *
    414  * De-shaping mode: N/A
    415  * Affects: Tashkeel options
    416  * @stable ICU 4.2
    417  */
    418 #define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL         0xC0000
    419 
    420 /**
    421  * Bit mask for Tashkeel replacement with Space or Tatweel memory options.
    422  * @stable ICU 4.2
    423  */
    424 #define U_SHAPE_TASHKEEL_MASK                       0xE0000
    425 
    426 
    427 /* Space location Control options */
    428 /**
    429  * This option affects the meaning of the BEGIN and END options. If this option is not used, the default
    430  * for BEGIN and END will be as follows:
    431  * The Default (for Visual LTR, Visual RTL and Logical text alike)
    432  *           1. BEGIN always refers to the start address of physical memory.
    433  *           2. END always refers to the end address of physical memory.
    434  *
    435  * If this option is used, it will swap the meaning of BEGIN and END only for Visual LTR text.
    436  *
    437  * The effect on the BEGIN and END memory options will be as follows:
    438  *    A. BEGIN for Visual LTR text: This will be the beginning (right side) of the visual text
    439  *       (corresponding to the physical memory address end for Visual LTR text; same as END in
    440  *       default behavior).
    441  *    B. BEGIN for Logical text: Same as BEGIN in default behavior.
    442  *    C. END for Visual LTR text: This will be the end (left side) of the visual text (corresponding
    443  *       to the physical memory address beginning for Visual LTR text; same as BEGIN in default behavior).
    444  *    D. END for Logical text: Same as END in default behavior.
    445  * Affects: All LamAlef BEGIN, END and AUTO options.
    446  * @stable ICU 4.2
    447  */
    448 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000
    449 
    450 /**
    451  * Bit mask for swapping BEGIN and END for Visual LTR text
    452  * @stable ICU 4.2
    453  */
    454 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK      0x4000000
    455 
    456 /**
    457  * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. U+FE73).
    458  * If this option is not specified (default), the old unofficial Unicode TAIL code point is used (i.e. U+200B).
    459  * De-shaping does not use this option: it always searches for both the new Unicode code point for the
    460  * TAIL (i.e. U+FE73) and the old unofficial Unicode TAIL code point (i.e. U+200B) and de-shapes the
    461  * Seen-Family letter accordingly.
    462  *
    463  * Shaping Mode: Only shaping.
    464  * De-shaping Mode: N/A.
    465  * Affects: All Seen options
    466  * @stable ICU 4.8
    467  */
    468 #define U_SHAPE_TAIL_NEW_UNICODE        0x8000000
    469 
    470 /**
    471  * Bit mask for new Unicode Tail option
    472  * @stable ICU 4.8
    473  */
    474 #define U_SHAPE_TAIL_TYPE_MASK          0x8000000
    475 
    476 #endif
    477