Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ubidiimp.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 1999aug06
     16 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
     17 */
     18 
     19 #ifndef UBIDIIMP_H
     20 #define UBIDIIMP_H
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/ubidi.h"
     24 #include "unicode/uchar.h"
     25 #include "ubidi_props.h"
     26 
     27 /* miscellaneous definitions ---------------------------------------------- */
     28 
     29 typedef uint8_t DirProp;
     30 typedef uint32_t Flags;
     31 
     32 /*  Comparing the description of the BiDi algorithm with this implementation
     33     is easier with the same names for the BiDi types in the code as there.
     34     See UCharDirection in uchar.h .
     35 */
     36 enum {
     37     L=  U_LEFT_TO_RIGHT,                /*  0 */
     38     R=  U_RIGHT_TO_LEFT,                /*  1 */
     39     EN= U_EUROPEAN_NUMBER,              /*  2 */
     40     ES= U_EUROPEAN_NUMBER_SEPARATOR,    /*  3 */
     41     ET= U_EUROPEAN_NUMBER_TERMINATOR,   /*  4 */
     42     AN= U_ARABIC_NUMBER,                /*  5 */
     43     CS= U_COMMON_NUMBER_SEPARATOR,      /*  6 */
     44     B=  U_BLOCK_SEPARATOR,              /*  7 */
     45     S=  U_SEGMENT_SEPARATOR,            /*  8 */
     46     WS= U_WHITE_SPACE_NEUTRAL,          /*  9 */
     47     ON= U_OTHER_NEUTRAL,                /* 10 */
     48     LRE=U_LEFT_TO_RIGHT_EMBEDDING,      /* 11 */
     49     LRO=U_LEFT_TO_RIGHT_OVERRIDE,       /* 12 */
     50     AL= U_RIGHT_TO_LEFT_ARABIC,         /* 13 */
     51     RLE=U_RIGHT_TO_LEFT_EMBEDDING,      /* 14 */
     52     RLO=U_RIGHT_TO_LEFT_OVERRIDE,       /* 15 */
     53     PDF=U_POP_DIRECTIONAL_FORMAT,       /* 16 */
     54     NSM=U_DIR_NON_SPACING_MARK,         /* 17 */
     55     BN= U_BOUNDARY_NEUTRAL,             /* 18 */
     56     FSI=U_FIRST_STRONG_ISOLATE,         /* 19 */
     57     LRI=U_LEFT_TO_RIGHT_ISOLATE,        /* 20 */
     58     RLI=U_RIGHT_TO_LEFT_ISOLATE,        /* 21 */
     59     PDI=U_POP_DIRECTIONAL_ISOLATE,      /* 22 */
     60     ENL,    /* EN after W7 */           /* 23 */
     61     ENR,    /* EN not subject to W7 */  /* 24 */
     62     dirPropCount
     63 };
     64 
     65 /*  Sometimes, bit values are more appropriate
     66     to deal with directionality properties.
     67     Abbreviations in these macro names refer to names
     68     used in the BiDi algorithm.
     69 */
     70 #define DIRPROP_FLAG(dir) (1UL<<(dir))
     71 #define PURE_DIRPROP(prop)  ((prop)&~0xE0)    ?????????????????????????
     72 
     73 /* special flag for multiple runs from explicit embedding codes */
     74 #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31)
     75 
     76 /* are there any characters that are LTR or RTL? */
     77 #define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI))
     78 #define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI))
     79 #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL))
     80 #define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN))
     81 
     82 /* explicit embedding codes */
     83 #define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF))
     84 
     85 /* explicit isolate codes */
     86 #define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI))
     87 
     88 #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT)
     89 
     90 /* paragraph and segment separators */
     91 #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S))
     92 
     93 /* all types that are counted as White Space or Neutral in some steps */
     94 #define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO)
     95 
     96 /* types that are neutrals or could becomes neutrals in (Wn) */
     97 #define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS)
     98 
     99 /*
    100  *  These types may be changed to "e",
    101  *  the embedding type (L or R) of the run,
    102  *  in the BiDi algorithm (N2)
    103  */
    104 #define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N)
    105 
    106 /* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */
    107 #define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1))
    108 
    109 #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe)
    110 
    111 /*
    112  *  The following bit is used for the directional isolate status.
    113  *  Stack entries corresponding to isolate sequences are greater than ISOLATE.
    114  */
    115 #define ISOLATE  0x0100
    116 
    117 U_CFUNC UBiDiLevel
    118 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index);
    119 
    120 #define GET_PARALEVEL(ubidi, index) \
    121             ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \
    122                          (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index))))
    123 
    124 /* number of paras entries allocated initially without malloc */
    125 #define SIMPLE_PARAS_COUNT      10
    126 /* number of isolate entries allocated initially without malloc */
    127 #define SIMPLE_ISOLATES_COUNT   5
    128 /* number of isolate run entries for paired brackets allocated initially without malloc */
    129 #define SIMPLE_OPENINGS_COUNT   20
    130 
    131 #define CR  0x000D
    132 #define LF  0x000A
    133 
    134 /* Run structure for reordering --------------------------------------------- */
    135 enum {
    136     LRM_BEFORE=1,
    137     LRM_AFTER=2,
    138     RLM_BEFORE=4,
    139     RLM_AFTER=8
    140 };
    141 
    142 typedef struct Para {
    143     int32_t limit;
    144     int32_t level;
    145 } Para;
    146 
    147 enum {                                  /* flags for Opening.flags */
    148     FOUND_L=DIRPROP_FLAG(L),
    149     FOUND_R=DIRPROP_FLAG(R)
    150 };
    151 
    152 typedef struct Opening {
    153     int32_t position;                   /* position of opening bracket */
    154     int32_t match;                      /* matching char or -position of closing bracket */
    155     int32_t contextPos;                 /* position of last strong char found before opening */
    156     uint16_t flags;                     /* bits for L or R/AL found within the pair */
    157     UBiDiDirection contextDir;          /* L or R according to last strong char before opening */
    158     uint8_t filler;                     /* to complete a nice multiple of 4 chars */
    159 } Opening;
    160 
    161 typedef struct IsoRun {
    162     int32_t  contextPos;                /* position of char determining context */
    163     uint16_t start;                     /* index of first opening entry for this run */
    164     uint16_t limit;                     /* index after last opening entry for this run */
    165     UBiDiLevel level;                   /* level of this run */
    166     DirProp lastStrong;                 /* bidi class of last strong char found in this run */
    167     DirProp lastBase;                   /* bidi class of last base char found in this run */
    168     UBiDiDirection contextDir;          /* L or R to use as context for following openings */
    169 } IsoRun;
    170 
    171 typedef struct BracketData {
    172     UBiDi   *pBiDi;
    173     /* array of opening entries which should be enough in most cases; no malloc() */
    174     Opening simpleOpenings[SIMPLE_OPENINGS_COUNT];
    175     Opening *openings;                  /* pointer to current array of entries */
    176     int32_t openingsCount;              /* number of allocated entries */
    177     int32_t isoRunLast;                 /* index of last used entry */
    178     /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL
    179        + 1 for index 0, + 1 for before the first isolated sequence */
    180     IsoRun  isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2];
    181     UBool isNumbersSpecial;             /* reordering mode for NUMBERS_SPECIAL */
    182 } BracketData;
    183 
    184 typedef struct Isolate {
    185     int32_t startON;
    186     int32_t start1;
    187     int32_t state;
    188     int16_t stateImp;
    189 } Isolate;
    190 
    191 typedef struct Run {
    192     int32_t logicalStart,   /* first character of the run; b31 indicates even/odd level */
    193             visualLimit,    /* last visual position of the run +1 */
    194             insertRemove;   /* if >0, flags for inserting LRM/RLM before/after run,
    195                                if <0, count of bidi controls within run            */
    196 } Run;
    197 
    198 /* in a Run, logicalStart will get this bit set if the run level is odd */
    199 #define INDEX_ODD_BIT (1UL<<31)
    200 
    201 #define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)(level)<<31))
    202 #define ADD_ODD_BIT_FROM_LEVEL(x, level)  ((x)|=((int32_t)(level)<<31))
    203 #define REMOVE_ODD_BIT(x)                 ((x)&=~INDEX_ODD_BIT)
    204 
    205 #define GET_INDEX(x)   ((x)&~INDEX_ODD_BIT)
    206 #define GET_ODD_BIT(x) ((uint32_t)(x)>>31)
    207 #define IS_ODD_RUN(x)  ((UBool)(((x)&INDEX_ODD_BIT)!=0))
    208 #define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0))
    209 
    210 U_CFUNC UBool
    211 ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
    212 
    213 /** BiDi control code points */
    214 enum {
    215     ZWNJ_CHAR=0x200c,
    216     ZWJ_CHAR,
    217     LRM_CHAR,
    218     RLM_CHAR,
    219     LRE_CHAR=0x202a,
    220     RLE_CHAR,
    221     PDF_CHAR,
    222     LRO_CHAR,
    223     RLO_CHAR,
    224     LRI_CHAR=0x2066,
    225     RLI_CHAR,
    226     FSI_CHAR,
    227     PDI_CHAR
    228 };
    229 
    230 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4)
    231 
    232 /* InsertPoints structure for noting where to put BiDi marks ---------------- */
    233 
    234 typedef struct Point {
    235     int32_t pos;            /* position in text */
    236     int32_t flag;           /* flag for LRM/RLM, before/after */
    237 } Point;
    238 
    239 typedef struct InsertPoints {
    240     int32_t capacity;       /* number of points allocated */
    241     int32_t size;           /* number of points used */
    242     int32_t confirmed;      /* number of points confirmed */
    243     UErrorCode errorCode;   /* for eventual memory shortage */
    244     Point *points;          /* pointer to array of points */
    245 } InsertPoints;
    246 
    247 
    248 /* UBiDi structure ----------------------------------------------------------- */
    249 
    250 struct UBiDi {
    251     /* pointer to parent paragraph object (pointer to self if this object is
    252      * a paragraph object); set to NULL in a newly opened object; set to a
    253      * real value after a successful execution of ubidi_setPara or ubidi_setLine
    254      */
    255     const UBiDi * pParaBiDi;
    256 
    257     const UBiDiProps *bdp;
    258 
    259     /* alias pointer to the current text */
    260     const UChar *text;
    261 
    262     /* length of the current text */
    263     int32_t originalLength;
    264 
    265     /* if the UBIDI_OPTION_STREAMING option is set, this is the length
    266      * of text actually processed by ubidi_setPara, which may be shorter than
    267      * the original length.
    268      * Otherwise, it is identical to the original length.
    269      */
    270     int32_t length;
    271 
    272     /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or
    273      * marks are allowed to be inserted in one of the reordering mode, the
    274      * length of the result string may be different from the processed length.
    275      */
    276     int32_t resultLength;
    277 
    278     /* memory sizes in bytes */
    279     int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize;
    280 
    281     /* allocated memory */
    282     DirProp *dirPropsMemory;
    283     UBiDiLevel *levelsMemory;
    284     Opening *openingsMemory;
    285     Para *parasMemory;
    286     Run *runsMemory;
    287     Isolate *isolatesMemory;
    288 
    289     /* indicators for whether memory may be allocated after ubidi_open() */
    290     UBool mayAllocateText, mayAllocateRuns;
    291 
    292     /* arrays with one value per text-character */
    293     DirProp *dirProps;
    294     UBiDiLevel *levels;
    295 
    296     /* are we performing an approximation of the "inverse BiDi" algorithm? */
    297     UBool isInverse;
    298 
    299     /* are we using the basic algorithm or its variation? */
    300     UBiDiReorderingMode reorderingMode;
    301 
    302     /* UBIDI_REORDER_xxx values must be ordered so that all the regular
    303      * logical to visual modes come first, and all inverse BiDi modes
    304      * come last.
    305      */
    306     #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL    UBIDI_REORDER_NUMBERS_SPECIAL
    307 
    308     /* bitmask for reordering options */
    309     uint32_t reorderingOptions;
    310 
    311     /* must block separators receive level 0? */
    312     UBool orderParagraphsLTR;
    313 
    314     /* the paragraph level */
    315     UBiDiLevel paraLevel;
    316     /* original paraLevel when contextual */
    317     /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */
    318     UBiDiLevel defaultParaLevel;
    319 
    320     /* context data */
    321     const UChar *prologue;
    322     int32_t proLength;
    323     const UChar *epilogue;
    324     int32_t epiLength;
    325 
    326     /* the following is set in ubidi_setPara, used in processPropertySeq */
    327     const struct ImpTabPair * pImpTabPair;  /* pointer to levels state table pair */
    328 
    329     /* the overall paragraph or line directionality - see UBiDiDirection */
    330     UBiDiDirection direction;
    331 
    332     /* flags is a bit set for which directional properties are in the text */
    333     Flags flags;
    334 
    335     /* lastArabicPos is index to the last AL in the text, -1 if none */
    336     int32_t lastArabicPos;
    337 
    338     /* characters after trailingWSStart are WS and are */
    339     /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
    340     int32_t trailingWSStart;
    341 
    342     /* fields for paragraph handling */
    343     int32_t paraCount;                  /* set in getDirProps() */
    344     /* filled in getDirProps() */
    345     Para *paras;
    346 
    347     /* for relatively short text, we only need a tiny array of paras (no malloc()) */
    348     Para simpleParas[SIMPLE_PARAS_COUNT];
    349 
    350     /* fields for line reordering */
    351     int32_t runCount;     /* ==-1: runs not set up yet */
    352     Run *runs;
    353 
    354     /* for non-mixed text, we only need a tiny array of runs (no malloc()) */
    355     Run simpleRuns[1];
    356 
    357     /* maximum or current nesting depth of isolate sequences */
    358     /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
    359        nesting encountered.
    360        Within resolveImplicitLevels(), this is the index of the current isolates
    361        stack entry. */
    362     int32_t isolateCount;
    363     Isolate *isolates;
    364 
    365     /* for simple text, have a small stack (no malloc()) */
    366     Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT];
    367 
    368     /* for inverse Bidi with insertion of directional marks */
    369     InsertPoints insertPoints;
    370 
    371     /* for option UBIDI_OPTION_REMOVE_CONTROLS */
    372     int32_t controlCount;
    373 
    374     /* for Bidi class callback */
    375     UBiDiClassCallback *fnClassCallback;    /* action pointer */
    376     const void *coClassCallback;            /* context pointer */
    377 };
    378 
    379 #define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x)))
    380 #define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi)))
    381 
    382 typedef union {
    383     DirProp *dirPropsMemory;
    384     UBiDiLevel *levelsMemory;
    385     Opening *openingsMemory;
    386     Para *parasMemory;
    387     Run *runsMemory;
    388     Isolate *isolatesMemory;
    389 } BidiMemoryForAllocation;
    390 
    391 /* Macros for initial checks at function entry */
    392 #define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue)   \
    393         if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue
    394 #define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue)   \
    395         if(!IS_VALID_PARA(bidi)) {  \
    396             errcode=U_INVALID_STATE_ERROR;  \
    397             return retvalue;                \
    398         }
    399 #define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue)   \
    400         if(!IS_VALID_PARA_OR_LINE(bidi)) {  \
    401             errcode=U_INVALID_STATE_ERROR;  \
    402             return retvalue;                \
    403         }
    404 #define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue)   \
    405         if((arg)<(start) || (arg)>=(limit)) {       \
    406             (errcode)=U_ILLEGAL_ARGUMENT_ERROR;     \
    407             return retvalue;                        \
    408         }
    409 
    410 #define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode)   \
    411         if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return
    412 #define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode)   \
    413         if(!IS_VALID_PARA(bidi)) {  \
    414             errcode=U_INVALID_STATE_ERROR;  \
    415             return;                \
    416         }
    417 #define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode)   \
    418         if(!IS_VALID_PARA_OR_LINE(bidi)) {  \
    419             errcode=U_INVALID_STATE_ERROR;  \
    420             return;                \
    421         }
    422 #define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode)   \
    423         if((arg)<(start) || (arg)>=(limit)) {       \
    424             (errcode)=U_ILLEGAL_ARGUMENT_ERROR;     \
    425             return;                        \
    426         }
    427 
    428 /* helper function to (re)allocate memory if allowed */
    429 U_CFUNC UBool
    430 ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded);
    431 
    432 /* helper macros for each allocated array in UBiDi */
    433 #define getDirPropsMemory(pBiDi, length) \
    434         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
    435                         (pBiDi)->mayAllocateText, (length))
    436 
    437 #define getLevelsMemory(pBiDi, length) \
    438         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
    439                         (pBiDi)->mayAllocateText, (length))
    440 
    441 #define getRunsMemory(pBiDi, length) \
    442         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
    443                         (pBiDi)->mayAllocateRuns, (length)*sizeof(Run))
    444 
    445 /* additional macros used by ubidi_open() - always allow allocation */
    446 #define getInitialDirPropsMemory(pBiDi, length) \
    447         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
    448                         TRUE, (length))
    449 
    450 #define getInitialLevelsMemory(pBiDi, length) \
    451         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
    452                         TRUE, (length))
    453 
    454 #define getInitialOpeningsMemory(pBiDi, length) \
    455         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \
    456                         TRUE, (length)*sizeof(Opening))
    457 
    458 #define getInitialParasMemory(pBiDi, length) \
    459         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \
    460                         TRUE, (length)*sizeof(Para))
    461 
    462 #define getInitialRunsMemory(pBiDi, length) \
    463         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
    464                         TRUE, (length)*sizeof(Run))
    465 
    466 #define getInitialIsolatesMemory(pBiDi, length) \
    467         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \
    468                         TRUE, (length)*sizeof(Isolate))
    469 
    470 #endif
    471