/*
******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ubidiimp.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999aug06
*   created by: Markus W. Scherer, updated by Matitiahu Allouche
*/
     16 
     17 #ifndef UBIDIIMP_H
     18 #define UBIDIIMP_H
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/uchar.h"
     22 #include "ubidi_props.h"
     23 
     24 /* miscellaneous definitions ---------------------------------------------- */
     25 
     26 typedef uint8_t DirProp;
     27 typedef uint32_t Flags;
     28 
     29 /*  Comparing the description of the BiDi algorithm with this implementation
     30     is easier with the same names for the BiDi types in the code as there.
     31     See UCharDirection in uchar.h .
     32 */
     33 enum {
     34     L=  U_LEFT_TO_RIGHT,                /*  0 */
     35     R=  U_RIGHT_TO_LEFT,                /*  1 */
     36     EN= U_EUROPEAN_NUMBER,              /*  2 */
     37     ES= U_EUROPEAN_NUMBER_SEPARATOR,    /*  3 */
     38     ET= U_EUROPEAN_NUMBER_TERMINATOR,   /*  4 */
     39     AN= U_ARABIC_NUMBER,                /*  5 */
     40     CS= U_COMMON_NUMBER_SEPARATOR,      /*  6 */
     41     B=  U_BLOCK_SEPARATOR,              /*  7 */
     42     S=  U_SEGMENT_SEPARATOR,            /*  8 */
     43     WS= U_WHITE_SPACE_NEUTRAL,          /*  9 */
     44     ON= U_OTHER_NEUTRAL,                /* 10 */
     45     LRE=U_LEFT_TO_RIGHT_EMBEDDING,      /* 11 */
     46     LRO=U_LEFT_TO_RIGHT_OVERRIDE,       /* 12 */
     47     AL= U_RIGHT_TO_LEFT_ARABIC,         /* 13 */
     48     RLE=U_RIGHT_TO_LEFT_EMBEDDING,      /* 14 */
     49     RLO=U_RIGHT_TO_LEFT_OVERRIDE,       /* 15 */
     50     PDF=U_POP_DIRECTIONAL_FORMAT,       /* 16 */
     51     NSM=U_DIR_NON_SPACING_MARK,         /* 17 */
     52     BN= U_BOUNDARY_NEUTRAL,             /* 18 */
     53     FSI=U_FIRST_STRONG_ISOLATE,         /* 19 */
     54     LRI=U_LEFT_TO_RIGHT_ISOLATE,        /* 20 */
     55     RLI=U_RIGHT_TO_LEFT_ISOLATE,        /* 21 */
     56     PDI=U_POP_DIRECTIONAL_ISOLATE,      /* 22 */
     57     ENL,    /* EN after W7 */           /* 23 */
     58     ENR,    /* EN not subject to W7 */  /* 24 */
     59     dirPropCount
     60 };
     61 
     62 /*  Sometimes, bit values are more appropriate
     63     to deal with directionality properties.
     64     Abbreviations in these macro names refer to names
     65     used in the BiDi algorithm.
     66 */
     67 #define DIRPROP_FLAG(dir) (1UL<<(dir))
     68 #define PURE_DIRPROP(prop)  ((prop)&~0xE0)    ?????????????????????????
     69 
     70 /* special flag for multiple runs from explicit embedding codes */
     71 #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31)
     72 
     73 /* are there any characters that are LTR or RTL? */
     74 #define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI))
     75 #define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI))
     76 #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL))
     77 #define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN))
     78 
     79 /* explicit embedding codes */
     80 #define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF))
     81 
     82 /* explicit isolate codes */
     83 #define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI))
     84 
     85 #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT)
     86 
     87 /* paragraph and segment separators */
     88 #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S))
     89 
     90 /* all types that are counted as White Space or Neutral in some steps */
     91 #define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO)
     92 
     93 /* types that are neutrals or could becomes neutrals in (Wn) */
     94 #define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS)
     95 
     96 /*
     97  *  These types may be changed to "e",
     98  *  the embedding type (L or R) of the run,
     99  *  in the BiDi algorithm (N2)
    100  */
    101 #define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N)
    102 
    103 /* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */
    104 #define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1))
    105 
    106 #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe)
    107 
    108 /*
    109  *  The following bit is used for the directional isolate status.
    110  *  Stack entries corresponding to isolate sequences are greater than ISOLATE.
    111  */
    112 #define ISOLATE  0x0100
    113 
    114 U_CFUNC UBiDiLevel
    115 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index);
    116 
    117 #define GET_PARALEVEL(ubidi, index) \
    118             ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \
    119                          (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index))))
    120 
    121 /* number of paras entries allocated initially without malloc */
    122 #define SIMPLE_PARAS_COUNT      10
    123 /* number of isolate entries allocated initially without malloc */
    124 #define SIMPLE_ISOLATES_COUNT   5
    125 /* number of isolate run entries for paired brackets allocated initially without malloc */
    126 #define SIMPLE_OPENINGS_COUNT   20
    127 
    128 #define CR  0x000D
    129 #define LF  0x000A
    130 
    131 /* Run structure for reordering --------------------------------------------- */
    132 enum {
    133     LRM_BEFORE=1,
    134     LRM_AFTER=2,
    135     RLM_BEFORE=4,
    136     RLM_AFTER=8
    137 };
    138 
    139 typedef struct Para {
    140     int32_t limit;
    141     int32_t level;
    142 } Para;
    143 
    144 enum {                                  /* flags for Opening.flags */
    145     FOUND_L=DIRPROP_FLAG(L),
    146     FOUND_R=DIRPROP_FLAG(R)
    147 };
    148 
    149 typedef struct Opening {
    150     int32_t position;                   /* position of opening bracket */
    151     int32_t match;                      /* matching char or -position of closing bracket */
    152     int32_t contextPos;                 /* position of last strong char found before opening */
    153     uint16_t flags;                     /* bits for L or R/AL found within the pair */
    154     UBiDiDirection contextDir;          /* L or R according to last strong char before opening */
    155     uint8_t filler;                     /* to complete a nice multiple of 4 chars */
    156 } Opening;
    157 
    158 typedef struct IsoRun {
    159     int32_t  contextPos;                /* position of char determining context */
    160     uint16_t start;                     /* index of first opening entry for this run */
    161     uint16_t limit;                     /* index after last opening entry for this run */
    162     UBiDiLevel level;                   /* level of this run */
    163     DirProp lastStrong;                 /* bidi class of last strong char found in this run */
    164     DirProp lastBase;                   /* bidi class of last base char found in this run */
    165     UBiDiDirection contextDir;          /* L or R to use as context for following openings */
    166 } IsoRun;
    167 
    168 typedef struct BracketData {
    169     UBiDi   *pBiDi;
    170     /* array of opening entries which should be enough in most cases; no malloc() */
    171     Opening simpleOpenings[SIMPLE_OPENINGS_COUNT];
    172     Opening *openings;                  /* pointer to current array of entries */
    173     int32_t openingsCount;              /* number of allocated entries */
    174     int32_t isoRunLast;                 /* index of last used entry */
    175     /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL
    176        + 1 for index 0, + 1 for before the first isolated sequence */
    177     IsoRun  isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2];
    178     UBool isNumbersSpecial;             /* reordering mode for NUMBERS_SPECIAL */
    179 } BracketData;
    180 
    181 typedef struct Isolate {
    182     int32_t startON;
    183     int32_t start1;
    184     int32_t state;
    185     int16_t stateImp;
    186 } Isolate;
    187 
    188 typedef struct Run {
    189     int32_t logicalStart,   /* first character of the run; b31 indicates even/odd level */
    190             visualLimit,    /* last visual position of the run +1 */
    191             insertRemove;   /* if >0, flags for inserting LRM/RLM before/after run,
    192                                if <0, count of bidi controls within run            */
    193 } Run;
    194 
    195 /* in a Run, logicalStart will get this bit set if the run level is odd */
    196 #define INDEX_ODD_BIT (1UL<<31)
    197 
    198 #define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)(level)<<31))
    199 #define ADD_ODD_BIT_FROM_LEVEL(x, level)  ((x)|=((int32_t)(level)<<31))
    200 #define REMOVE_ODD_BIT(x)                 ((x)&=~INDEX_ODD_BIT)
    201 
    202 #define GET_INDEX(x)   ((x)&~INDEX_ODD_BIT)
    203 #define GET_ODD_BIT(x) ((uint32_t)(x)>>31)
    204 #define IS_ODD_RUN(x)  ((UBool)(((x)&INDEX_ODD_BIT)!=0))
    205 #define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0))
    206 
    207 U_CFUNC UBool
    208 ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode);
    209 
    210 /** BiDi control code points */
    211 enum {
    212     ZWNJ_CHAR=0x200c,
    213     ZWJ_CHAR,
    214     LRM_CHAR,
    215     RLM_CHAR,
    216     LRE_CHAR=0x202a,
    217     RLE_CHAR,
    218     PDF_CHAR,
    219     LRO_CHAR,
    220     RLO_CHAR,
    221     LRI_CHAR=0x2066,
    222     RLI_CHAR,
    223     FSI_CHAR,
    224     PDI_CHAR
    225 };
    226 
    227 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4)
    228 
    229 /* InsertPoints structure for noting where to put BiDi marks ---------------- */
    230 
    231 typedef struct Point {
    232     int32_t pos;            /* position in text */
    233     int32_t flag;           /* flag for LRM/RLM, before/after */
    234 } Point;
    235 
    236 typedef struct InsertPoints {
    237     int32_t capacity;       /* number of points allocated */
    238     int32_t size;           /* number of points used */
    239     int32_t confirmed;      /* number of points confirmed */
    240     UErrorCode errorCode;   /* for eventual memory shortage */
    241     Point *points;          /* pointer to array of points */
    242 } InsertPoints;
    243 
    244 
    245 /* UBiDi structure ----------------------------------------------------------- */
    246 
    247 struct UBiDi {
    248     /* pointer to parent paragraph object (pointer to self if this object is
    249      * a paragraph object); set to NULL in a newly opened object; set to a
    250      * real value after a successful execution of ubidi_setPara or ubidi_setLine
    251      */
    252     const UBiDi * pParaBiDi;
    253 
    254     const UBiDiProps *bdp;
    255 
    256     /* alias pointer to the current text */
    257     const UChar *text;
    258 
    259     /* length of the current text */
    260     int32_t originalLength;
    261 
    262     /* if the UBIDI_OPTION_STREAMING option is set, this is the length
    263      * of text actually processed by ubidi_setPara, which may be shorter than
    264      * the original length.
    265      * Otherwise, it is identical to the original length.
    266      */
    267     int32_t length;
    268 
    269     /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or
    270      * marks are allowed to be inserted in one of the reordering mode, the
    271      * length of the result string may be different from the processed length.
    272      */
    273     int32_t resultLength;
    274 
    275     /* memory sizes in bytes */
    276     int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize;
    277 
    278     /* allocated memory */
    279     DirProp *dirPropsMemory;
    280     UBiDiLevel *levelsMemory;
    281     Opening *openingsMemory;
    282     Para *parasMemory;
    283     Run *runsMemory;
    284     Isolate *isolatesMemory;
    285 
    286     /* indicators for whether memory may be allocated after ubidi_open() */
    287     UBool mayAllocateText, mayAllocateRuns;
    288 
    289     /* arrays with one value per text-character */
    290     DirProp *dirProps;
    291     UBiDiLevel *levels;
    292 
    293     /* are we performing an approximation of the "inverse BiDi" algorithm? */
    294     UBool isInverse;
    295 
    296     /* are we using the basic algorithm or its variation? */
    297     UBiDiReorderingMode reorderingMode;
    298 
    299     /* UBIDI_REORDER_xxx values must be ordered so that all the regular
    300      * logical to visual modes come first, and all inverse BiDi modes
    301      * come last.
    302      */
    303     #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL    UBIDI_REORDER_NUMBERS_SPECIAL
    304 
    305     /* bitmask for reordering options */
    306     uint32_t reorderingOptions;
    307 
    308     /* must block separators receive level 0? */
    309     UBool orderParagraphsLTR;
    310 
    311     /* the paragraph level */
    312     UBiDiLevel paraLevel;
    313     /* original paraLevel when contextual */
    314     /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */
    315     UBiDiLevel defaultParaLevel;
    316 
    317     /* context data */
    318     const UChar *prologue;
    319     int32_t proLength;
    320     const UChar *epilogue;
    321     int32_t epiLength;
    322 
    323     /* the following is set in ubidi_setPara, used in processPropertySeq */
    324     const struct ImpTabPair * pImpTabPair;  /* pointer to levels state table pair */
    325 
    326     /* the overall paragraph or line directionality - see UBiDiDirection */
    327     UBiDiDirection direction;
    328 
    329     /* flags is a bit set for which directional properties are in the text */
    330     Flags flags;
    331 
    332     /* lastArabicPos is index to the last AL in the text, -1 if none */
    333     int32_t lastArabicPos;
    334 
    335     /* characters after trailingWSStart are WS and are */
    336     /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
    337     int32_t trailingWSStart;
    338 
    339     /* fields for paragraph handling */
    340     int32_t paraCount;                  /* set in getDirProps() */
    341     /* filled in getDirProps() */
    342     Para *paras;
    343 
    344     /* for relatively short text, we only need a tiny array of paras (no malloc()) */
    345     Para simpleParas[SIMPLE_PARAS_COUNT];
    346 
    347     /* fields for line reordering */
    348     int32_t runCount;     /* ==-1: runs not set up yet */
    349     Run *runs;
    350 
    351     /* for non-mixed text, we only need a tiny array of runs (no malloc()) */
    352     Run simpleRuns[1];
    353 
    354     /* maximum or current nesting depth of isolate sequences */
    355     /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
    356        nesting encountered.
    357        Within resolveImplicitLevels(), this is the index of the current isolates
    358        stack entry. */
    359     int32_t isolateCount;
    360     Isolate *isolates;
    361 
    362     /* for simple text, have a small stack (no malloc()) */
    363     Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT];
    364 
    365     /* for inverse Bidi with insertion of directional marks */
    366     InsertPoints insertPoints;
    367 
    368     /* for option UBIDI_OPTION_REMOVE_CONTROLS */
    369     int32_t controlCount;
    370 
    371     /* for Bidi class callback */
    372     UBiDiClassCallback *fnClassCallback;    /* action pointer */
    373     const void *coClassCallback;            /* context pointer */
    374 };
    375 
    376 #define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x)))
    377 #define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi)))
    378 
    379 typedef union {
    380     DirProp *dirPropsMemory;
    381     UBiDiLevel *levelsMemory;
    382     Opening *openingsMemory;
    383     Para *parasMemory;
    384     Run *runsMemory;
    385     Isolate *isolatesMemory;
    386 } BidiMemoryForAllocation;
    387 
    388 /* Macros for initial checks at function entry */
    389 #define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue)   \
    390         if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue
    391 #define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue)   \
    392         if(!IS_VALID_PARA(bidi)) {  \
    393             errcode=U_INVALID_STATE_ERROR;  \
    394             return retvalue;                \
    395         }
    396 #define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue)   \
    397         if(!IS_VALID_PARA_OR_LINE(bidi)) {  \
    398             errcode=U_INVALID_STATE_ERROR;  \
    399             return retvalue;                \
    400         }
    401 #define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue)   \
    402         if((arg)<(start) || (arg)>=(limit)) {       \
    403             (errcode)=U_ILLEGAL_ARGUMENT_ERROR;     \
    404             return retvalue;                        \
    405         }
    406 
    407 #define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode)   \
    408         if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return
    409 #define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode)   \
    410         if(!IS_VALID_PARA(bidi)) {  \
    411             errcode=U_INVALID_STATE_ERROR;  \
    412             return;                \
    413         }
    414 #define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode)   \
    415         if(!IS_VALID_PARA_OR_LINE(bidi)) {  \
    416             errcode=U_INVALID_STATE_ERROR;  \
    417             return;                \
    418         }
    419 #define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode)   \
    420         if((arg)<(start) || (arg)>=(limit)) {       \
    421             (errcode)=U_ILLEGAL_ARGUMENT_ERROR;     \
    422             return;                        \
    423         }
    424 
    425 /* helper function to (re)allocate memory if allowed */
    426 U_CFUNC UBool
    427 ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded);
    428 
    429 /* helper macros for each allocated array in UBiDi */
    430 #define getDirPropsMemory(pBiDi, length) \
    431         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
    432                         (pBiDi)->mayAllocateText, (length))
    433 
    434 #define getLevelsMemory(pBiDi, length) \
    435         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
    436                         (pBiDi)->mayAllocateText, (length))
    437 
    438 #define getRunsMemory(pBiDi, length) \
    439         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
    440                         (pBiDi)->mayAllocateRuns, (length)*sizeof(Run))
    441 
    442 /* additional macros used by ubidi_open() - always allow allocation */
    443 #define getInitialDirPropsMemory(pBiDi, length) \
    444         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \
    445                         TRUE, (length))
    446 
    447 #define getInitialLevelsMemory(pBiDi, length) \
    448         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \
    449                         TRUE, (length))
    450 
    451 #define getInitialOpeningsMemory(pBiDi, length) \
    452         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \
    453                         TRUE, (length)*sizeof(Opening))
    454 
    455 #define getInitialParasMemory(pBiDi, length) \
    456         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \
    457                         TRUE, (length)*sizeof(Para))
    458 
    459 #define getInitialRunsMemory(pBiDi, length) \
    460         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \
    461                         TRUE, (length)*sizeof(Run))
    462 
    463 #define getInitialIsolatesMemory(pBiDi, length) \
    464         ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \
    465                         TRUE, (length)*sizeof(Isolate))
    466 
    467 #endif
    468