1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 1999-2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ubidiimp.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999aug06 14 * created by: Markus W. Scherer, updated by Matitiahu Allouche 15 */ 16 17 #ifndef UBIDIIMP_H 18 #define UBIDIIMP_H 19 20 #include "unicode/utypes.h" 21 #include "unicode/uchar.h" 22 #include "ubidi_props.h" 23 24 /* miscellaneous definitions ---------------------------------------------- */ 25 26 typedef uint8_t DirProp; 27 typedef uint32_t Flags; 28 29 /* Comparing the description of the BiDi algorithm with this implementation 30 is easier with the same names for the BiDi types in the code as there. 31 See UCharDirection in uchar.h . 32 */ 33 enum { 34 L= U_LEFT_TO_RIGHT, /* 0 */ 35 R= U_RIGHT_TO_LEFT, /* 1 */ 36 EN= U_EUROPEAN_NUMBER, /* 2 */ 37 ES= U_EUROPEAN_NUMBER_SEPARATOR, /* 3 */ 38 ET= U_EUROPEAN_NUMBER_TERMINATOR, /* 4 */ 39 AN= U_ARABIC_NUMBER, /* 5 */ 40 CS= U_COMMON_NUMBER_SEPARATOR, /* 6 */ 41 B= U_BLOCK_SEPARATOR, /* 7 */ 42 S= U_SEGMENT_SEPARATOR, /* 8 */ 43 WS= U_WHITE_SPACE_NEUTRAL, /* 9 */ 44 ON= U_OTHER_NEUTRAL, /* 10 */ 45 LRE=U_LEFT_TO_RIGHT_EMBEDDING, /* 11 */ 46 LRO=U_LEFT_TO_RIGHT_OVERRIDE, /* 12 */ 47 AL= U_RIGHT_TO_LEFT_ARABIC, /* 13 */ 48 RLE=U_RIGHT_TO_LEFT_EMBEDDING, /* 14 */ 49 RLO=U_RIGHT_TO_LEFT_OVERRIDE, /* 15 */ 50 PDF=U_POP_DIRECTIONAL_FORMAT, /* 16 */ 51 NSM=U_DIR_NON_SPACING_MARK, /* 17 */ 52 BN= U_BOUNDARY_NEUTRAL, /* 18 */ 53 FSI=U_FIRST_STRONG_ISOLATE, /* 19 */ 54 LRI=U_LEFT_TO_RIGHT_ISOLATE, /* 20 */ 55 RLI=U_RIGHT_TO_LEFT_ISOLATE, /* 21 */ 56 PDI=U_POP_DIRECTIONAL_ISOLATE, /* 22 */ 57 ENL, /* EN after W7 */ /* 23 */ 58 ENR, /* EN not subject to W7 */ /* 24 */ 59 dirPropCount 60 }; 61 62 /* Sometimes, bit values are more appropriate 63 to deal with directionality properties. 64 Abbreviations in these macro names refer to names 65 used in the BiDi algorithm. 66 */ 67 #define DIRPROP_FLAG(dir) (1UL<<(dir)) 68 #define PURE_DIRPROP(prop) ((prop)&~0xE0) ????????????????????????? 69 70 /* special flag for multiple runs from explicit embedding codes */ 71 #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31) 72 73 /* are there any characters that are LTR or RTL? */ 74 #define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI)) 75 #define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI)) 76 #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)) 77 #define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN)) 78 79 /* explicit embedding codes */ 80 #define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF)) 81 82 /* explicit isolate codes */ 83 #define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI)) 84 85 #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT) 86 87 /* paragraph and segment separators */ 88 #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S)) 89 90 /* all types that are counted as White Space or Neutral in some steps */ 91 #define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO) 92 93 /* types that are neutrals or could becomes neutrals in (Wn) */ 94 #define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS) 95 96 /* 97 * These types may be changed to "e", 98 * the embedding type (L or R) of the run, 99 * in the BiDi algorithm (N2) 100 */ 101 #define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N) 102 103 /* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */ 104 #define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1)) 105 106 #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe) 107 108 /* 109 * The following bit is used for the directional isolate status. 110 * Stack entries corresponding to isolate sequences are greater than ISOLATE. 111 */ 112 #define ISOLATE 0x0100 113 114 U_CFUNC UBiDiLevel 115 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index); 116 117 #define GET_PARALEVEL(ubidi, index) \ 118 ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \ 119 (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index)))) 120 121 /* number of paras entries allocated initially without malloc */ 122 #define SIMPLE_PARAS_COUNT 10 123 /* number of isolate entries allocated initially without malloc */ 124 #define SIMPLE_ISOLATES_COUNT 5 125 /* number of isolate run entries for paired brackets allocated initially without malloc */ 126 #define SIMPLE_OPENINGS_COUNT 20 127 128 #define CR 0x000D 129 #define LF 0x000A 130 131 /* Run structure for reordering --------------------------------------------- */ 132 enum { 133 LRM_BEFORE=1, 134 LRM_AFTER=2, 135 RLM_BEFORE=4, 136 RLM_AFTER=8 137 }; 138 139 typedef struct Para { 140 int32_t limit; 141 int32_t level; 142 } Para; 143 144 enum { /* flags for Opening.flags */ 145 FOUND_L=DIRPROP_FLAG(L), 146 FOUND_R=DIRPROP_FLAG(R) 147 }; 148 149 typedef struct Opening { 150 int32_t position; /* position of opening bracket */ 151 int32_t match; /* matching char or -position of closing bracket */ 152 int32_t contextPos; /* position of last strong char found before opening */ 153 uint16_t flags; /* bits for L or R/AL found within the pair */ 154 UBiDiDirection contextDir; /* L or R according to last strong char before opening */ 155 uint8_t filler; /* to complete a nice multiple of 4 chars */ 156 } Opening; 157 158 typedef struct IsoRun { 159 int32_t contextPos; /* position of char determining context */ 160 uint16_t start; /* index of first opening entry for this run */ 161 uint16_t limit; /* index after last opening entry for this run */ 162 UBiDiLevel level; /* level of this run */ 163 DirProp lastStrong; /* bidi class of last strong char found in this run */ 164 DirProp lastBase; /* bidi class of last base char found in this run */ 165 UBiDiDirection contextDir; /* L or R to use as context for following openings */ 166 } IsoRun; 167 168 typedef struct BracketData { 169 UBiDi *pBiDi; 170 /* array of opening entries which should be enough in most cases; no malloc() */ 171 Opening simpleOpenings[SIMPLE_OPENINGS_COUNT]; 172 Opening *openings; /* pointer to current array of entries */ 173 int32_t openingsCount; /* number of allocated entries */ 174 int32_t isoRunLast; /* index of last used entry */ 175 /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL 176 + 1 for index 0, + 1 for before the first isolated sequence */ 177 IsoRun isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2]; 178 UBool isNumbersSpecial; /* reordering mode for NUMBERS_SPECIAL */ 179 } BracketData; 180 181 typedef struct Isolate { 182 int32_t startON; 183 int32_t start1; 184 int32_t state; 185 int16_t stateImp; 186 } Isolate; 187 188 typedef struct Run { 189 int32_t logicalStart, /* first character of the run; b31 indicates even/odd level */ 190 visualLimit, /* last visual position of the run +1 */ 191 insertRemove; /* if >0, flags for inserting LRM/RLM before/after run, 192 if <0, count of bidi controls within run */ 193 } Run; 194 195 /* in a Run, logicalStart will get this bit set if the run level is odd */ 196 #define INDEX_ODD_BIT (1UL<<31) 197 198 #define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)(level)<<31)) 199 #define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)(level)<<31)) 200 #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT) 201 202 #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT) 203 #define GET_ODD_BIT(x) ((uint32_t)(x)>>31) 204 #define IS_ODD_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)!=0)) 205 #define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0)) 206 207 U_CFUNC UBool 208 ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode); 209 210 /** BiDi control code points */ 211 enum { 212 ZWNJ_CHAR=0x200c, 213 ZWJ_CHAR, 214 LRM_CHAR, 215 RLM_CHAR, 216 LRE_CHAR=0x202a, 217 RLE_CHAR, 218 PDF_CHAR, 219 LRO_CHAR, 220 RLO_CHAR, 221 LRI_CHAR=0x2066, 222 RLI_CHAR, 223 FSI_CHAR, 224 PDI_CHAR 225 }; 226 227 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4) 228 229 /* InsertPoints structure for noting where to put BiDi marks ---------------- */ 230 231 typedef struct Point { 232 int32_t pos; /* position in text */ 233 int32_t flag; /* flag for LRM/RLM, before/after */ 234 } Point; 235 236 typedef struct InsertPoints { 237 int32_t capacity; /* number of points allocated */ 238 int32_t size; /* number of points used */ 239 int32_t confirmed; /* number of points confirmed */ 240 UErrorCode errorCode; /* for eventual memory shortage */ 241 Point *points; /* pointer to array of points */ 242 } InsertPoints; 243 244 245 /* UBiDi structure ----------------------------------------------------------- */ 246 247 struct UBiDi { 248 /* pointer to parent paragraph object (pointer to self if this object is 249 * a paragraph object); set to NULL in a newly opened object; set to a 250 * real value after a successful execution of ubidi_setPara or ubidi_setLine 251 */ 252 const UBiDi * pParaBiDi; 253 254 const UBiDiProps *bdp; 255 256 /* alias pointer to the current text */ 257 const UChar *text; 258 259 /* length of the current text */ 260 int32_t originalLength; 261 262 /* if the UBIDI_OPTION_STREAMING option is set, this is the length 263 * of text actually processed by ubidi_setPara, which may be shorter than 264 * the original length. 265 * Otherwise, it is identical to the original length. 266 */ 267 int32_t length; 268 269 /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or 270 * marks are allowed to be inserted in one of the reordering mode, the 271 * length of the result string may be different from the processed length. 272 */ 273 int32_t resultLength; 274 275 /* memory sizes in bytes */ 276 int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize; 277 278 /* allocated memory */ 279 DirProp *dirPropsMemory; 280 UBiDiLevel *levelsMemory; 281 Opening *openingsMemory; 282 Para *parasMemory; 283 Run *runsMemory; 284 Isolate *isolatesMemory; 285 286 /* indicators for whether memory may be allocated after ubidi_open() */ 287 UBool mayAllocateText, mayAllocateRuns; 288 289 /* arrays with one value per text-character */ 290 DirProp *dirProps; 291 UBiDiLevel *levels; 292 293 /* are we performing an approximation of the "inverse BiDi" algorithm? */ 294 UBool isInverse; 295 296 /* are we using the basic algorithm or its variation? */ 297 UBiDiReorderingMode reorderingMode; 298 299 /* UBIDI_REORDER_xxx values must be ordered so that all the regular 300 * logical to visual modes come first, and all inverse BiDi modes 301 * come last. 302 */ 303 #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL UBIDI_REORDER_NUMBERS_SPECIAL 304 305 /* bitmask for reordering options */ 306 uint32_t reorderingOptions; 307 308 /* must block separators receive level 0? */ 309 UBool orderParagraphsLTR; 310 311 /* the paragraph level */ 312 UBiDiLevel paraLevel; 313 /* original paraLevel when contextual */ 314 /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */ 315 UBiDiLevel defaultParaLevel; 316 317 /* context data */ 318 const UChar *prologue; 319 int32_t proLength; 320 const UChar *epilogue; 321 int32_t epiLength; 322 323 /* the following is set in ubidi_setPara, used in processPropertySeq */ 324 const struct ImpTabPair * pImpTabPair; /* pointer to levels state table pair */ 325 326 /* the overall paragraph or line directionality - see UBiDiDirection */ 327 UBiDiDirection direction; 328 329 /* flags is a bit set for which directional properties are in the text */ 330 Flags flags; 331 332 /* lastArabicPos is index to the last AL in the text, -1 if none */ 333 int32_t lastArabicPos; 334 335 /* characters after trailingWSStart are WS and are */ 336 /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */ 337 int32_t trailingWSStart; 338 339 /* fields for paragraph handling */ 340 int32_t paraCount; /* set in getDirProps() */ 341 /* filled in getDirProps() */ 342 Para *paras; 343 344 /* for relatively short text, we only need a tiny array of paras (no malloc()) */ 345 Para simpleParas[SIMPLE_PARAS_COUNT]; 346 347 /* fields for line reordering */ 348 int32_t runCount; /* ==-1: runs not set up yet */ 349 Run *runs; 350 351 /* for non-mixed text, we only need a tiny array of runs (no malloc()) */ 352 Run simpleRuns[1]; 353 354 /* maximum or current nesting depth of isolate sequences */ 355 /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal 356 nesting encountered. 357 Within resolveImplicitLevels(), this is the index of the current isolates 358 stack entry. */ 359 int32_t isolateCount; 360 Isolate *isolates; 361 362 /* for simple text, have a small stack (no malloc()) */ 363 Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT]; 364 365 /* for inverse Bidi with insertion of directional marks */ 366 InsertPoints insertPoints; 367 368 /* for option UBIDI_OPTION_REMOVE_CONTROLS */ 369 int32_t controlCount; 370 371 /* for Bidi class callback */ 372 UBiDiClassCallback *fnClassCallback; /* action pointer */ 373 const void *coClassCallback; /* context pointer */ 374 }; 375 376 #define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x))) 377 #define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi))) 378 379 typedef union { 380 DirProp *dirPropsMemory; 381 UBiDiLevel *levelsMemory; 382 Opening *openingsMemory; 383 Para *parasMemory; 384 Run *runsMemory; 385 Isolate *isolatesMemory; 386 } BidiMemoryForAllocation; 387 388 /* Macros for initial checks at function entry */ 389 #define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue) \ 390 if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue 391 #define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue) \ 392 if(!IS_VALID_PARA(bidi)) { \ 393 errcode=U_INVALID_STATE_ERROR; \ 394 return retvalue; \ 395 } 396 #define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue) \ 397 if(!IS_VALID_PARA_OR_LINE(bidi)) { \ 398 errcode=U_INVALID_STATE_ERROR; \ 399 return retvalue; \ 400 } 401 #define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue) \ 402 if((arg)<(start) || (arg)>=(limit)) { \ 403 (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \ 404 return retvalue; \ 405 } 406 407 #define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode) \ 408 if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return 409 #define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode) \ 410 if(!IS_VALID_PARA(bidi)) { \ 411 errcode=U_INVALID_STATE_ERROR; \ 412 return; \ 413 } 414 #define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode) \ 415 if(!IS_VALID_PARA_OR_LINE(bidi)) { \ 416 errcode=U_INVALID_STATE_ERROR; \ 417 return; \ 418 } 419 #define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode) \ 420 if((arg)<(start) || (arg)>=(limit)) { \ 421 (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \ 422 return; \ 423 } 424 425 /* helper function to (re)allocate memory if allowed */ 426 U_CFUNC UBool 427 ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded); 428 429 /* helper macros for each allocated array in UBiDi */ 430 #define getDirPropsMemory(pBiDi, length) \ 431 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \ 432 (pBiDi)->mayAllocateText, (length)) 433 434 #define getLevelsMemory(pBiDi, length) \ 435 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \ 436 (pBiDi)->mayAllocateText, (length)) 437 438 #define getRunsMemory(pBiDi, length) \ 439 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \ 440 (pBiDi)->mayAllocateRuns, (length)*sizeof(Run)) 441 442 /* additional macros used by ubidi_open() - always allow allocation */ 443 #define getInitialDirPropsMemory(pBiDi, length) \ 444 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \ 445 TRUE, (length)) 446 447 #define getInitialLevelsMemory(pBiDi, length) \ 448 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \ 449 TRUE, (length)) 450 451 #define getInitialOpeningsMemory(pBiDi, length) \ 452 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \ 453 TRUE, (length)*sizeof(Opening)) 454 455 #define getInitialParasMemory(pBiDi, length) \ 456 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \ 457 TRUE, (length)*sizeof(Para)) 458 459 #define getInitialRunsMemory(pBiDi, length) \ 460 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \ 461 TRUE, (length)*sizeof(Run)) 462 463 #define getInitialIsolatesMemory(pBiDi, length) \ 464 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \ 465 TRUE, (length)*sizeof(Isolate)) 466 467 #endif 468