1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ubidiwrt.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999aug06 14 * created by: Markus W. Scherer, updated by Matitiahu Allouche 15 * 16 * This file contains implementations for BiDi functions that use 17 * the core algorithm and core API to write reordered text. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "unicode/ustring.h" 22 #include "unicode/uchar.h" 23 #include "unicode/ubidi.h" 24 #include "unicode/utf16.h" 25 #include "cmemory.h" 26 #include "ustr_imp.h" 27 #include "ubidiimp.h" 28 29 /* 30 * The function implementations in this file are designed 31 * for UTF-16 and UTF-32, not for UTF-8. 32 * 33 * Assumptions that are not true for UTF-8: 34 * - Any code point always needs the same number of code units 35 * ("minimum-length-problem" of UTF-8) 36 * - The BiDi control characters need only one code unit each 37 * 38 * Further assumptions for all UTFs: 39 * - u_charMirror(c) needs the same number of code units as c 40 */ 41 #if UTF_SIZE==8 42 # error reimplement ubidi_writeReordered() for UTF-8, see comment above 43 #endif 44 45 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK)) 46 47 /* 48 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we 49 * semantically write RTL runs in reverse and later reverse them again. 50 * Instead, we actually write them in forward order to begin with. 51 * However, if the RTL run was to be mirrored, we need to mirror here now 52 * since the implicit second reversal must not do it. 53 * It looks strange to do mirroring in LTR output, but it is only because 54 * we are writing RTL output in reverse. 55 */ 56 static int32_t 57 doWriteForward(const UChar *src, int32_t srcLength, 58 UChar *dest, int32_t destSize, 59 uint16_t options, 60 UErrorCode *pErrorCode) { 61 /* optimize for several combinations of options */ 62 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) { 63 case 0: { 64 /* simply copy the LTR run to the destination */ 65 int32_t length=srcLength; 66 if(destSize<length) { 67 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 68 return srcLength; 69 } 70 do { 71 *dest++=*src++; 72 } while(--length>0); 73 return srcLength; 74 } 75 case UBIDI_DO_MIRRORING: { 76 /* do mirroring */ 77 int32_t i=0, j=0; 78 UChar32 c; 79 80 if(destSize<srcLength) { 81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 82 return srcLength; 83 } 84 do { 85 U16_NEXT(src, i, srcLength, c); 86 c=u_charMirror(c); 87 U16_APPEND_UNSAFE(dest, j, c); 88 } while(i<srcLength); 89 return srcLength; 90 } 91 case UBIDI_REMOVE_BIDI_CONTROLS: { 92 /* copy the LTR run and remove any BiDi control characters */ 93 int32_t remaining=destSize; 94 UChar c; 95 do { 96 c=*src++; 97 if(!IS_BIDI_CONTROL_CHAR(c)) { 98 if(--remaining<0) { 99 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 100 101 /* preflight the length */ 102 while(--srcLength>0) { 103 c=*src++; 104 if(!IS_BIDI_CONTROL_CHAR(c)) { 105 --remaining; 106 } 107 } 108 return destSize-remaining; 109 } 110 *dest++=c; 111 } 112 } while(--srcLength>0); 113 return destSize-remaining; 114 } 115 default: { 116 /* remove BiDi control characters and do mirroring */ 117 int32_t remaining=destSize; 118 int32_t i, j=0; 119 UChar32 c; 120 do { 121 i=0; 122 U16_NEXT(src, i, srcLength, c); 123 src+=i; 124 srcLength-=i; 125 if(!IS_BIDI_CONTROL_CHAR(c)) { 126 remaining-=i; 127 if(remaining<0) { 128 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 129 130 /* preflight the length */ 131 while(srcLength>0) { 132 c=*src++; 133 if(!IS_BIDI_CONTROL_CHAR(c)) { 134 --remaining; 135 } 136 --srcLength; 137 } 138 return destSize-remaining; 139 } 140 c=u_charMirror(c); 141 U16_APPEND_UNSAFE(dest, j, c); 142 } 143 } while(srcLength>0); 144 return j; 145 } 146 } /* end of switch */ 147 } 148 149 static int32_t 150 doWriteReverse(const UChar *src, int32_t srcLength, 151 UChar *dest, int32_t destSize, 152 uint16_t options, 153 UErrorCode *pErrorCode) { 154 /* 155 * RTL run - 156 * 157 * RTL runs need to be copied to the destination in reverse order 158 * of code points, not code units, to keep Unicode characters intact. 159 * 160 * The general strategy for this is to read the source text 161 * in backward order, collect all code units for a code point 162 * (and optionally following combining characters, see below), 163 * and copy all these code units in ascending order 164 * to the destination for this run. 165 * 166 * Several options request whether combining characters 167 * should be kept after their base characters, 168 * whether BiDi control characters should be removed, and 169 * whether characters should be replaced by their mirror-image 170 * equivalent Unicode characters. 171 */ 172 int32_t i, j; 173 UChar32 c; 174 175 /* optimize for several combinations of options */ 176 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) { 177 case 0: 178 /* 179 * With none of the "complicated" options set, the destination 180 * run will have the same length as the source run, 181 * and there is no mirroring and no keeping combining characters 182 * with their base characters. 183 */ 184 if(destSize<srcLength) { 185 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 186 return srcLength; 187 } 188 destSize=srcLength; 189 190 /* preserve character integrity */ 191 do { 192 /* i is always after the last code unit known to need to be kept in this segment */ 193 i=srcLength; 194 195 /* collect code units for one base character */ 196 U16_BACK_1(src, 0, srcLength); 197 198 /* copy this base character */ 199 j=srcLength; 200 do { 201 *dest++=src[j++]; 202 } while(j<i); 203 } while(srcLength>0); 204 break; 205 case UBIDI_KEEP_BASE_COMBINING: 206 /* 207 * Here, too, the destination 208 * run will have the same length as the source run, 209 * and there is no mirroring. 210 * We do need to keep combining characters with their base characters. 211 */ 212 if(destSize<srcLength) { 213 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 214 return srcLength; 215 } 216 destSize=srcLength; 217 218 /* preserve character integrity */ 219 do { 220 /* i is always after the last code unit known to need to be kept in this segment */ 221 i=srcLength; 222 223 /* collect code units and modifier letters for one base character */ 224 do { 225 U16_PREV(src, 0, srcLength, c); 226 } while(srcLength>0 && IS_COMBINING(u_charType(c))); 227 228 /* copy this "user character" */ 229 j=srcLength; 230 do { 231 *dest++=src[j++]; 232 } while(j<i); 233 } while(srcLength>0); 234 break; 235 default: 236 /* 237 * With several "complicated" options set, this is the most 238 * general and the slowest copying of an RTL run. 239 * We will do mirroring, remove BiDi controls, and 240 * keep combining characters with their base characters 241 * as requested. 242 */ 243 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) { 244 i=srcLength; 245 } else { 246 /* we need to find out the destination length of the run, 247 which will not include the BiDi control characters */ 248 int32_t length=srcLength; 249 UChar ch; 250 251 i=0; 252 do { 253 ch=*src++; 254 if(!IS_BIDI_CONTROL_CHAR(ch)) { 255 ++i; 256 } 257 } while(--length>0); 258 src-=srcLength; 259 } 260 261 if(destSize<i) { 262 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 263 return i; 264 } 265 destSize=i; 266 267 /* preserve character integrity */ 268 do { 269 /* i is always after the last code unit known to need to be kept in this segment */ 270 i=srcLength; 271 272 /* collect code units for one base character */ 273 U16_PREV(src, 0, srcLength, c); 274 if(options&UBIDI_KEEP_BASE_COMBINING) { 275 /* collect modifier letters for this base character */ 276 while(srcLength>0 && IS_COMBINING(u_charType(c))) { 277 U16_PREV(src, 0, srcLength, c); 278 } 279 } 280 281 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) { 282 /* do not copy this BiDi control character */ 283 continue; 284 } 285 286 /* copy this "user character" */ 287 j=srcLength; 288 if(options&UBIDI_DO_MIRRORING) { 289 /* mirror only the base character */ 290 int32_t k=0; 291 c=u_charMirror(c); 292 U16_APPEND_UNSAFE(dest, k, c); 293 dest+=k; 294 j+=k; 295 } 296 while(j<i) { 297 *dest++=src[j++]; 298 } 299 } while(srcLength>0); 300 break; 301 } /* end of switch */ 302 303 return destSize; 304 } 305 306 U_CAPI int32_t U_EXPORT2 307 ubidi_writeReverse(const UChar *src, int32_t srcLength, 308 UChar *dest, int32_t destSize, 309 uint16_t options, 310 UErrorCode *pErrorCode) { 311 int32_t destLength; 312 313 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 314 return 0; 315 } 316 317 /* more error checking */ 318 if( src==NULL || srcLength<-1 || 319 destSize<0 || (destSize>0 && dest==NULL)) 320 { 321 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 322 return 0; 323 } 324 325 /* do input and output overlap? */ 326 if( dest!=NULL && 327 ((src>=dest && src<dest+destSize) || 328 (dest>=src && dest<src+srcLength))) 329 { 330 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 331 return 0; 332 } 333 334 if(srcLength==-1) { 335 srcLength=u_strlen(src); 336 } 337 if(srcLength>0) { 338 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode); 339 } else { 340 /* nothing to do */ 341 destLength=0; 342 } 343 344 return u_terminateUChars(dest, destSize, destLength, pErrorCode); 345 } 346 347 U_CAPI int32_t U_EXPORT2 348 ubidi_writeReordered(UBiDi *pBiDi, 349 UChar *dest, int32_t destSize, 350 uint16_t options, 351 UErrorCode *pErrorCode) { 352 const UChar *text; 353 UChar *saveDest; 354 int32_t length, destCapacity; 355 int32_t run, runCount, logicalStart, runLength; 356 357 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 358 return 0; 359 } 360 361 /* more error checking */ 362 if( pBiDi==NULL || 363 (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 || 364 destSize<0 || (destSize>0 && dest==NULL)) 365 { 366 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 367 return 0; 368 } 369 370 /* do input and output overlap? */ 371 if( dest!=NULL && 372 ((text>=dest && text<dest+destSize) || 373 (dest>=text && dest<text+pBiDi->originalLength))) 374 { 375 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 376 return 0; 377 } 378 379 if(length==0) { 380 /* nothing to do */ 381 return u_terminateUChars(dest, destSize, 0, pErrorCode); 382 } 383 384 runCount=ubidi_countRuns(pBiDi, pErrorCode); 385 if(U_FAILURE(*pErrorCode)) { 386 return 0; 387 } 388 389 /* destSize shrinks, later destination length=destCapacity-destSize */ 390 saveDest=dest; 391 destCapacity=destSize; 392 393 /* 394 * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the 395 * reordering mode (checked below) is appropriate. 396 */ 397 if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) { 398 options|=UBIDI_INSERT_LRM_FOR_NUMERIC; 399 options&=~UBIDI_REMOVE_BIDI_CONTROLS; 400 } 401 /* 402 * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS 403 * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC. 404 */ 405 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) { 406 options|=UBIDI_REMOVE_BIDI_CONTROLS; 407 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; 408 } 409 /* 410 * If we do not perform the "inverse BiDi" algorithm, then we 411 * don't need to insert any LRMs, and don't need to test for it. 412 */ 413 if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) && 414 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT) && 415 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) && 416 (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) { 417 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; 418 } 419 /* 420 * Iterate through all visual runs and copy the run text segments to 421 * the destination, according to the options. 422 * 423 * The tests for where to insert LRMs ignore the fact that there may be 424 * BN codes or non-BMP code points at the beginning and end of a run; 425 * they may insert LRMs unnecessarily but the tests are faster this way 426 * (this would have to be improved for UTF-8). 427 * 428 * Note that the only errors that are set by doWriteXY() are buffer overflow 429 * errors. Ignore them until the end, and continue for preflighting. 430 */ 431 if(!(options&UBIDI_OUTPUT_REVERSE)) { 432 /* forward output */ 433 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { 434 /* do not insert BiDi controls */ 435 for(run=0; run<runCount; ++run) { 436 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { 437 runLength=doWriteForward(text+logicalStart, runLength, 438 dest, destSize, 439 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 440 } else { 441 runLength=doWriteReverse(text+logicalStart, runLength, 442 dest, destSize, 443 options, pErrorCode); 444 } 445 if(dest!=NULL) { 446 dest+=runLength; 447 } 448 destSize-=runLength; 449 } 450 } else { 451 /* insert BiDi controls for "inverse BiDi" */ 452 const DirProp *dirProps=pBiDi->dirProps; 453 const UChar *src; 454 UChar uc; 455 UBiDiDirection dir; 456 int32_t markFlag; 457 458 for(run=0; run<runCount; ++run) { 459 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); 460 src=text+logicalStart; 461 /* check if something relevant in insertPoints */ 462 markFlag=pBiDi->runs[run].insertRemove; 463 if(markFlag<0) { /* BiDi controls count */ 464 markFlag=0; 465 } 466 467 if(UBIDI_LTR==dir) { 468 if((pBiDi->isInverse) && 469 (/*run>0 &&*/ dirProps[logicalStart]!=L)) { 470 markFlag |= LRM_BEFORE; 471 } 472 if (markFlag & LRM_BEFORE) { 473 uc=LRM_CHAR; 474 } 475 else if (markFlag & RLM_BEFORE) { 476 uc=RLM_CHAR; 477 } 478 else uc=0; 479 if(uc) { 480 if(destSize>0) { 481 *dest++=uc; 482 } 483 --destSize; 484 } 485 486 runLength=doWriteForward(src, runLength, 487 dest, destSize, 488 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 489 if(dest!=NULL) { 490 dest+=runLength; 491 } 492 destSize-=runLength; 493 494 if((pBiDi->isInverse) && 495 (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) { 496 markFlag |= LRM_AFTER; 497 } 498 if (markFlag & LRM_AFTER) { 499 uc=LRM_CHAR; 500 } 501 else if (markFlag & RLM_AFTER) { 502 uc=RLM_CHAR; 503 } 504 else uc=0; 505 if(uc) { 506 if(destSize>0) { 507 *dest++=uc; 508 } 509 --destSize; 510 } 511 } else { /* RTL run */ 512 if((pBiDi->isInverse) && 513 (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) { 514 markFlag |= RLM_BEFORE; 515 } 516 if (markFlag & LRM_BEFORE) { 517 uc=LRM_CHAR; 518 } 519 else if (markFlag & RLM_BEFORE) { 520 uc=RLM_CHAR; 521 } 522 else uc=0; 523 if(uc) { 524 if(destSize>0) { 525 *dest++=uc; 526 } 527 --destSize; 528 } 529 530 runLength=doWriteReverse(src, runLength, 531 dest, destSize, 532 options, pErrorCode); 533 if(dest!=NULL) { 534 dest+=runLength; 535 } 536 destSize-=runLength; 537 538 if((pBiDi->isInverse) && 539 (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) { 540 markFlag |= RLM_AFTER; 541 } 542 if (markFlag & LRM_AFTER) { 543 uc=LRM_CHAR; 544 } 545 else if (markFlag & RLM_AFTER) { 546 uc=RLM_CHAR; 547 } 548 else uc=0; 549 if(uc) { 550 if(destSize>0) { 551 *dest++=uc; 552 } 553 --destSize; 554 } 555 } 556 } 557 } 558 } else { 559 /* reverse output */ 560 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { 561 /* do not insert BiDi controls */ 562 for(run=runCount; --run>=0;) { 563 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { 564 runLength=doWriteReverse(text+logicalStart, runLength, 565 dest, destSize, 566 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 567 } else { 568 runLength=doWriteForward(text+logicalStart, runLength, 569 dest, destSize, 570 options, pErrorCode); 571 } 572 if(dest!=NULL) { 573 dest+=runLength; 574 } 575 destSize-=runLength; 576 } 577 } else { 578 /* insert BiDi controls for "inverse BiDi" */ 579 const DirProp *dirProps=pBiDi->dirProps; 580 const UChar *src; 581 UBiDiDirection dir; 582 583 for(run=runCount; --run>=0;) { 584 /* reverse output */ 585 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); 586 src=text+logicalStart; 587 588 if(UBIDI_LTR==dir) { 589 if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) { 590 if(destSize>0) { 591 *dest++=LRM_CHAR; 592 } 593 --destSize; 594 } 595 596 runLength=doWriteReverse(src, runLength, 597 dest, destSize, 598 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 599 if(dest!=NULL) { 600 dest+=runLength; 601 } 602 destSize-=runLength; 603 604 if(/*run>0 &&*/ dirProps[logicalStart]!=L) { 605 if(destSize>0) { 606 *dest++=LRM_CHAR; 607 } 608 --destSize; 609 } 610 } else { 611 if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) { 612 if(destSize>0) { 613 *dest++=RLM_CHAR; 614 } 615 --destSize; 616 } 617 618 runLength=doWriteForward(src, runLength, 619 dest, destSize, 620 options, pErrorCode); 621 if(dest!=NULL) { 622 dest+=runLength; 623 } 624 destSize-=runLength; 625 626 if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) { 627 if(destSize>0) { 628 *dest++=RLM_CHAR; 629 } 630 --destSize; 631 } 632 } 633 } 634 } 635 } 636 637 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode); 638 } 639