1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ubidiwrt.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999aug06 14 * created by: Markus W. Scherer, updated by Matitiahu Allouche 15 * 16 * This file contains implementations for BiDi functions that use 17 * the core algorithm and core API to write reordered text. 18 */ 19 20 /* set import/export definitions */ 21 #ifndef U_COMMON_IMPLEMENTATION 22 # define U_COMMON_IMPLEMENTATION 23 #endif 24 25 #include "unicode/utypes.h" 26 #include "unicode/ustring.h" 27 #include "unicode/uchar.h" 28 #include "unicode/ubidi.h" 29 #include "unicode/utf16.h" 30 #include "cmemory.h" 31 #include "ustr_imp.h" 32 #include "ubidiimp.h" 33 34 /* 35 * The function implementations in this file are designed 36 * for UTF-16 and UTF-32, not for UTF-8. 37 * 38 * Assumptions that are not true for UTF-8: 39 * - Any code point always needs the same number of code units 40 * ("minimum-length-problem" of UTF-8) 41 * - The BiDi control characters need only one code unit each 42 * 43 * Further assumptions for all UTFs: 44 * - u_charMirror(c) needs the same number of code units as c 45 */ 46 #if UTF_SIZE==8 47 # error reimplement ubidi_writeReordered() for UTF-8, see comment above 48 #endif 49 50 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK)) 51 52 /* 53 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we 54 * semantically write RTL runs in reverse and later reverse them again. 55 * Instead, we actually write them in forward order to begin with. 56 * However, if the RTL run was to be mirrored, we need to mirror here now 57 * since the implicit second reversal must not do it. 58 * It looks strange to do mirroring in LTR output, but it is only because 59 * we are writing RTL output in reverse. 60 */ 61 static int32_t 62 doWriteForward(const UChar *src, int32_t srcLength, 63 UChar *dest, int32_t destSize, 64 uint16_t options, 65 UErrorCode *pErrorCode) { 66 /* optimize for several combinations of options */ 67 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) { 68 case 0: { 69 /* simply copy the LTR run to the destination */ 70 int32_t length=srcLength; 71 if(destSize<length) { 72 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 73 return srcLength; 74 } 75 do { 76 *dest++=*src++; 77 } while(--length>0); 78 return srcLength; 79 } 80 case UBIDI_DO_MIRRORING: { 81 /* do mirroring */ 82 int32_t i=0, j=0; 83 UChar32 c; 84 85 if(destSize<srcLength) { 86 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 87 return srcLength; 88 } 89 do { 90 U16_NEXT(src, i, srcLength, c); 91 c=u_charMirror(c); 92 U16_APPEND_UNSAFE(dest, j, c); 93 } while(i<srcLength); 94 return srcLength; 95 } 96 case UBIDI_REMOVE_BIDI_CONTROLS: { 97 /* copy the LTR run and remove any BiDi control characters */ 98 int32_t remaining=destSize; 99 UChar c; 100 do { 101 c=*src++; 102 if(!IS_BIDI_CONTROL_CHAR(c)) { 103 if(--remaining<0) { 104 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 105 106 /* preflight the length */ 107 while(--srcLength>0) { 108 c=*src++; 109 if(!IS_BIDI_CONTROL_CHAR(c)) { 110 --remaining; 111 } 112 } 113 return destSize-remaining; 114 } 115 *dest++=c; 116 } 117 } while(--srcLength>0); 118 return destSize-remaining; 119 } 120 default: { 121 /* remove BiDi control characters and do mirroring */ 122 int32_t remaining=destSize; 123 int32_t i, j=0; 124 UChar32 c; 125 do { 126 i=0; 127 U16_NEXT(src, i, srcLength, c); 128 src+=i; 129 srcLength-=i; 130 if(!IS_BIDI_CONTROL_CHAR(c)) { 131 remaining-=i; 132 if(remaining<0) { 133 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 134 135 /* preflight the length */ 136 while(srcLength>0) { 137 c=*src++; 138 if(!IS_BIDI_CONTROL_CHAR(c)) { 139 --remaining; 140 } 141 --srcLength; 142 } 143 return destSize-remaining; 144 } 145 c=u_charMirror(c); 146 U16_APPEND_UNSAFE(dest, j, c); 147 } 148 } while(srcLength>0); 149 return j; 150 } 151 } /* end of switch */ 152 } 153 154 static int32_t 155 doWriteReverse(const UChar *src, int32_t srcLength, 156 UChar *dest, int32_t destSize, 157 uint16_t options, 158 UErrorCode *pErrorCode) { 159 /* 160 * RTL run - 161 * 162 * RTL runs need to be copied to the destination in reverse order 163 * of code points, not code units, to keep Unicode characters intact. 164 * 165 * The general strategy for this is to read the source text 166 * in backward order, collect all code units for a code point 167 * (and optionally following combining characters, see below), 168 * and copy all these code units in ascending order 169 * to the destination for this run. 170 * 171 * Several options request whether combining characters 172 * should be kept after their base characters, 173 * whether BiDi control characters should be removed, and 174 * whether characters should be replaced by their mirror-image 175 * equivalent Unicode characters. 176 */ 177 int32_t i, j; 178 UChar32 c; 179 180 /* optimize for several combinations of options */ 181 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) { 182 case 0: 183 /* 184 * With none of the "complicated" options set, the destination 185 * run will have the same length as the source run, 186 * and there is no mirroring and no keeping combining characters 187 * with their base characters. 188 */ 189 if(destSize<srcLength) { 190 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 191 return srcLength; 192 } 193 destSize=srcLength; 194 195 /* preserve character integrity */ 196 do { 197 /* i is always after the last code unit known to need to be kept in this segment */ 198 i=srcLength; 199 200 /* collect code units for one base character */ 201 U16_BACK_1(src, 0, srcLength); 202 203 /* copy this base character */ 204 j=srcLength; 205 do { 206 *dest++=src[j++]; 207 } while(j<i); 208 } while(srcLength>0); 209 break; 210 case UBIDI_KEEP_BASE_COMBINING: 211 /* 212 * Here, too, the destination 213 * run will have the same length as the source run, 214 * and there is no mirroring. 215 * We do need to keep combining characters with their base characters. 216 */ 217 if(destSize<srcLength) { 218 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 219 return srcLength; 220 } 221 destSize=srcLength; 222 223 /* preserve character integrity */ 224 do { 225 /* i is always after the last code unit known to need to be kept in this segment */ 226 i=srcLength; 227 228 /* collect code units and modifier letters for one base character */ 229 do { 230 U16_PREV(src, 0, srcLength, c); 231 } while(srcLength>0 && IS_COMBINING(u_charType(c))); 232 233 /* copy this "user character" */ 234 j=srcLength; 235 do { 236 *dest++=src[j++]; 237 } while(j<i); 238 } while(srcLength>0); 239 break; 240 default: 241 /* 242 * With several "complicated" options set, this is the most 243 * general and the slowest copying of an RTL run. 244 * We will do mirroring, remove BiDi controls, and 245 * keep combining characters with their base characters 246 * as requested. 247 */ 248 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) { 249 i=srcLength; 250 } else { 251 /* we need to find out the destination length of the run, 252 which will not include the BiDi control characters */ 253 int32_t length=srcLength; 254 UChar ch; 255 256 i=0; 257 do { 258 ch=*src++; 259 if(!IS_BIDI_CONTROL_CHAR(ch)) { 260 ++i; 261 } 262 } while(--length>0); 263 src-=srcLength; 264 } 265 266 if(destSize<i) { 267 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 268 return i; 269 } 270 destSize=i; 271 272 /* preserve character integrity */ 273 do { 274 /* i is always after the last code unit known to need to be kept in this segment */ 275 i=srcLength; 276 277 /* collect code units for one base character */ 278 U16_PREV(src, 0, srcLength, c); 279 if(options&UBIDI_KEEP_BASE_COMBINING) { 280 /* collect modifier letters for this base character */ 281 while(srcLength>0 && IS_COMBINING(u_charType(c))) { 282 U16_PREV(src, 0, srcLength, c); 283 } 284 } 285 286 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) { 287 /* do not copy this BiDi control character */ 288 continue; 289 } 290 291 /* copy this "user character" */ 292 j=srcLength; 293 if(options&UBIDI_DO_MIRRORING) { 294 /* mirror only the base character */ 295 int32_t k=0; 296 c=u_charMirror(c); 297 U16_APPEND_UNSAFE(dest, k, c); 298 dest+=k; 299 j+=k; 300 } 301 while(j<i) { 302 *dest++=src[j++]; 303 } 304 } while(srcLength>0); 305 break; 306 } /* end of switch */ 307 308 return destSize; 309 } 310 311 U_CAPI int32_t U_EXPORT2 312 ubidi_writeReverse(const UChar *src, int32_t srcLength, 313 UChar *dest, int32_t destSize, 314 uint16_t options, 315 UErrorCode *pErrorCode) { 316 int32_t destLength; 317 318 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 319 return 0; 320 } 321 322 /* more error checking */ 323 if( src==NULL || srcLength<-1 || 324 destSize<0 || (destSize>0 && dest==NULL)) 325 { 326 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 327 return 0; 328 } 329 330 /* do input and output overlap? */ 331 if( dest!=NULL && 332 ((src>=dest && src<dest+destSize) || 333 (dest>=src && dest<src+srcLength))) 334 { 335 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 336 return 0; 337 } 338 339 if(srcLength==-1) { 340 srcLength=u_strlen(src); 341 } 342 if(srcLength>0) { 343 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode); 344 } else { 345 /* nothing to do */ 346 destLength=0; 347 } 348 349 return u_terminateUChars(dest, destSize, destLength, pErrorCode); 350 } 351 352 U_CAPI int32_t U_EXPORT2 353 ubidi_writeReordered(UBiDi *pBiDi, 354 UChar *dest, int32_t destSize, 355 uint16_t options, 356 UErrorCode *pErrorCode) { 357 const UChar *text; 358 UChar *saveDest; 359 int32_t length, destCapacity; 360 int32_t run, runCount, logicalStart, runLength; 361 362 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 363 return 0; 364 } 365 366 /* more error checking */ 367 if( pBiDi==NULL || 368 (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 || 369 destSize<0 || (destSize>0 && dest==NULL)) 370 { 371 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 372 return 0; 373 } 374 375 /* do input and output overlap? */ 376 if( dest!=NULL && 377 ((text>=dest && text<dest+destSize) || 378 (dest>=text && dest<text+pBiDi->originalLength))) 379 { 380 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 381 return 0; 382 } 383 384 if(length==0) { 385 /* nothing to do */ 386 return u_terminateUChars(dest, destSize, 0, pErrorCode); 387 } 388 389 runCount=ubidi_countRuns(pBiDi, pErrorCode); 390 if(U_FAILURE(*pErrorCode)) { 391 return 0; 392 } 393 394 /* destSize shrinks, later destination length=destCapacity-destSize */ 395 saveDest=dest; 396 destCapacity=destSize; 397 398 /* 399 * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the 400 * reordering mode (checked below) is appropriate. 401 */ 402 if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) { 403 options|=UBIDI_INSERT_LRM_FOR_NUMERIC; 404 options&=~UBIDI_REMOVE_BIDI_CONTROLS; 405 } 406 /* 407 * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS 408 * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC. 409 */ 410 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) { 411 options|=UBIDI_REMOVE_BIDI_CONTROLS; 412 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; 413 } 414 /* 415 * If we do not perform the "inverse BiDi" algorithm, then we 416 * don't need to insert any LRMs, and don't need to test for it. 417 */ 418 if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) && 419 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT) && 420 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) && 421 (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) { 422 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; 423 } 424 /* 425 * Iterate through all visual runs and copy the run text segments to 426 * the destination, according to the options. 427 * 428 * The tests for where to insert LRMs ignore the fact that there may be 429 * BN codes or non-BMP code points at the beginning and end of a run; 430 * they may insert LRMs unnecessarily but the tests are faster this way 431 * (this would have to be improved for UTF-8). 432 * 433 * Note that the only errors that are set by doWriteXY() are buffer overflow 434 * errors. Ignore them until the end, and continue for preflighting. 435 */ 436 if(!(options&UBIDI_OUTPUT_REVERSE)) { 437 /* forward output */ 438 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { 439 /* do not insert BiDi controls */ 440 for(run=0; run<runCount; ++run) { 441 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { 442 runLength=doWriteForward(text+logicalStart, runLength, 443 dest, destSize, 444 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 445 } else { 446 runLength=doWriteReverse(text+logicalStart, runLength, 447 dest, destSize, 448 options, pErrorCode); 449 } 450 if(dest!=NULL) { 451 dest+=runLength; 452 } 453 destSize-=runLength; 454 } 455 } else { 456 /* insert BiDi controls for "inverse BiDi" */ 457 const DirProp *dirProps=pBiDi->dirProps; 458 const UChar *src; 459 UChar uc; 460 UBiDiDirection dir; 461 int32_t markFlag; 462 463 for(run=0; run<runCount; ++run) { 464 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); 465 src=text+logicalStart; 466 /* check if something relevant in insertPoints */ 467 markFlag=pBiDi->runs[run].insertRemove; 468 if(markFlag<0) { /* BiDi controls count */ 469 markFlag=0; 470 } 471 472 if(UBIDI_LTR==dir) { 473 if((pBiDi->isInverse) && 474 (/*run>0 &&*/ dirProps[logicalStart]!=L)) { 475 markFlag |= LRM_BEFORE; 476 } 477 if (markFlag & LRM_BEFORE) { 478 uc=LRM_CHAR; 479 } 480 else if (markFlag & RLM_BEFORE) { 481 uc=RLM_CHAR; 482 } 483 else uc=0; 484 if(uc) { 485 if(destSize>0) { 486 *dest++=uc; 487 } 488 --destSize; 489 } 490 491 runLength=doWriteForward(src, runLength, 492 dest, destSize, 493 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 494 if(dest!=NULL) { 495 dest+=runLength; 496 } 497 destSize-=runLength; 498 499 if((pBiDi->isInverse) && 500 (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) { 501 markFlag |= LRM_AFTER; 502 } 503 if (markFlag & LRM_AFTER) { 504 uc=LRM_CHAR; 505 } 506 else if (markFlag & RLM_AFTER) { 507 uc=RLM_CHAR; 508 } 509 else uc=0; 510 if(uc) { 511 if(destSize>0) { 512 *dest++=uc; 513 } 514 --destSize; 515 } 516 } else { /* RTL run */ 517 if((pBiDi->isInverse) && 518 (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) { 519 markFlag |= RLM_BEFORE; 520 } 521 if (markFlag & LRM_BEFORE) { 522 uc=LRM_CHAR; 523 } 524 else if (markFlag & RLM_BEFORE) { 525 uc=RLM_CHAR; 526 } 527 else uc=0; 528 if(uc) { 529 if(destSize>0) { 530 *dest++=uc; 531 } 532 --destSize; 533 } 534 535 runLength=doWriteReverse(src, runLength, 536 dest, destSize, 537 options, pErrorCode); 538 if(dest!=NULL) { 539 dest+=runLength; 540 } 541 destSize-=runLength; 542 543 if((pBiDi->isInverse) && 544 (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) { 545 markFlag |= RLM_AFTER; 546 } 547 if (markFlag & LRM_AFTER) { 548 uc=LRM_CHAR; 549 } 550 else if (markFlag & RLM_AFTER) { 551 uc=RLM_CHAR; 552 } 553 else uc=0; 554 if(uc) { 555 if(destSize>0) { 556 *dest++=uc; 557 } 558 --destSize; 559 } 560 } 561 } 562 } 563 } else { 564 /* reverse output */ 565 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { 566 /* do not insert BiDi controls */ 567 for(run=runCount; --run>=0;) { 568 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { 569 runLength=doWriteReverse(text+logicalStart, runLength, 570 dest, destSize, 571 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 572 } else { 573 runLength=doWriteForward(text+logicalStart, runLength, 574 dest, destSize, 575 options, pErrorCode); 576 } 577 if(dest!=NULL) { 578 dest+=runLength; 579 } 580 destSize-=runLength; 581 } 582 } else { 583 /* insert BiDi controls for "inverse BiDi" */ 584 const DirProp *dirProps=pBiDi->dirProps; 585 const UChar *src; 586 UBiDiDirection dir; 587 588 for(run=runCount; --run>=0;) { 589 /* reverse output */ 590 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); 591 src=text+logicalStart; 592 593 if(UBIDI_LTR==dir) { 594 if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) { 595 if(destSize>0) { 596 *dest++=LRM_CHAR; 597 } 598 --destSize; 599 } 600 601 runLength=doWriteReverse(src, runLength, 602 dest, destSize, 603 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 604 if(dest!=NULL) { 605 dest+=runLength; 606 } 607 destSize-=runLength; 608 609 if(/*run>0 &&*/ dirProps[logicalStart]!=L) { 610 if(destSize>0) { 611 *dest++=LRM_CHAR; 612 } 613 --destSize; 614 } 615 } else { 616 if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) { 617 if(destSize>0) { 618 *dest++=RLM_CHAR; 619 } 620 --destSize; 621 } 622 623 runLength=doWriteForward(src, runLength, 624 dest, destSize, 625 options, pErrorCode); 626 if(dest!=NULL) { 627 dest+=runLength; 628 } 629 destSize-=runLength; 630 631 if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) { 632 if(destSize>0) { 633 *dest++=RLM_CHAR; 634 } 635 --destSize; 636 } 637 } 638 } 639 } 640 } 641 642 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode); 643 } 644