1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2000-2007, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ubidiwrt.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999aug06 14 * created by: Markus W. Scherer, updated by Matitiahu Allouche 15 * 16 * This file contains implementations for BiDi functions that use 17 * the core algorithm and core API to write reordered text. 18 */ 19 20 /* set import/export definitions */ 21 #ifndef U_COMMON_IMPLEMENTATION 22 # define U_COMMON_IMPLEMENTATION 23 #endif 24 25 #include "unicode/utypes.h" 26 #include "unicode/ustring.h" 27 #include "unicode/uchar.h" 28 #include "unicode/ubidi.h" 29 #include "cmemory.h" 30 #include "ustr_imp.h" 31 #include "ubidiimp.h" 32 33 /* 34 * The function implementations in this file are designed 35 * for UTF-16 and UTF-32, not for UTF-8. 36 * 37 * Assumptions that are not true for UTF-8: 38 * - Any code point always needs the same number of code units 39 * ("minimum-length-problem" of UTF-8) 40 * - The BiDi control characters need only one code unit each 41 * 42 * Further assumptions for all UTFs: 43 * - u_charMirror(c) needs the same number of code units as c 44 */ 45 #if UTF_SIZE==8 46 # error reimplement ubidi_writeReordered() for UTF-8, see comment above 47 #endif 48 49 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK)) 50 51 /* 52 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we 53 * semantically write RTL runs in reverse and later reverse them again. 54 * Instead, we actually write them in forward order to begin with. 55 * However, if the RTL run was to be mirrored, we need to mirror here now 56 * since the implicit second reversal must not do it. 57 * It looks strange to do mirroring in LTR output, but it is only because 58 * we are writing RTL output in reverse. 59 */ 60 static int32_t 61 doWriteForward(const UChar *src, int32_t srcLength, 62 UChar *dest, int32_t destSize, 63 uint16_t options, 64 UErrorCode *pErrorCode) { 65 /* optimize for several combinations of options */ 66 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) { 67 case 0: { 68 /* simply copy the LTR run to the destination */ 69 int32_t length=srcLength; 70 if(destSize<length) { 71 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 72 return srcLength; 73 } 74 do { 75 *dest++=*src++; 76 } while(--length>0); 77 return srcLength; 78 } 79 case UBIDI_DO_MIRRORING: { 80 /* do mirroring */ 81 int32_t i=0, j=0; 82 UChar32 c; 83 84 if(destSize<srcLength) { 85 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 86 return srcLength; 87 } 88 do { 89 UTF_NEXT_CHAR(src, i, srcLength, c); 90 c=u_charMirror(c); 91 UTF_APPEND_CHAR_UNSAFE(dest, j, c); 92 } while(i<srcLength); 93 return srcLength; 94 } 95 case UBIDI_REMOVE_BIDI_CONTROLS: { 96 /* copy the LTR run and remove any BiDi control characters */ 97 int32_t remaining=destSize; 98 UChar c; 99 do { 100 c=*src++; 101 if(!IS_BIDI_CONTROL_CHAR(c)) { 102 if(--remaining<0) { 103 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 104 105 /* preflight the length */ 106 while(--srcLength>0) { 107 c=*src++; 108 if(!IS_BIDI_CONTROL_CHAR(c)) { 109 --remaining; 110 } 111 } 112 return destSize-remaining; 113 } 114 *dest++=c; 115 } 116 } while(--srcLength>0); 117 return destSize-remaining; 118 } 119 default: { 120 /* remove BiDi control characters and do mirroring */ 121 int32_t remaining=destSize; 122 int32_t i, j=0; 123 UChar32 c; 124 do { 125 i=0; 126 UTF_NEXT_CHAR(src, i, srcLength, c); 127 src+=i; 128 srcLength-=i; 129 if(!IS_BIDI_CONTROL_CHAR(c)) { 130 remaining-=i; 131 if(remaining<0) { 132 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 133 134 /* preflight the length */ 135 while(srcLength>0) { 136 c=*src++; 137 if(!IS_BIDI_CONTROL_CHAR(c)) { 138 --remaining; 139 } 140 --srcLength; 141 } 142 return destSize-remaining; 143 } 144 c=u_charMirror(c); 145 UTF_APPEND_CHAR_UNSAFE(dest, j, c); 146 } 147 } while(srcLength>0); 148 return j; 149 } 150 } /* end of switch */ 151 } 152 153 static int32_t 154 doWriteReverse(const UChar *src, int32_t srcLength, 155 UChar *dest, int32_t destSize, 156 uint16_t options, 157 UErrorCode *pErrorCode) { 158 /* 159 * RTL run - 160 * 161 * RTL runs need to be copied to the destination in reverse order 162 * of code points, not code units, to keep Unicode characters intact. 163 * 164 * The general strategy for this is to read the source text 165 * in backward order, collect all code units for a code point 166 * (and optionally following combining characters, see below), 167 * and copy all these code units in ascending order 168 * to the destination for this run. 169 * 170 * Several options request whether combining characters 171 * should be kept after their base characters, 172 * whether BiDi control characters should be removed, and 173 * whether characters should be replaced by their mirror-image 174 * equivalent Unicode characters. 175 */ 176 int32_t i, j; 177 UChar32 c; 178 179 /* optimize for several combinations of options */ 180 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) { 181 case 0: 182 /* 183 * With none of the "complicated" options set, the destination 184 * run will have the same length as the source run, 185 * and there is no mirroring and no keeping combining characters 186 * with their base characters. 187 */ 188 if(destSize<srcLength) { 189 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 190 return srcLength; 191 } 192 destSize=srcLength; 193 194 /* preserve character integrity */ 195 do { 196 /* i is always after the last code unit known to need to be kept in this segment */ 197 i=srcLength; 198 199 /* collect code units for one base character */ 200 UTF_BACK_1(src, 0, srcLength); 201 202 /* copy this base character */ 203 j=srcLength; 204 do { 205 *dest++=src[j++]; 206 } while(j<i); 207 } while(srcLength>0); 208 break; 209 case UBIDI_KEEP_BASE_COMBINING: 210 /* 211 * Here, too, the destination 212 * run will have the same length as the source run, 213 * and there is no mirroring. 214 * We do need to keep combining characters with their base characters. 215 */ 216 if(destSize<srcLength) { 217 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 218 return srcLength; 219 } 220 destSize=srcLength; 221 222 /* preserve character integrity */ 223 do { 224 /* i is always after the last code unit known to need to be kept in this segment */ 225 i=srcLength; 226 227 /* collect code units and modifier letters for one base character */ 228 do { 229 UTF_PREV_CHAR(src, 0, srcLength, c); 230 } while(srcLength>0 && IS_COMBINING(u_charType(c))); 231 232 /* copy this "user character" */ 233 j=srcLength; 234 do { 235 *dest++=src[j++]; 236 } while(j<i); 237 } while(srcLength>0); 238 break; 239 default: 240 /* 241 * With several "complicated" options set, this is the most 242 * general and the slowest copying of an RTL run. 243 * We will do mirroring, remove BiDi controls, and 244 * keep combining characters with their base characters 245 * as requested. 246 */ 247 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) { 248 i=srcLength; 249 } else { 250 /* we need to find out the destination length of the run, 251 which will not include the BiDi control characters */ 252 int32_t length=srcLength; 253 UChar ch; 254 255 i=0; 256 do { 257 ch=*src++; 258 if(!IS_BIDI_CONTROL_CHAR(ch)) { 259 ++i; 260 } 261 } while(--length>0); 262 src-=srcLength; 263 } 264 265 if(destSize<i) { 266 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 267 return i; 268 } 269 destSize=i; 270 271 /* preserve character integrity */ 272 do { 273 /* i is always after the last code unit known to need to be kept in this segment */ 274 i=srcLength; 275 276 /* collect code units for one base character */ 277 UTF_PREV_CHAR(src, 0, srcLength, c); 278 if(options&UBIDI_KEEP_BASE_COMBINING) { 279 /* collect modifier letters for this base character */ 280 while(srcLength>0 && IS_COMBINING(u_charType(c))) { 281 UTF_PREV_CHAR(src, 0, srcLength, c); 282 } 283 } 284 285 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) { 286 /* do not copy this BiDi control character */ 287 continue; 288 } 289 290 /* copy this "user character" */ 291 j=srcLength; 292 if(options&UBIDI_DO_MIRRORING) { 293 /* mirror only the base character */ 294 int32_t k=0; 295 c=u_charMirror(c); 296 UTF_APPEND_CHAR_UNSAFE(dest, k, c); 297 dest+=k; 298 j+=k; 299 } 300 while(j<i) { 301 *dest++=src[j++]; 302 } 303 } while(srcLength>0); 304 break; 305 } /* end of switch */ 306 307 return destSize; 308 } 309 310 U_CAPI int32_t U_EXPORT2 311 ubidi_writeReverse(const UChar *src, int32_t srcLength, 312 UChar *dest, int32_t destSize, 313 uint16_t options, 314 UErrorCode *pErrorCode) { 315 int32_t destLength; 316 317 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 318 return 0; 319 } 320 321 /* more error checking */ 322 if( src==NULL || srcLength<-1 || 323 destSize<0 || (destSize>0 && dest==NULL)) 324 { 325 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 326 return 0; 327 } 328 329 /* do input and output overlap? */ 330 if( dest!=NULL && 331 ((src>=dest && src<dest+destSize) || 332 (dest>=src && dest<src+srcLength))) 333 { 334 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 335 return 0; 336 } 337 338 if(srcLength==-1) { 339 srcLength=u_strlen(src); 340 } 341 if(srcLength>0) { 342 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode); 343 } else { 344 /* nothing to do */ 345 destLength=0; 346 } 347 348 return u_terminateUChars(dest, destSize, destLength, pErrorCode); 349 } 350 351 U_CAPI int32_t U_EXPORT2 352 ubidi_writeReordered(UBiDi *pBiDi, 353 UChar *dest, int32_t destSize, 354 uint16_t options, 355 UErrorCode *pErrorCode) { 356 const UChar *text; 357 UChar *saveDest; 358 int32_t length, destCapacity; 359 int32_t run, runCount, logicalStart, runLength; 360 361 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 362 return 0; 363 } 364 365 /* more error checking */ 366 if( pBiDi==NULL || 367 (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 || 368 destSize<0 || (destSize>0 && dest==NULL)) 369 { 370 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 371 return 0; 372 } 373 374 /* do input and output overlap? */ 375 if( dest!=NULL && 376 ((text>=dest && text<dest+destSize) || 377 (dest>=text && dest<text+pBiDi->originalLength))) 378 { 379 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 380 return 0; 381 } 382 383 if(length==0) { 384 /* nothing to do */ 385 return u_terminateUChars(dest, destSize, 0, pErrorCode); 386 } 387 388 runCount=ubidi_countRuns(pBiDi, pErrorCode); 389 if(U_FAILURE(*pErrorCode)) { 390 return 0; 391 } 392 393 /* destSize shrinks, later destination length=destCapacity-destSize */ 394 saveDest=dest; 395 destCapacity=destSize; 396 397 /* 398 * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the 399 * reordering mode (checked below) is appropriate. 400 */ 401 if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) { 402 options|=UBIDI_INSERT_LRM_FOR_NUMERIC; 403 options&=~UBIDI_REMOVE_BIDI_CONTROLS; 404 } 405 /* 406 * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS 407 * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC. 408 */ 409 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) { 410 options|=UBIDI_REMOVE_BIDI_CONTROLS; 411 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; 412 } 413 /* 414 * If we do not perform the "inverse BiDi" algorithm, then we 415 * don't need to insert any LRMs, and don't need to test for it. 416 */ 417 if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) && 418 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT) && 419 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) && 420 (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) { 421 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC; 422 } 423 /* 424 * Iterate through all visual runs and copy the run text segments to 425 * the destination, according to the options. 426 * 427 * The tests for where to insert LRMs ignore the fact that there may be 428 * BN codes or non-BMP code points at the beginning and end of a run; 429 * they may insert LRMs unnecessarily but the tests are faster this way 430 * (this would have to be improved for UTF-8). 431 * 432 * Note that the only errors that are set by doWriteXY() are buffer overflow 433 * errors. Ignore them until the end, and continue for preflighting. 434 */ 435 if(!(options&UBIDI_OUTPUT_REVERSE)) { 436 /* forward output */ 437 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { 438 /* do not insert BiDi controls */ 439 for(run=0; run<runCount; ++run) { 440 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { 441 runLength=doWriteForward(text+logicalStart, runLength, 442 dest, destSize, 443 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 444 } else { 445 runLength=doWriteReverse(text+logicalStart, runLength, 446 dest, destSize, 447 options, pErrorCode); 448 } 449 dest+=runLength; 450 destSize-=runLength; 451 } 452 } else { 453 /* insert BiDi controls for "inverse BiDi" */ 454 const DirProp *dirProps=pBiDi->dirProps; 455 const UChar *src; 456 UChar uc; 457 UBiDiDirection dir; 458 int32_t markFlag; 459 460 for(run=0; run<runCount; ++run) { 461 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); 462 src=text+logicalStart; 463 /* check if something relevant in insertPoints */ 464 markFlag=pBiDi->runs[run].insertRemove; 465 if(markFlag<0) { /* BiDi controls count */ 466 markFlag=0; 467 } 468 469 if(UBIDI_LTR==dir) { 470 if((pBiDi->isInverse) && 471 (/*run>0 &&*/ dirProps[logicalStart]!=L)) { 472 markFlag |= LRM_BEFORE; 473 } 474 if (markFlag & LRM_BEFORE) { 475 uc=LRM_CHAR; 476 } 477 else if (markFlag & RLM_BEFORE) { 478 uc=RLM_CHAR; 479 } 480 else uc=0; 481 if(uc) { 482 if(destSize>0) { 483 *dest++=uc; 484 } 485 --destSize; 486 } 487 488 runLength=doWriteForward(src, runLength, 489 dest, destSize, 490 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 491 dest+=runLength; 492 destSize-=runLength; 493 494 if((pBiDi->isInverse) && 495 (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) { 496 markFlag |= LRM_AFTER; 497 } 498 if (markFlag & LRM_AFTER) { 499 uc=LRM_CHAR; 500 } 501 else if (markFlag & RLM_AFTER) { 502 uc=RLM_CHAR; 503 } 504 else uc=0; 505 if(uc) { 506 if(destSize>0) { 507 *dest++=uc; 508 } 509 --destSize; 510 } 511 } else { /* RTL run */ 512 if((pBiDi->isInverse) && 513 (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) { 514 markFlag |= RLM_BEFORE; 515 } 516 if (markFlag & LRM_BEFORE) { 517 uc=LRM_CHAR; 518 } 519 else if (markFlag & RLM_BEFORE) { 520 uc=RLM_CHAR; 521 } 522 else uc=0; 523 if(uc) { 524 if(destSize>0) { 525 *dest++=uc; 526 } 527 --destSize; 528 } 529 530 runLength=doWriteReverse(src, runLength, 531 dest, destSize, 532 options, pErrorCode); 533 dest+=runLength; 534 destSize-=runLength; 535 536 if((pBiDi->isInverse) && 537 (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) { 538 markFlag |= RLM_AFTER; 539 } 540 if (markFlag & LRM_AFTER) { 541 uc=LRM_CHAR; 542 } 543 else if (markFlag & RLM_AFTER) { 544 uc=RLM_CHAR; 545 } 546 else uc=0; 547 if(uc) { 548 if(destSize>0) { 549 *dest++=uc; 550 } 551 --destSize; 552 } 553 } 554 } 555 } 556 } else { 557 /* reverse output */ 558 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) { 559 /* do not insert BiDi controls */ 560 for(run=runCount; --run>=0;) { 561 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) { 562 runLength=doWriteReverse(text+logicalStart, runLength, 563 dest, destSize, 564 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 565 } else { 566 runLength=doWriteForward(text+logicalStart, runLength, 567 dest, destSize, 568 options, pErrorCode); 569 } 570 dest+=runLength; 571 destSize-=runLength; 572 } 573 } else { 574 /* insert BiDi controls for "inverse BiDi" */ 575 const DirProp *dirProps=pBiDi->dirProps; 576 const UChar *src; 577 UBiDiDirection dir; 578 579 for(run=runCount; --run>=0;) { 580 /* reverse output */ 581 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength); 582 src=text+logicalStart; 583 584 if(UBIDI_LTR==dir) { 585 if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) { 586 if(destSize>0) { 587 *dest++=LRM_CHAR; 588 } 589 --destSize; 590 } 591 592 runLength=doWriteReverse(src, runLength, 593 dest, destSize, 594 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode); 595 dest+=runLength; 596 destSize-=runLength; 597 598 if(/*run>0 &&*/ dirProps[logicalStart]!=L) { 599 if(destSize>0) { 600 *dest++=LRM_CHAR; 601 } 602 --destSize; 603 } 604 } else { 605 if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) { 606 if(destSize>0) { 607 *dest++=RLM_CHAR; 608 } 609 --destSize; 610 } 611 612 runLength=doWriteForward(src, runLength, 613 dest, destSize, 614 options, pErrorCode); 615 dest+=runLength; 616 destSize-=runLength; 617 618 if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) { 619 if(destSize>0) { 620 *dest++=RLM_CHAR; 621 } 622 --destSize; 623 } 624 } 625 } 626 } 627 } 628 629 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode); 630 } 631