Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2007, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ubidiwrt.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999aug06
     14 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
     15 *
     16 * This file contains implementations for BiDi functions that use
     17 * the core algorithm and core API to write reordered text.
     18 */
     19 
     20 /* set import/export definitions */
     21 #ifndef U_COMMON_IMPLEMENTATION
     22 #   define U_COMMON_IMPLEMENTATION
     23 #endif
     24 
     25 #include "unicode/utypes.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/uchar.h"
     28 #include "unicode/ubidi.h"
     29 #include "cmemory.h"
     30 #include "ustr_imp.h"
     31 #include "ubidiimp.h"
     32 
     33 /*
     34  * The function implementations in this file are designed
     35  * for UTF-16 and UTF-32, not for UTF-8.
     36  *
     37  * Assumptions that are not true for UTF-8:
     38  * - Any code point always needs the same number of code units
     39  *   ("minimum-length-problem" of UTF-8)
     40  * - The BiDi control characters need only one code unit each
     41  *
     42  * Further assumptions for all UTFs:
     43  * - u_charMirror(c) needs the same number of code units as c
     44  */
     45 #if UTF_SIZE==8
     46 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
     47 #endif
     48 
     49 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
     50 
     51 /*
     52  * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
     53  * semantically write RTL runs in reverse and later reverse them again.
     54  * Instead, we actually write them in forward order to begin with.
     55  * However, if the RTL run was to be mirrored, we need to mirror here now
     56  * since the implicit second reversal must not do it.
     57  * It looks strange to do mirroring in LTR output, but it is only because
     58  * we are writing RTL output in reverse.
     59  */
     60 static int32_t
     61 doWriteForward(const UChar *src, int32_t srcLength,
     62                UChar *dest, int32_t destSize,
     63                uint16_t options,
     64                UErrorCode *pErrorCode) {
     65     /* optimize for several combinations of options */
     66     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
     67     case 0: {
     68         /* simply copy the LTR run to the destination */
     69         int32_t length=srcLength;
     70         if(destSize<length) {
     71             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     72             return srcLength;
     73         }
     74         do {
     75             *dest++=*src++;
     76         } while(--length>0);
     77         return srcLength;
     78     }
     79     case UBIDI_DO_MIRRORING: {
     80         /* do mirroring */
     81         int32_t i=0, j=0;
     82         UChar32 c;
     83 
     84         if(destSize<srcLength) {
     85             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     86             return srcLength;
     87         }
     88         do {
     89             UTF_NEXT_CHAR(src, i, srcLength, c);
     90             c=u_charMirror(c);
     91             UTF_APPEND_CHAR_UNSAFE(dest, j, c);
     92         } while(i<srcLength);
     93         return srcLength;
     94     }
     95     case UBIDI_REMOVE_BIDI_CONTROLS: {
     96         /* copy the LTR run and remove any BiDi control characters */
     97         int32_t remaining=destSize;
     98         UChar c;
     99         do {
    100             c=*src++;
    101             if(!IS_BIDI_CONTROL_CHAR(c)) {
    102                 if(--remaining<0) {
    103                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    104 
    105                     /* preflight the length */
    106                     while(--srcLength>0) {
    107                         c=*src++;
    108                         if(!IS_BIDI_CONTROL_CHAR(c)) {
    109                             --remaining;
    110                         }
    111                     }
    112                     return destSize-remaining;
    113                 }
    114                 *dest++=c;
    115             }
    116         } while(--srcLength>0);
    117         return destSize-remaining;
    118     }
    119     default: {
    120         /* remove BiDi control characters and do mirroring */
    121         int32_t remaining=destSize;
    122         int32_t i, j=0;
    123         UChar32 c;
    124         do {
    125             i=0;
    126             UTF_NEXT_CHAR(src, i, srcLength, c);
    127             src+=i;
    128             srcLength-=i;
    129             if(!IS_BIDI_CONTROL_CHAR(c)) {
    130                 remaining-=i;
    131                 if(remaining<0) {
    132                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    133 
    134                     /* preflight the length */
    135                     while(srcLength>0) {
    136                         c=*src++;
    137                         if(!IS_BIDI_CONTROL_CHAR(c)) {
    138                             --remaining;
    139                         }
    140                         --srcLength;
    141                     }
    142                     return destSize-remaining;
    143                 }
    144                 c=u_charMirror(c);
    145                 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
    146             }
    147         } while(srcLength>0);
    148         return j;
    149     }
    150     } /* end of switch */
    151 }
    152 
    153 static int32_t
    154 doWriteReverse(const UChar *src, int32_t srcLength,
    155                UChar *dest, int32_t destSize,
    156                uint16_t options,
    157                UErrorCode *pErrorCode) {
    158     /*
    159      * RTL run -
    160      *
    161      * RTL runs need to be copied to the destination in reverse order
    162      * of code points, not code units, to keep Unicode characters intact.
    163      *
    164      * The general strategy for this is to read the source text
    165      * in backward order, collect all code units for a code point
    166      * (and optionally following combining characters, see below),
    167      * and copy all these code units in ascending order
    168      * to the destination for this run.
    169      *
    170      * Several options request whether combining characters
    171      * should be kept after their base characters,
    172      * whether BiDi control characters should be removed, and
    173      * whether characters should be replaced by their mirror-image
    174      * equivalent Unicode characters.
    175      */
    176     int32_t i, j;
    177     UChar32 c;
    178 
    179     /* optimize for several combinations of options */
    180     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
    181     case 0:
    182         /*
    183          * With none of the "complicated" options set, the destination
    184          * run will have the same length as the source run,
    185          * and there is no mirroring and no keeping combining characters
    186          * with their base characters.
    187          */
    188         if(destSize<srcLength) {
    189             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    190             return srcLength;
    191         }
    192         destSize=srcLength;
    193 
    194         /* preserve character integrity */
    195         do {
    196             /* i is always after the last code unit known to need to be kept in this segment */
    197             i=srcLength;
    198 
    199             /* collect code units for one base character */
    200             UTF_BACK_1(src, 0, srcLength);
    201 
    202             /* copy this base character */
    203             j=srcLength;
    204             do {
    205                 *dest++=src[j++];
    206             } while(j<i);
    207         } while(srcLength>0);
    208         break;
    209     case UBIDI_KEEP_BASE_COMBINING:
    210         /*
    211          * Here, too, the destination
    212          * run will have the same length as the source run,
    213          * and there is no mirroring.
    214          * We do need to keep combining characters with their base characters.
    215          */
    216         if(destSize<srcLength) {
    217             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    218             return srcLength;
    219         }
    220         destSize=srcLength;
    221 
    222         /* preserve character integrity */
    223         do {
    224             /* i is always after the last code unit known to need to be kept in this segment */
    225             i=srcLength;
    226 
    227             /* collect code units and modifier letters for one base character */
    228             do {
    229                 UTF_PREV_CHAR(src, 0, srcLength, c);
    230             } while(srcLength>0 && IS_COMBINING(u_charType(c)));
    231 
    232             /* copy this "user character" */
    233             j=srcLength;
    234             do {
    235                 *dest++=src[j++];
    236             } while(j<i);
    237         } while(srcLength>0);
    238         break;
    239     default:
    240         /*
    241          * With several "complicated" options set, this is the most
    242          * general and the slowest copying of an RTL run.
    243          * We will do mirroring, remove BiDi controls, and
    244          * keep combining characters with their base characters
    245          * as requested.
    246          */
    247         if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
    248             i=srcLength;
    249         } else {
    250             /* we need to find out the destination length of the run,
    251                which will not include the BiDi control characters */
    252             int32_t length=srcLength;
    253             UChar ch;
    254 
    255             i=0;
    256             do {
    257                 ch=*src++;
    258                 if(!IS_BIDI_CONTROL_CHAR(ch)) {
    259                     ++i;
    260                 }
    261             } while(--length>0);
    262             src-=srcLength;
    263         }
    264 
    265         if(destSize<i) {
    266             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    267             return i;
    268         }
    269         destSize=i;
    270 
    271         /* preserve character integrity */
    272         do {
    273             /* i is always after the last code unit known to need to be kept in this segment */
    274             i=srcLength;
    275 
    276             /* collect code units for one base character */
    277             UTF_PREV_CHAR(src, 0, srcLength, c);
    278             if(options&UBIDI_KEEP_BASE_COMBINING) {
    279                 /* collect modifier letters for this base character */
    280                 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
    281                     UTF_PREV_CHAR(src, 0, srcLength, c);
    282                 }
    283             }
    284 
    285             if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
    286                 /* do not copy this BiDi control character */
    287                 continue;
    288             }
    289 
    290             /* copy this "user character" */
    291             j=srcLength;
    292             if(options&UBIDI_DO_MIRRORING) {
    293                 /* mirror only the base character */
    294                 int32_t k=0;
    295                 c=u_charMirror(c);
    296                 UTF_APPEND_CHAR_UNSAFE(dest, k, c);
    297                 dest+=k;
    298                 j+=k;
    299             }
    300             while(j<i) {
    301                 *dest++=src[j++];
    302             }
    303         } while(srcLength>0);
    304         break;
    305     } /* end of switch */
    306 
    307     return destSize;
    308 }
    309 
    310 U_CAPI int32_t U_EXPORT2
    311 ubidi_writeReverse(const UChar *src, int32_t srcLength,
    312                    UChar *dest, int32_t destSize,
    313                    uint16_t options,
    314                    UErrorCode *pErrorCode) {
    315     int32_t destLength;
    316 
    317     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    318         return 0;
    319     }
    320 
    321     /* more error checking */
    322     if( src==NULL || srcLength<-1 ||
    323         destSize<0 || (destSize>0 && dest==NULL))
    324     {
    325         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    326         return 0;
    327     }
    328 
    329     /* do input and output overlap? */
    330     if( dest!=NULL &&
    331         ((src>=dest && src<dest+destSize) ||
    332          (dest>=src && dest<src+srcLength)))
    333     {
    334         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    335         return 0;
    336     }
    337 
    338     if(srcLength==-1) {
    339         srcLength=u_strlen(src);
    340     }
    341     if(srcLength>0) {
    342         destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
    343     } else {
    344         /* nothing to do */
    345         destLength=0;
    346     }
    347 
    348     return u_terminateUChars(dest, destSize, destLength, pErrorCode);
    349 }
    350 
    351 U_CAPI int32_t U_EXPORT2
    352 ubidi_writeReordered(UBiDi *pBiDi,
    353                      UChar *dest, int32_t destSize,
    354                      uint16_t options,
    355                      UErrorCode *pErrorCode) {
    356     const UChar *text;
    357     UChar *saveDest;
    358     int32_t length, destCapacity;
    359     int32_t run, runCount, logicalStart, runLength;
    360 
    361     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    362         return 0;
    363     }
    364 
    365     /* more error checking */
    366     if( pBiDi==NULL ||
    367         (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
    368         destSize<0 || (destSize>0 && dest==NULL))
    369     {
    370         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    371         return 0;
    372     }
    373 
    374     /* do input and output overlap? */
    375     if( dest!=NULL &&
    376         ((text>=dest && text<dest+destSize) ||
    377          (dest>=text && dest<text+pBiDi->originalLength)))
    378     {
    379         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    380         return 0;
    381     }
    382 
    383     if(length==0) {
    384         /* nothing to do */
    385         return u_terminateUChars(dest, destSize, 0, pErrorCode);
    386     }
    387 
    388     runCount=ubidi_countRuns(pBiDi, pErrorCode);
    389     if(U_FAILURE(*pErrorCode)) {
    390         return 0;
    391     }
    392 
    393     /* destSize shrinks, later destination length=destCapacity-destSize */
    394     saveDest=dest;
    395     destCapacity=destSize;
    396 
    397     /*
    398      * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
    399      * reordering mode (checked below) is appropriate.
    400      */
    401     if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
    402         options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
    403         options&=~UBIDI_REMOVE_BIDI_CONTROLS;
    404     }
    405     /*
    406      * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
    407      * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
    408      */
    409     if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
    410         options|=UBIDI_REMOVE_BIDI_CONTROLS;
    411         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
    412     }
    413     /*
    414      * If we do not perform the "inverse BiDi" algorithm, then we
    415      * don't need to insert any LRMs, and don't need to test for it.
    416      */
    417     if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
    418        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
    419        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
    420        (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
    421         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
    422     }
    423     /*
    424      * Iterate through all visual runs and copy the run text segments to
    425      * the destination, according to the options.
    426      *
    427      * The tests for where to insert LRMs ignore the fact that there may be
    428      * BN codes or non-BMP code points at the beginning and end of a run;
    429      * they may insert LRMs unnecessarily but the tests are faster this way
    430      * (this would have to be improved for UTF-8).
    431      *
    432      * Note that the only errors that are set by doWriteXY() are buffer overflow
    433      * errors. Ignore them until the end, and continue for preflighting.
    434      */
    435     if(!(options&UBIDI_OUTPUT_REVERSE)) {
    436         /* forward output */
    437         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
    438             /* do not insert BiDi controls */
    439             for(run=0; run<runCount; ++run) {
    440                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
    441                     runLength=doWriteForward(text+logicalStart, runLength,
    442                                              dest, destSize,
    443                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    444                 } else {
    445                     runLength=doWriteReverse(text+logicalStart, runLength,
    446                                              dest, destSize,
    447                                              options, pErrorCode);
    448                 }
    449                 dest+=runLength;
    450                 destSize-=runLength;
    451             }
    452         } else {
    453             /* insert BiDi controls for "inverse BiDi" */
    454             const DirProp *dirProps=pBiDi->dirProps;
    455             const UChar *src;
    456             UChar uc;
    457             UBiDiDirection dir;
    458             int32_t markFlag;
    459 
    460             for(run=0; run<runCount; ++run) {
    461                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
    462                 src=text+logicalStart;
    463                 /* check if something relevant in insertPoints */
    464                 markFlag=pBiDi->runs[run].insertRemove;
    465                 if(markFlag<0) {        /* BiDi controls count */
    466                     markFlag=0;
    467                 }
    468 
    469                 if(UBIDI_LTR==dir) {
    470                     if((pBiDi->isInverse) &&
    471                        (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
    472                         markFlag |= LRM_BEFORE;
    473                     }
    474                     if (markFlag & LRM_BEFORE) {
    475                         uc=LRM_CHAR;
    476                     }
    477                     else if (markFlag & RLM_BEFORE) {
    478                         uc=RLM_CHAR;
    479                     }
    480                     else  uc=0;
    481                     if(uc) {
    482                         if(destSize>0) {
    483                             *dest++=uc;
    484                         }
    485                         --destSize;
    486                     }
    487 
    488                     runLength=doWriteForward(src, runLength,
    489                                              dest, destSize,
    490                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    491                     dest+=runLength;
    492                     destSize-=runLength;
    493 
    494                     if((pBiDi->isInverse) &&
    495                        (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
    496                         markFlag |= LRM_AFTER;
    497                     }
    498                     if (markFlag & LRM_AFTER) {
    499                         uc=LRM_CHAR;
    500                     }
    501                     else if (markFlag & RLM_AFTER) {
    502                         uc=RLM_CHAR;
    503                     }
    504                     else  uc=0;
    505                     if(uc) {
    506                         if(destSize>0) {
    507                             *dest++=uc;
    508                         }
    509                         --destSize;
    510                     }
    511                 } else {                /* RTL run */
    512                     if((pBiDi->isInverse) &&
    513                        (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
    514                         markFlag |= RLM_BEFORE;
    515                     }
    516                     if (markFlag & LRM_BEFORE) {
    517                         uc=LRM_CHAR;
    518                     }
    519                     else if (markFlag & RLM_BEFORE) {
    520                         uc=RLM_CHAR;
    521                     }
    522                     else  uc=0;
    523                     if(uc) {
    524                         if(destSize>0) {
    525                             *dest++=uc;
    526                         }
    527                         --destSize;
    528                     }
    529 
    530                     runLength=doWriteReverse(src, runLength,
    531                                              dest, destSize,
    532                                              options, pErrorCode);
    533                     dest+=runLength;
    534                     destSize-=runLength;
    535 
    536                     if((pBiDi->isInverse) &&
    537                        (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
    538                         markFlag |= RLM_AFTER;
    539                     }
    540                     if (markFlag & LRM_AFTER) {
    541                         uc=LRM_CHAR;
    542                     }
    543                     else if (markFlag & RLM_AFTER) {
    544                         uc=RLM_CHAR;
    545                     }
    546                     else  uc=0;
    547                     if(uc) {
    548                         if(destSize>0) {
    549                             *dest++=uc;
    550                         }
    551                         --destSize;
    552                     }
    553                 }
    554             }
    555         }
    556     } else {
    557         /* reverse output */
    558         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
    559             /* do not insert BiDi controls */
    560             for(run=runCount; --run>=0;) {
    561                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
    562                     runLength=doWriteReverse(text+logicalStart, runLength,
    563                                              dest, destSize,
    564                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    565                 } else {
    566                     runLength=doWriteForward(text+logicalStart, runLength,
    567                                              dest, destSize,
    568                                              options, pErrorCode);
    569                 }
    570                 dest+=runLength;
    571                 destSize-=runLength;
    572             }
    573         } else {
    574             /* insert BiDi controls for "inverse BiDi" */
    575             const DirProp *dirProps=pBiDi->dirProps;
    576             const UChar *src;
    577             UBiDiDirection dir;
    578 
    579             for(run=runCount; --run>=0;) {
    580                 /* reverse output */
    581                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
    582                 src=text+logicalStart;
    583 
    584                 if(UBIDI_LTR==dir) {
    585                     if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
    586                         if(destSize>0) {
    587                             *dest++=LRM_CHAR;
    588                         }
    589                         --destSize;
    590                     }
    591 
    592                     runLength=doWriteReverse(src, runLength,
    593                                              dest, destSize,
    594                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    595                     dest+=runLength;
    596                     destSize-=runLength;
    597 
    598                     if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
    599                         if(destSize>0) {
    600                             *dest++=LRM_CHAR;
    601                         }
    602                         --destSize;
    603                     }
    604                 } else {
    605                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
    606                         if(destSize>0) {
    607                             *dest++=RLM_CHAR;
    608                         }
    609                         --destSize;
    610                     }
    611 
    612                     runLength=doWriteForward(src, runLength,
    613                                              dest, destSize,
    614                                              options, pErrorCode);
    615                     dest+=runLength;
    616                     destSize-=runLength;
    617 
    618                     if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
    619                         if(destSize>0) {
    620                             *dest++=RLM_CHAR;
    621                         }
    622                         --destSize;
    623                     }
    624                 }
    625             }
    626         }
    627     }
    628 
    629     return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
    630 }
    631