Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2015, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ubidiwrt.c
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 1999aug06
     16 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
     17 *
     18 * This file contains implementations for BiDi functions that use
     19 * the core algorithm and core API to write reordered text.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/uchar.h"
     25 #include "unicode/ubidi.h"
     26 #include "unicode/utf16.h"
     27 #include "cmemory.h"
     28 #include "ustr_imp.h"
     29 #include "ubidiimp.h"
     30 
     31 /*
     32  * The function implementations in this file are designed
     33  * for UTF-16 and UTF-32, not for UTF-8.
     34  *
     35  * Assumptions that are not true for UTF-8:
     36  * - Any code point always needs the same number of code units
     37  *   ("minimum-length-problem" of UTF-8)
     38  * - The BiDi control characters need only one code unit each
     39  *
     40  * Further assumptions for all UTFs:
     41  * - u_charMirror(c) needs the same number of code units as c
     42  */
     43 #if UTF_SIZE==8
     44 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
     45 #endif
     46 
     47 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
     48 
     49 /*
     50  * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
     51  * semantically write RTL runs in reverse and later reverse them again.
     52  * Instead, we actually write them in forward order to begin with.
     53  * However, if the RTL run was to be mirrored, we need to mirror here now
     54  * since the implicit second reversal must not do it.
     55  * It looks strange to do mirroring in LTR output, but it is only because
     56  * we are writing RTL output in reverse.
     57  */
     58 static int32_t
     59 doWriteForward(const UChar *src, int32_t srcLength,
     60                UChar *dest, int32_t destSize,
     61                uint16_t options,
     62                UErrorCode *pErrorCode) {
     63     /* optimize for several combinations of options */
     64     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
     65     case 0: {
     66         /* simply copy the LTR run to the destination */
     67         int32_t length=srcLength;
     68         if(destSize<length) {
     69             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     70             return srcLength;
     71         }
     72         do {
     73             *dest++=*src++;
     74         } while(--length>0);
     75         return srcLength;
     76     }
     77     case UBIDI_DO_MIRRORING: {
     78         /* do mirroring */
     79         int32_t i=0, j=0;
     80         UChar32 c;
     81 
     82         if(destSize<srcLength) {
     83             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     84             return srcLength;
     85         }
     86         do {
     87             U16_NEXT(src, i, srcLength, c);
     88             c=u_charMirror(c);
     89             U16_APPEND_UNSAFE(dest, j, c);
     90         } while(i<srcLength);
     91         return srcLength;
     92     }
     93     case UBIDI_REMOVE_BIDI_CONTROLS: {
     94         /* copy the LTR run and remove any BiDi control characters */
     95         int32_t remaining=destSize;
     96         UChar c;
     97         do {
     98             c=*src++;
     99             if(!IS_BIDI_CONTROL_CHAR(c)) {
    100                 if(--remaining<0) {
    101                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    102 
    103                     /* preflight the length */
    104                     while(--srcLength>0) {
    105                         c=*src++;
    106                         if(!IS_BIDI_CONTROL_CHAR(c)) {
    107                             --remaining;
    108                         }
    109                     }
    110                     return destSize-remaining;
    111                 }
    112                 *dest++=c;
    113             }
    114         } while(--srcLength>0);
    115         return destSize-remaining;
    116     }
    117     default: {
    118         /* remove BiDi control characters and do mirroring */
    119         int32_t remaining=destSize;
    120         int32_t i, j=0;
    121         UChar32 c;
    122         do {
    123             i=0;
    124             U16_NEXT(src, i, srcLength, c);
    125             src+=i;
    126             srcLength-=i;
    127             if(!IS_BIDI_CONTROL_CHAR(c)) {
    128                 remaining-=i;
    129                 if(remaining<0) {
    130                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    131 
    132                     /* preflight the length */
    133                     while(srcLength>0) {
    134                         c=*src++;
    135                         if(!IS_BIDI_CONTROL_CHAR(c)) {
    136                             --remaining;
    137                         }
    138                         --srcLength;
    139                     }
    140                     return destSize-remaining;
    141                 }
    142                 c=u_charMirror(c);
    143                 U16_APPEND_UNSAFE(dest, j, c);
    144             }
    145         } while(srcLength>0);
    146         return j;
    147     }
    148     } /* end of switch */
    149 }
    150 
    151 static int32_t
    152 doWriteReverse(const UChar *src, int32_t srcLength,
    153                UChar *dest, int32_t destSize,
    154                uint16_t options,
    155                UErrorCode *pErrorCode) {
    156     /*
    157      * RTL run -
    158      *
    159      * RTL runs need to be copied to the destination in reverse order
    160      * of code points, not code units, to keep Unicode characters intact.
    161      *
    162      * The general strategy for this is to read the source text
    163      * in backward order, collect all code units for a code point
    164      * (and optionally following combining characters, see below),
    165      * and copy all these code units in ascending order
    166      * to the destination for this run.
    167      *
    168      * Several options request whether combining characters
    169      * should be kept after their base characters,
    170      * whether BiDi control characters should be removed, and
    171      * whether characters should be replaced by their mirror-image
    172      * equivalent Unicode characters.
    173      */
    174     int32_t i, j;
    175     UChar32 c;
    176 
    177     /* optimize for several combinations of options */
    178     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
    179     case 0:
    180         /*
    181          * With none of the "complicated" options set, the destination
    182          * run will have the same length as the source run,
    183          * and there is no mirroring and no keeping combining characters
    184          * with their base characters.
    185          */
    186         if(destSize<srcLength) {
    187             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    188             return srcLength;
    189         }
    190         destSize=srcLength;
    191 
    192         /* preserve character integrity */
    193         do {
    194             /* i is always after the last code unit known to need to be kept in this segment */
    195             i=srcLength;
    196 
    197             /* collect code units for one base character */
    198             U16_BACK_1(src, 0, srcLength);
    199 
    200             /* copy this base character */
    201             j=srcLength;
    202             do {
    203                 *dest++=src[j++];
    204             } while(j<i);
    205         } while(srcLength>0);
    206         break;
    207     case UBIDI_KEEP_BASE_COMBINING:
    208         /*
    209          * Here, too, the destination
    210          * run will have the same length as the source run,
    211          * and there is no mirroring.
    212          * We do need to keep combining characters with their base characters.
    213          */
    214         if(destSize<srcLength) {
    215             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    216             return srcLength;
    217         }
    218         destSize=srcLength;
    219 
    220         /* preserve character integrity */
    221         do {
    222             /* i is always after the last code unit known to need to be kept in this segment */
    223             i=srcLength;
    224 
    225             /* collect code units and modifier letters for one base character */
    226             do {
    227                 U16_PREV(src, 0, srcLength, c);
    228             } while(srcLength>0 && IS_COMBINING(u_charType(c)));
    229 
    230             /* copy this "user character" */
    231             j=srcLength;
    232             do {
    233                 *dest++=src[j++];
    234             } while(j<i);
    235         } while(srcLength>0);
    236         break;
    237     default:
    238         /*
    239          * With several "complicated" options set, this is the most
    240          * general and the slowest copying of an RTL run.
    241          * We will do mirroring, remove BiDi controls, and
    242          * keep combining characters with their base characters
    243          * as requested.
    244          */
    245         if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
    246             i=srcLength;
    247         } else {
    248             /* we need to find out the destination length of the run,
    249                which will not include the BiDi control characters */
    250             int32_t length=srcLength;
    251             UChar ch;
    252 
    253             i=0;
    254             do {
    255                 ch=*src++;
    256                 if(!IS_BIDI_CONTROL_CHAR(ch)) {
    257                     ++i;
    258                 }
    259             } while(--length>0);
    260             src-=srcLength;
    261         }
    262 
    263         if(destSize<i) {
    264             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    265             return i;
    266         }
    267         destSize=i;
    268 
    269         /* preserve character integrity */
    270         do {
    271             /* i is always after the last code unit known to need to be kept in this segment */
    272             i=srcLength;
    273 
    274             /* collect code units for one base character */
    275             U16_PREV(src, 0, srcLength, c);
    276             if(options&UBIDI_KEEP_BASE_COMBINING) {
    277                 /* collect modifier letters for this base character */
    278                 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
    279                     U16_PREV(src, 0, srcLength, c);
    280                 }
    281             }
    282 
    283             if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
    284                 /* do not copy this BiDi control character */
    285                 continue;
    286             }
    287 
    288             /* copy this "user character" */
    289             j=srcLength;
    290             if(options&UBIDI_DO_MIRRORING) {
    291                 /* mirror only the base character */
    292                 int32_t k=0;
    293                 c=u_charMirror(c);
    294                 U16_APPEND_UNSAFE(dest, k, c);
    295                 dest+=k;
    296                 j+=k;
    297             }
    298             while(j<i) {
    299                 *dest++=src[j++];
    300             }
    301         } while(srcLength>0);
    302         break;
    303     } /* end of switch */
    304 
    305     return destSize;
    306 }
    307 
    308 U_CAPI int32_t U_EXPORT2
    309 ubidi_writeReverse(const UChar *src, int32_t srcLength,
    310                    UChar *dest, int32_t destSize,
    311                    uint16_t options,
    312                    UErrorCode *pErrorCode) {
    313     int32_t destLength;
    314 
    315     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    316         return 0;
    317     }
    318 
    319     /* more error checking */
    320     if( src==NULL || srcLength<-1 ||
    321         destSize<0 || (destSize>0 && dest==NULL))
    322     {
    323         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    324         return 0;
    325     }
    326 
    327     /* do input and output overlap? */
    328     if( dest!=NULL &&
    329         ((src>=dest && src<dest+destSize) ||
    330          (dest>=src && dest<src+srcLength)))
    331     {
    332         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    333         return 0;
    334     }
    335 
    336     if(srcLength==-1) {
    337         srcLength=u_strlen(src);
    338     }
    339     if(srcLength>0) {
    340         destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
    341     } else {
    342         /* nothing to do */
    343         destLength=0;
    344     }
    345 
    346     return u_terminateUChars(dest, destSize, destLength, pErrorCode);
    347 }
    348 
    349 U_CAPI int32_t U_EXPORT2
    350 ubidi_writeReordered(UBiDi *pBiDi,
    351                      UChar *dest, int32_t destSize,
    352                      uint16_t options,
    353                      UErrorCode *pErrorCode) {
    354     const UChar *text;
    355     UChar *saveDest;
    356     int32_t length, destCapacity;
    357     int32_t run, runCount, logicalStart, runLength;
    358 
    359     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    360         return 0;
    361     }
    362 
    363     /* more error checking */
    364     if( pBiDi==NULL ||
    365         (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
    366         destSize<0 || (destSize>0 && dest==NULL))
    367     {
    368         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    369         return 0;
    370     }
    371 
    372     /* do input and output overlap? */
    373     if( dest!=NULL &&
    374         ((text>=dest && text<dest+destSize) ||
    375          (dest>=text && dest<text+pBiDi->originalLength)))
    376     {
    377         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    378         return 0;
    379     }
    380 
    381     if(length==0) {
    382         /* nothing to do */
    383         return u_terminateUChars(dest, destSize, 0, pErrorCode);
    384     }
    385 
    386     runCount=ubidi_countRuns(pBiDi, pErrorCode);
    387     if(U_FAILURE(*pErrorCode)) {
    388         return 0;
    389     }
    390 
    391     /* destSize shrinks, later destination length=destCapacity-destSize */
    392     saveDest=dest;
    393     destCapacity=destSize;
    394 
    395     /*
    396      * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
    397      * reordering mode (checked below) is appropriate.
    398      */
    399     if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
    400         options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
    401         options&=~UBIDI_REMOVE_BIDI_CONTROLS;
    402     }
    403     /*
    404      * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
    405      * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
    406      */
    407     if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
    408         options|=UBIDI_REMOVE_BIDI_CONTROLS;
    409         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
    410     }
    411     /*
    412      * If we do not perform the "inverse BiDi" algorithm, then we
    413      * don't need to insert any LRMs, and don't need to test for it.
    414      */
    415     if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
    416        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
    417        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
    418        (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
    419         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
    420     }
    421     /*
    422      * Iterate through all visual runs and copy the run text segments to
    423      * the destination, according to the options.
    424      *
    425      * The tests for where to insert LRMs ignore the fact that there may be
    426      * BN codes or non-BMP code points at the beginning and end of a run;
    427      * they may insert LRMs unnecessarily but the tests are faster this way
    428      * (this would have to be improved for UTF-8).
    429      *
    430      * Note that the only errors that are set by doWriteXY() are buffer overflow
    431      * errors. Ignore them until the end, and continue for preflighting.
    432      */
    433     if(!(options&UBIDI_OUTPUT_REVERSE)) {
    434         /* forward output */
    435         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
    436             /* do not insert BiDi controls */
    437             for(run=0; run<runCount; ++run) {
    438                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
    439                     runLength=doWriteForward(text+logicalStart, runLength,
    440                                              dest, destSize,
    441                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    442                 } else {
    443                     runLength=doWriteReverse(text+logicalStart, runLength,
    444                                              dest, destSize,
    445                                              options, pErrorCode);
    446                 }
    447                 if(dest!=NULL) {
    448                   dest+=runLength;
    449                 }
    450                 destSize-=runLength;
    451             }
    452         } else {
    453             /* insert BiDi controls for "inverse BiDi" */
    454             const DirProp *dirProps=pBiDi->dirProps;
    455             const UChar *src;
    456             UChar uc;
    457             UBiDiDirection dir;
    458             int32_t markFlag;
    459 
    460             for(run=0; run<runCount; ++run) {
    461                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
    462                 src=text+logicalStart;
    463                 /* check if something relevant in insertPoints */
    464                 markFlag=pBiDi->runs[run].insertRemove;
    465                 if(markFlag<0) {        /* BiDi controls count */
    466                     markFlag=0;
    467                 }
    468 
    469                 if(UBIDI_LTR==dir) {
    470                     if((pBiDi->isInverse) &&
    471                        (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
    472                         markFlag |= LRM_BEFORE;
    473                     }
    474                     if (markFlag & LRM_BEFORE) {
    475                         uc=LRM_CHAR;
    476                     }
    477                     else if (markFlag & RLM_BEFORE) {
    478                         uc=RLM_CHAR;
    479                     }
    480                     else  uc=0;
    481                     if(uc) {
    482                         if(destSize>0) {
    483                             *dest++=uc;
    484                         }
    485                         --destSize;
    486                     }
    487 
    488                     runLength=doWriteForward(src, runLength,
    489                                              dest, destSize,
    490                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    491                     if(dest!=NULL) {
    492                       dest+=runLength;
    493                     }
    494                     destSize-=runLength;
    495 
    496                     if((pBiDi->isInverse) &&
    497                        (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
    498                         markFlag |= LRM_AFTER;
    499                     }
    500                     if (markFlag & LRM_AFTER) {
    501                         uc=LRM_CHAR;
    502                     }
    503                     else if (markFlag & RLM_AFTER) {
    504                         uc=RLM_CHAR;
    505                     }
    506                     else  uc=0;
    507                     if(uc) {
    508                         if(destSize>0) {
    509                             *dest++=uc;
    510                         }
    511                         --destSize;
    512                     }
    513                 } else {                /* RTL run */
    514                     if((pBiDi->isInverse) &&
    515                        (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
    516                         markFlag |= RLM_BEFORE;
    517                     }
    518                     if (markFlag & LRM_BEFORE) {
    519                         uc=LRM_CHAR;
    520                     }
    521                     else if (markFlag & RLM_BEFORE) {
    522                         uc=RLM_CHAR;
    523                     }
    524                     else  uc=0;
    525                     if(uc) {
    526                         if(destSize>0) {
    527                             *dest++=uc;
    528                         }
    529                         --destSize;
    530                     }
    531 
    532                     runLength=doWriteReverse(src, runLength,
    533                                              dest, destSize,
    534                                              options, pErrorCode);
    535                     if(dest!=NULL) {
    536                       dest+=runLength;
    537                     }
    538                     destSize-=runLength;
    539 
    540                     if((pBiDi->isInverse) &&
    541                        (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
    542                         markFlag |= RLM_AFTER;
    543                     }
    544                     if (markFlag & LRM_AFTER) {
    545                         uc=LRM_CHAR;
    546                     }
    547                     else if (markFlag & RLM_AFTER) {
    548                         uc=RLM_CHAR;
    549                     }
    550                     else  uc=0;
    551                     if(uc) {
    552                         if(destSize>0) {
    553                             *dest++=uc;
    554                         }
    555                         --destSize;
    556                     }
    557                 }
    558             }
    559         }
    560     } else {
    561         /* reverse output */
    562         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
    563             /* do not insert BiDi controls */
    564             for(run=runCount; --run>=0;) {
    565                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
    566                     runLength=doWriteReverse(text+logicalStart, runLength,
    567                                              dest, destSize,
    568                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    569                 } else {
    570                     runLength=doWriteForward(text+logicalStart, runLength,
    571                                              dest, destSize,
    572                                              options, pErrorCode);
    573                 }
    574                 if(dest!=NULL) {
    575                   dest+=runLength;
    576                 }
    577                 destSize-=runLength;
    578             }
    579         } else {
    580             /* insert BiDi controls for "inverse BiDi" */
    581             const DirProp *dirProps=pBiDi->dirProps;
    582             const UChar *src;
    583             UBiDiDirection dir;
    584 
    585             for(run=runCount; --run>=0;) {
    586                 /* reverse output */
    587                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
    588                 src=text+logicalStart;
    589 
    590                 if(UBIDI_LTR==dir) {
    591                     if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
    592                         if(destSize>0) {
    593                             *dest++=LRM_CHAR;
    594                         }
    595                         --destSize;
    596                     }
    597 
    598                     runLength=doWriteReverse(src, runLength,
    599                                              dest, destSize,
    600                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    601                     if(dest!=NULL) {
    602                       dest+=runLength;
    603                     }
    604                     destSize-=runLength;
    605 
    606                     if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
    607                         if(destSize>0) {
    608                             *dest++=LRM_CHAR;
    609                         }
    610                         --destSize;
    611                     }
    612                 } else {
    613                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
    614                         if(destSize>0) {
    615                             *dest++=RLM_CHAR;
    616                         }
    617                         --destSize;
    618                     }
    619 
    620                     runLength=doWriteForward(src, runLength,
    621                                              dest, destSize,
    622                                              options, pErrorCode);
    623                     if(dest!=NULL) {
    624                       dest+=runLength;
    625                     }
    626                     destSize-=runLength;
    627 
    628                     if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
    629                         if(destSize>0) {
    630                             *dest++=RLM_CHAR;
    631                         }
    632                         --destSize;
    633                     }
    634                 }
    635             }
    636         }
    637     }
    638 
    639     return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
    640 }
    641