Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2000-2015, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ubidiwrt.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999aug06
     14 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
     15 *
     16 * This file contains implementations for BiDi functions that use
     17 * the core algorithm and core API to write reordered text.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/ubidi.h"
     24 #include "unicode/utf16.h"
     25 #include "cmemory.h"
     26 #include "ustr_imp.h"
     27 #include "ubidiimp.h"
     28 
     29 /*
     30  * The function implementations in this file are designed
     31  * for UTF-16 and UTF-32, not for UTF-8.
     32  *
     33  * Assumptions that are not true for UTF-8:
     34  * - Any code point always needs the same number of code units
     35  *   ("minimum-length-problem" of UTF-8)
     36  * - The BiDi control characters need only one code unit each
     37  *
     38  * Further assumptions for all UTFs:
     39  * - u_charMirror(c) needs the same number of code units as c
     40  */
     41 #if UTF_SIZE==8
     42 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
     43 #endif
     44 
     45 #define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
     46 
     47 /*
     48  * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
     49  * semantically write RTL runs in reverse and later reverse them again.
     50  * Instead, we actually write them in forward order to begin with.
     51  * However, if the RTL run was to be mirrored, we need to mirror here now
     52  * since the implicit second reversal must not do it.
     53  * It looks strange to do mirroring in LTR output, but it is only because
     54  * we are writing RTL output in reverse.
     55  */
     56 static int32_t
     57 doWriteForward(const UChar *src, int32_t srcLength,
     58                UChar *dest, int32_t destSize,
     59                uint16_t options,
     60                UErrorCode *pErrorCode) {
     61     /* optimize for several combinations of options */
     62     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
     63     case 0: {
     64         /* simply copy the LTR run to the destination */
     65         int32_t length=srcLength;
     66         if(destSize<length) {
     67             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     68             return srcLength;
     69         }
     70         do {
     71             *dest++=*src++;
     72         } while(--length>0);
     73         return srcLength;
     74     }
     75     case UBIDI_DO_MIRRORING: {
     76         /* do mirroring */
     77         int32_t i=0, j=0;
     78         UChar32 c;
     79 
     80         if(destSize<srcLength) {
     81             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     82             return srcLength;
     83         }
     84         do {
     85             U16_NEXT(src, i, srcLength, c);
     86             c=u_charMirror(c);
     87             U16_APPEND_UNSAFE(dest, j, c);
     88         } while(i<srcLength);
     89         return srcLength;
     90     }
     91     case UBIDI_REMOVE_BIDI_CONTROLS: {
     92         /* copy the LTR run and remove any BiDi control characters */
     93         int32_t remaining=destSize;
     94         UChar c;
     95         do {
     96             c=*src++;
     97             if(!IS_BIDI_CONTROL_CHAR(c)) {
     98                 if(--remaining<0) {
     99                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    100 
    101                     /* preflight the length */
    102                     while(--srcLength>0) {
    103                         c=*src++;
    104                         if(!IS_BIDI_CONTROL_CHAR(c)) {
    105                             --remaining;
    106                         }
    107                     }
    108                     return destSize-remaining;
    109                 }
    110                 *dest++=c;
    111             }
    112         } while(--srcLength>0);
    113         return destSize-remaining;
    114     }
    115     default: {
    116         /* remove BiDi control characters and do mirroring */
    117         int32_t remaining=destSize;
    118         int32_t i, j=0;
    119         UChar32 c;
    120         do {
    121             i=0;
    122             U16_NEXT(src, i, srcLength, c);
    123             src+=i;
    124             srcLength-=i;
    125             if(!IS_BIDI_CONTROL_CHAR(c)) {
    126                 remaining-=i;
    127                 if(remaining<0) {
    128                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    129 
    130                     /* preflight the length */
    131                     while(srcLength>0) {
    132                         c=*src++;
    133                         if(!IS_BIDI_CONTROL_CHAR(c)) {
    134                             --remaining;
    135                         }
    136                         --srcLength;
    137                     }
    138                     return destSize-remaining;
    139                 }
    140                 c=u_charMirror(c);
    141                 U16_APPEND_UNSAFE(dest, j, c);
    142             }
    143         } while(srcLength>0);
    144         return j;
    145     }
    146     } /* end of switch */
    147 }
    148 
    149 static int32_t
    150 doWriteReverse(const UChar *src, int32_t srcLength,
    151                UChar *dest, int32_t destSize,
    152                uint16_t options,
    153                UErrorCode *pErrorCode) {
    154     /*
    155      * RTL run -
    156      *
    157      * RTL runs need to be copied to the destination in reverse order
    158      * of code points, not code units, to keep Unicode characters intact.
    159      *
    160      * The general strategy for this is to read the source text
    161      * in backward order, collect all code units for a code point
    162      * (and optionally following combining characters, see below),
    163      * and copy all these code units in ascending order
    164      * to the destination for this run.
    165      *
    166      * Several options request whether combining characters
    167      * should be kept after their base characters,
    168      * whether BiDi control characters should be removed, and
    169      * whether characters should be replaced by their mirror-image
    170      * equivalent Unicode characters.
    171      */
    172     int32_t i, j;
    173     UChar32 c;
    174 
    175     /* optimize for several combinations of options */
    176     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
    177     case 0:
    178         /*
    179          * With none of the "complicated" options set, the destination
    180          * run will have the same length as the source run,
    181          * and there is no mirroring and no keeping combining characters
    182          * with their base characters.
    183          */
    184         if(destSize<srcLength) {
    185             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    186             return srcLength;
    187         }
    188         destSize=srcLength;
    189 
    190         /* preserve character integrity */
    191         do {
    192             /* i is always after the last code unit known to need to be kept in this segment */
    193             i=srcLength;
    194 
    195             /* collect code units for one base character */
    196             U16_BACK_1(src, 0, srcLength);
    197 
    198             /* copy this base character */
    199             j=srcLength;
    200             do {
    201                 *dest++=src[j++];
    202             } while(j<i);
    203         } while(srcLength>0);
    204         break;
    205     case UBIDI_KEEP_BASE_COMBINING:
    206         /*
    207          * Here, too, the destination
    208          * run will have the same length as the source run,
    209          * and there is no mirroring.
    210          * We do need to keep combining characters with their base characters.
    211          */
    212         if(destSize<srcLength) {
    213             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    214             return srcLength;
    215         }
    216         destSize=srcLength;
    217 
    218         /* preserve character integrity */
    219         do {
    220             /* i is always after the last code unit known to need to be kept in this segment */
    221             i=srcLength;
    222 
    223             /* collect code units and modifier letters for one base character */
    224             do {
    225                 U16_PREV(src, 0, srcLength, c);
    226             } while(srcLength>0 && IS_COMBINING(u_charType(c)));
    227 
    228             /* copy this "user character" */
    229             j=srcLength;
    230             do {
    231                 *dest++=src[j++];
    232             } while(j<i);
    233         } while(srcLength>0);
    234         break;
    235     default:
    236         /*
    237          * With several "complicated" options set, this is the most
    238          * general and the slowest copying of an RTL run.
    239          * We will do mirroring, remove BiDi controls, and
    240          * keep combining characters with their base characters
    241          * as requested.
    242          */
    243         if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
    244             i=srcLength;
    245         } else {
    246             /* we need to find out the destination length of the run,
    247                which will not include the BiDi control characters */
    248             int32_t length=srcLength;
    249             UChar ch;
    250 
    251             i=0;
    252             do {
    253                 ch=*src++;
    254                 if(!IS_BIDI_CONTROL_CHAR(ch)) {
    255                     ++i;
    256                 }
    257             } while(--length>0);
    258             src-=srcLength;
    259         }
    260 
    261         if(destSize<i) {
    262             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    263             return i;
    264         }
    265         destSize=i;
    266 
    267         /* preserve character integrity */
    268         do {
    269             /* i is always after the last code unit known to need to be kept in this segment */
    270             i=srcLength;
    271 
    272             /* collect code units for one base character */
    273             U16_PREV(src, 0, srcLength, c);
    274             if(options&UBIDI_KEEP_BASE_COMBINING) {
    275                 /* collect modifier letters for this base character */
    276                 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
    277                     U16_PREV(src, 0, srcLength, c);
    278                 }
    279             }
    280 
    281             if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
    282                 /* do not copy this BiDi control character */
    283                 continue;
    284             }
    285 
    286             /* copy this "user character" */
    287             j=srcLength;
    288             if(options&UBIDI_DO_MIRRORING) {
    289                 /* mirror only the base character */
    290                 int32_t k=0;
    291                 c=u_charMirror(c);
    292                 U16_APPEND_UNSAFE(dest, k, c);
    293                 dest+=k;
    294                 j+=k;
    295             }
    296             while(j<i) {
    297                 *dest++=src[j++];
    298             }
    299         } while(srcLength>0);
    300         break;
    301     } /* end of switch */
    302 
    303     return destSize;
    304 }
    305 
    306 U_CAPI int32_t U_EXPORT2
    307 ubidi_writeReverse(const UChar *src, int32_t srcLength,
    308                    UChar *dest, int32_t destSize,
    309                    uint16_t options,
    310                    UErrorCode *pErrorCode) {
    311     int32_t destLength;
    312 
    313     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    314         return 0;
    315     }
    316 
    317     /* more error checking */
    318     if( src==NULL || srcLength<-1 ||
    319         destSize<0 || (destSize>0 && dest==NULL))
    320     {
    321         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    322         return 0;
    323     }
    324 
    325     /* do input and output overlap? */
    326     if( dest!=NULL &&
    327         ((src>=dest && src<dest+destSize) ||
    328          (dest>=src && dest<src+srcLength)))
    329     {
    330         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    331         return 0;
    332     }
    333 
    334     if(srcLength==-1) {
    335         srcLength=u_strlen(src);
    336     }
    337     if(srcLength>0) {
    338         destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
    339     } else {
    340         /* nothing to do */
    341         destLength=0;
    342     }
    343 
    344     return u_terminateUChars(dest, destSize, destLength, pErrorCode);
    345 }
    346 
    347 U_CAPI int32_t U_EXPORT2
    348 ubidi_writeReordered(UBiDi *pBiDi,
    349                      UChar *dest, int32_t destSize,
    350                      uint16_t options,
    351                      UErrorCode *pErrorCode) {
    352     const UChar *text;
    353     UChar *saveDest;
    354     int32_t length, destCapacity;
    355     int32_t run, runCount, logicalStart, runLength;
    356 
    357     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    358         return 0;
    359     }
    360 
    361     /* more error checking */
    362     if( pBiDi==NULL ||
    363         (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
    364         destSize<0 || (destSize>0 && dest==NULL))
    365     {
    366         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    367         return 0;
    368     }
    369 
    370     /* do input and output overlap? */
    371     if( dest!=NULL &&
    372         ((text>=dest && text<dest+destSize) ||
    373          (dest>=text && dest<text+pBiDi->originalLength)))
    374     {
    375         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    376         return 0;
    377     }
    378 
    379     if(length==0) {
    380         /* nothing to do */
    381         return u_terminateUChars(dest, destSize, 0, pErrorCode);
    382     }
    383 
    384     runCount=ubidi_countRuns(pBiDi, pErrorCode);
    385     if(U_FAILURE(*pErrorCode)) {
    386         return 0;
    387     }
    388 
    389     /* destSize shrinks, later destination length=destCapacity-destSize */
    390     saveDest=dest;
    391     destCapacity=destSize;
    392 
    393     /*
    394      * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
    395      * reordering mode (checked below) is appropriate.
    396      */
    397     if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
    398         options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
    399         options&=~UBIDI_REMOVE_BIDI_CONTROLS;
    400     }
    401     /*
    402      * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
    403      * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
    404      */
    405     if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
    406         options|=UBIDI_REMOVE_BIDI_CONTROLS;
    407         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
    408     }
    409     /*
    410      * If we do not perform the "inverse BiDi" algorithm, then we
    411      * don't need to insert any LRMs, and don't need to test for it.
    412      */
    413     if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
    414        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
    415        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
    416        (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
    417         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
    418     }
    419     /*
    420      * Iterate through all visual runs and copy the run text segments to
    421      * the destination, according to the options.
    422      *
    423      * The tests for where to insert LRMs ignore the fact that there may be
    424      * BN codes or non-BMP code points at the beginning and end of a run;
    425      * they may insert LRMs unnecessarily but the tests are faster this way
    426      * (this would have to be improved for UTF-8).
    427      *
    428      * Note that the only errors that are set by doWriteXY() are buffer overflow
    429      * errors. Ignore them until the end, and continue for preflighting.
    430      */
    431     if(!(options&UBIDI_OUTPUT_REVERSE)) {
    432         /* forward output */
    433         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
    434             /* do not insert BiDi controls */
    435             for(run=0; run<runCount; ++run) {
    436                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
    437                     runLength=doWriteForward(text+logicalStart, runLength,
    438                                              dest, destSize,
    439                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    440                 } else {
    441                     runLength=doWriteReverse(text+logicalStart, runLength,
    442                                              dest, destSize,
    443                                              options, pErrorCode);
    444                 }
    445                 if(dest!=NULL) {
    446                   dest+=runLength;
    447                 }
    448                 destSize-=runLength;
    449             }
    450         } else {
    451             /* insert BiDi controls for "inverse BiDi" */
    452             const DirProp *dirProps=pBiDi->dirProps;
    453             const UChar *src;
    454             UChar uc;
    455             UBiDiDirection dir;
    456             int32_t markFlag;
    457 
    458             for(run=0; run<runCount; ++run) {
    459                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
    460                 src=text+logicalStart;
    461                 /* check if something relevant in insertPoints */
    462                 markFlag=pBiDi->runs[run].insertRemove;
    463                 if(markFlag<0) {        /* BiDi controls count */
    464                     markFlag=0;
    465                 }
    466 
    467                 if(UBIDI_LTR==dir) {
    468                     if((pBiDi->isInverse) &&
    469                        (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
    470                         markFlag |= LRM_BEFORE;
    471                     }
    472                     if (markFlag & LRM_BEFORE) {
    473                         uc=LRM_CHAR;
    474                     }
    475                     else if (markFlag & RLM_BEFORE) {
    476                         uc=RLM_CHAR;
    477                     }
    478                     else  uc=0;
    479                     if(uc) {
    480                         if(destSize>0) {
    481                             *dest++=uc;
    482                         }
    483                         --destSize;
    484                     }
    485 
    486                     runLength=doWriteForward(src, runLength,
    487                                              dest, destSize,
    488                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    489                     if(dest!=NULL) {
    490                       dest+=runLength;
    491                     }
    492                     destSize-=runLength;
    493 
    494                     if((pBiDi->isInverse) &&
    495                        (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
    496                         markFlag |= LRM_AFTER;
    497                     }
    498                     if (markFlag & LRM_AFTER) {
    499                         uc=LRM_CHAR;
    500                     }
    501                     else if (markFlag & RLM_AFTER) {
    502                         uc=RLM_CHAR;
    503                     }
    504                     else  uc=0;
    505                     if(uc) {
    506                         if(destSize>0) {
    507                             *dest++=uc;
    508                         }
    509                         --destSize;
    510                     }
    511                 } else {                /* RTL run */
    512                     if((pBiDi->isInverse) &&
    513                        (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
    514                         markFlag |= RLM_BEFORE;
    515                     }
    516                     if (markFlag & LRM_BEFORE) {
    517                         uc=LRM_CHAR;
    518                     }
    519                     else if (markFlag & RLM_BEFORE) {
    520                         uc=RLM_CHAR;
    521                     }
    522                     else  uc=0;
    523                     if(uc) {
    524                         if(destSize>0) {
    525                             *dest++=uc;
    526                         }
    527                         --destSize;
    528                     }
    529 
    530                     runLength=doWriteReverse(src, runLength,
    531                                              dest, destSize,
    532                                              options, pErrorCode);
    533                     if(dest!=NULL) {
    534                       dest+=runLength;
    535                     }
    536                     destSize-=runLength;
    537 
    538                     if((pBiDi->isInverse) &&
    539                        (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
    540                         markFlag |= RLM_AFTER;
    541                     }
    542                     if (markFlag & LRM_AFTER) {
    543                         uc=LRM_CHAR;
    544                     }
    545                     else if (markFlag & RLM_AFTER) {
    546                         uc=RLM_CHAR;
    547                     }
    548                     else  uc=0;
    549                     if(uc) {
    550                         if(destSize>0) {
    551                             *dest++=uc;
    552                         }
    553                         --destSize;
    554                     }
    555                 }
    556             }
    557         }
    558     } else {
    559         /* reverse output */
    560         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
    561             /* do not insert BiDi controls */
    562             for(run=runCount; --run>=0;) {
    563                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
    564                     runLength=doWriteReverse(text+logicalStart, runLength,
    565                                              dest, destSize,
    566                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    567                 } else {
    568                     runLength=doWriteForward(text+logicalStart, runLength,
    569                                              dest, destSize,
    570                                              options, pErrorCode);
    571                 }
    572                 if(dest!=NULL) {
    573                   dest+=runLength;
    574                 }
    575                 destSize-=runLength;
    576             }
    577         } else {
    578             /* insert BiDi controls for "inverse BiDi" */
    579             const DirProp *dirProps=pBiDi->dirProps;
    580             const UChar *src;
    581             UBiDiDirection dir;
    582 
    583             for(run=runCount; --run>=0;) {
    584                 /* reverse output */
    585                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
    586                 src=text+logicalStart;
    587 
    588                 if(UBIDI_LTR==dir) {
    589                     if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
    590                         if(destSize>0) {
    591                             *dest++=LRM_CHAR;
    592                         }
    593                         --destSize;
    594                     }
    595 
    596                     runLength=doWriteReverse(src, runLength,
    597                                              dest, destSize,
    598                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
    599                     if(dest!=NULL) {
    600                       dest+=runLength;
    601                     }
    602                     destSize-=runLength;
    603 
    604                     if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
    605                         if(destSize>0) {
    606                             *dest++=LRM_CHAR;
    607                         }
    608                         --destSize;
    609                     }
    610                 } else {
    611                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
    612                         if(destSize>0) {
    613                             *dest++=RLM_CHAR;
    614                         }
    615                         --destSize;
    616                     }
    617 
    618                     runLength=doWriteForward(src, runLength,
    619                                              dest, destSize,
    620                                              options, pErrorCode);
    621                     if(dest!=NULL) {
    622                       dest+=runLength;
    623                     }
    624                     destSize-=runLength;
    625 
    626                     if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
    627                         if(destSize>0) {
    628                             *dest++=RLM_CHAR;
    629                         }
    630                         --destSize;
    631                     }
    632                 }
    633             }
    634         }
    635     }
    636 
    637     return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
    638 }
    639