Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uinvchar.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:2
     12 *
     13 *   created on: 2004sep14
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Functions for handling invariant characters, moved here from putil.c
     17 *   for better modularization.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/ustring.h"
     22 #include "udataswp.h"
     23 #include "cstring.h"
     24 #include "cmemory.h"
     25 #include "uassert.h"
     26 #include "uinvchar.h"
     27 
     28 /* invariant-character handling --------------------------------------------- */
     29 
     30 /*
     31  * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
     32  * appropriately for most EBCDIC codepages.
     33  *
     34  * They currently also map most other ASCII graphic characters,
     35  * appropriately for codepages 37 and 1047.
     36  * Exceptions: The characters for []^ have different codes in 37 & 1047.
     37  * Both versions are mapped to ASCII.
     38  *
     39  *    ASCII 37 1047
     40  * [     5B BA   AD
     41  * ]     5D BB   BD
     42  * ^     5E B0   5F
     43  *
     44  * There are no mappings for variant characters from Unicode to EBCDIC.
     45  *
     46  * Currently, C0 control codes are also included in these maps.
     47  * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
     48  * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
     49  * but there is no mapping for ASCII LF back to EBCDIC.
     50  *
     51  *    ASCII EBCDIC S/390-OE
     52  * LF    0A     25       15
     53  * NEL   85     15       25
     54  *
     55  * The maps below explicitly exclude the variant
     56  * control and graphical characters that are in ASCII-based
     57  * codepages at 0x80 and above.
     58  * "No mapping" is expressed by mapping to a 00 byte.
     59  *
     60  * These tables do not establish a converter or a codepage.
     61  */
     62 
     63 static const uint8_t asciiFromEbcdic[256]={
     64     0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
     65     0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
     66     0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
     67     0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,
     68 
     69     0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
     70     0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
     71     0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
     72     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
     73 
     74     0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     75     0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     76     0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
     77     0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,
     78 
     79     0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     80     0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     81     0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     82     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     83 };
     84 
     85 static const uint8_t ebcdicFromAscii[256]={
     86     0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
     87     0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
     88     0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
     89     0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
     90 
     91     0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
     92     0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
     93     0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
     94     0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,
     95 
     96     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     97     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     98     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     99     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    100 
    101     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    102     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    103     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    105 };
    106 
    107 /*
    108  * Bit sets indicating which characters of the ASCII repertoire
    109  * (by ASCII/Unicode code) are "invariant".
    110  * See utypes.h for more details.
    111  *
    112  * As invariant are considered the characters of the ASCII repertoire except
    113  * for the following:
    114  * 21  '!' <exclamation mark>
    115  * 23  '#' <number sign>
    116  * 24  '$' <dollar sign>
    117  *
    118  * 40  '@' <commercial at>
    119  *
    120  * 5b  '[' <left bracket>
    121  * 5c  '\' <backslash>
    122  * 5d  ']' <right bracket>
    123  * 5e  '^' <circumflex>
    124  *
    125  * 60  '`' <grave accent>
    126  *
    127  * 7b  '{' <left brace>
    128  * 7c  '|' <vertical line>
    129  * 7d  '}' <right brace>
    130  * 7e  '~' <tilde>
    131  */
    132 static const uint32_t invariantChars[4]={
    133     0xfffffbff, /* 00..1f but not 0a */
    134     0xffffffe5, /* 20..3f but not 21 23 24 */
    135     0x87fffffe, /* 40..5f but not 40 5b..5e */
    136     0x87fffffe  /* 60..7f but not 60 7b..7e */
    137 };
    138 
    139 /*
    140  * test unsigned types (or values known to be non-negative) for invariant characters,
    141  * tests ASCII-family character values
    142  */
    143 #define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)
    144 
    145 /* test signed types for invariant characters, adds test for positive values */
    146 #define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))
    147 
    148 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    149 #define CHAR_TO_UCHAR(c) c
    150 #define UCHAR_TO_CHAR(c) c
    151 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    152 #define CHAR_TO_UCHAR(u) asciiFromEbcdic[u]
    153 #define UCHAR_TO_CHAR(u) ebcdicFromAscii[u]
    154 #else
    155 #   error U_CHARSET_FAMILY is not valid
    156 #endif
    157 
    158 
    159 U_CAPI void U_EXPORT2
    160 u_charsToUChars(const char *cs, UChar *us, int32_t length) {
    161     UChar u;
    162     uint8_t c;
    163 
    164     /*
    165      * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
    166      * For EBCDIC systems, this works for characters with codes from
    167      * codepages 37 and 1047 or compatible.
    168      */
    169     while(length>0) {
    170         c=(uint8_t)(*cs++);
    171         u=(UChar)CHAR_TO_UCHAR(c);
    172         U_ASSERT((u!=0 || c==0)); /* only invariant chars converted? */
    173         *us++=u;
    174         --length;
    175     }
    176 }
    177 
    178 U_CAPI void U_EXPORT2
    179 u_UCharsToChars(const UChar *us, char *cs, int32_t length) {
    180     UChar u;
    181 
    182     while(length>0) {
    183         u=*us++;
    184         if(!UCHAR_IS_INVARIANT(u)) {
    185             U_ASSERT(FALSE); /* Variant characters were used. These are not portable in ICU. */
    186             u=0;
    187         }
    188         *cs++=(char)UCHAR_TO_CHAR(u);
    189         --length;
    190     }
    191 }
    192 
    193 U_CAPI UBool U_EXPORT2
    194 uprv_isInvariantString(const char *s, int32_t length) {
    195     uint8_t c;
    196 
    197     for(;;) {
    198         if(length<0) {
    199             /* NUL-terminated */
    200             c=(uint8_t)*s++;
    201             if(c==0) {
    202                 break;
    203             }
    204         } else {
    205             /* count length */
    206             if(length==0) {
    207                 break;
    208             }
    209             --length;
    210             c=(uint8_t)*s++;
    211             if(c==0) {
    212                 continue; /* NUL is invariant */
    213             }
    214         }
    215         /* c!=0 now, one branch below checks c==0 for variant characters */
    216 
    217         /*
    218          * no assertions here because these functions are legitimately called
    219          * for strings with variant characters
    220          */
    221 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    222         if(!UCHAR_IS_INVARIANT(c)) {
    223             return FALSE; /* found a variant char */
    224         }
    225 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    226         c=CHAR_TO_UCHAR(c);
    227         if(c==0 || !UCHAR_IS_INVARIANT(c)) {
    228             return FALSE; /* found a variant char */
    229         }
    230 #else
    231 #   error U_CHARSET_FAMILY is not valid
    232 #endif
    233     }
    234     return TRUE;
    235 }
    236 
    237 U_CAPI UBool U_EXPORT2
    238 uprv_isInvariantUString(const UChar *s, int32_t length) {
    239     UChar c;
    240 
    241     for(;;) {
    242         if(length<0) {
    243             /* NUL-terminated */
    244             c=*s++;
    245             if(c==0) {
    246                 break;
    247             }
    248         } else {
    249             /* count length */
    250             if(length==0) {
    251                 break;
    252             }
    253             --length;
    254             c=*s++;
    255         }
    256 
    257         /*
    258          * no assertions here because these functions are legitimately called
    259          * for strings with variant characters
    260          */
    261         if(!UCHAR_IS_INVARIANT(c)) {
    262             return FALSE; /* found a variant char */
    263         }
    264     }
    265     return TRUE;
    266 }
    267 
    268 /* UDataSwapFn implementations used in udataswp.c ------- */
    269 
    270 /* convert ASCII to EBCDIC and verify that all characters are invariant */
    271 U_CAPI int32_t U_EXPORT2
    272 uprv_ebcdicFromAscii(const UDataSwapper *ds,
    273                      const void *inData, int32_t length, void *outData,
    274                      UErrorCode *pErrorCode) {
    275     const uint8_t *s;
    276     uint8_t *t;
    277     uint8_t c;
    278 
    279     int32_t count;
    280 
    281     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    282         return 0;
    283     }
    284     if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
    285         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    286         return 0;
    287     }
    288 
    289     /* setup and swapping */
    290     s=(const uint8_t *)inData;
    291     t=(uint8_t *)outData;
    292     count=length;
    293     while(count>0) {
    294         c=*s++;
    295         if(!UCHAR_IS_INVARIANT(c)) {
    296             udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
    297                              length, length-count);
    298             *pErrorCode=U_INVALID_CHAR_FOUND;
    299             return 0;
    300         }
    301         *t++=ebcdicFromAscii[c];
    302         --count;
    303     }
    304 
    305     return length;
    306 }
    307 
    308 /* this function only checks and copies ASCII strings without conversion */
    309 U_CFUNC int32_t
    310 uprv_copyAscii(const UDataSwapper *ds,
    311                const void *inData, int32_t length, void *outData,
    312                UErrorCode *pErrorCode) {
    313     const uint8_t *s;
    314     uint8_t c;
    315 
    316     int32_t count;
    317 
    318     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    319         return 0;
    320     }
    321     if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
    322         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    323         return 0;
    324     }
    325 
    326     /* setup and checking */
    327     s=(const uint8_t *)inData;
    328     count=length;
    329     while(count>0) {
    330         c=*s++;
    331         if(!UCHAR_IS_INVARIANT(c)) {
    332             udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
    333                              length, length-count);
    334             *pErrorCode=U_INVALID_CHAR_FOUND;
    335             return 0;
    336         }
    337         --count;
    338     }
    339 
    340     if(length>0 && inData!=outData) {
    341         uprv_memcpy(outData, inData, length);
    342     }
    343 
    344     return length;
    345 }
    346 
    347 /* convert EBCDIC to ASCII and verify that all characters are invariant */
    348 U_CFUNC int32_t
    349 uprv_asciiFromEbcdic(const UDataSwapper *ds,
    350                      const void *inData, int32_t length, void *outData,
    351                      UErrorCode *pErrorCode) {
    352     const uint8_t *s;
    353     uint8_t *t;
    354     uint8_t c;
    355 
    356     int32_t count;
    357 
    358     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    359         return 0;
    360     }
    361     if(ds==NULL || inData==NULL || length<0 ||  (length>0 && outData==NULL)) {
    362         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    363         return 0;
    364     }
    365 
    366     /* setup and swapping */
    367     s=(const uint8_t *)inData;
    368     t=(uint8_t *)outData;
    369     count=length;
    370     while(count>0) {
    371         c=*s++;
    372         if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
    373             udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
    374                              length, length-count);
    375             *pErrorCode=U_INVALID_CHAR_FOUND;
    376             return 0;
    377         }
    378         *t++=c;
    379         --count;
    380     }
    381 
    382     return length;
    383 }
    384 
    385 /* this function only checks and copies EBCDIC strings without conversion */
    386 U_CFUNC int32_t
    387 uprv_copyEbcdic(const UDataSwapper *ds,
    388                 const void *inData, int32_t length, void *outData,
    389                 UErrorCode *pErrorCode) {
    390     const uint8_t *s;
    391     uint8_t c;
    392 
    393     int32_t count;
    394 
    395     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    396         return 0;
    397     }
    398     if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
    399         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    400         return 0;
    401     }
    402 
    403     /* setup and checking */
    404     s=(const uint8_t *)inData;
    405     count=length;
    406     while(count>0) {
    407         c=*s++;
    408         if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
    409             udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
    410                              length, length-count);
    411             *pErrorCode=U_INVALID_CHAR_FOUND;
    412             return 0;
    413         }
    414         --count;
    415     }
    416 
    417     if(length>0 && inData!=outData) {
    418         uprv_memcpy(outData, inData, length);
    419     }
    420 
    421     return length;
    422 }
    423 
    424 /* compare invariant strings; variant characters compare less than others and unlike each other */
    425 U_CFUNC int32_t
    426 uprv_compareInvAscii(const UDataSwapper *ds,
    427                      const char *outString, int32_t outLength,
    428                      const UChar *localString, int32_t localLength) {
    429     int32_t minLength;
    430     UChar32 c1, c2;
    431     uint8_t c;
    432 
    433     if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
    434         return 0;
    435     }
    436 
    437     if(outLength<0) {
    438         outLength=(int32_t)uprv_strlen(outString);
    439     }
    440     if(localLength<0) {
    441         localLength=u_strlen(localString);
    442     }
    443 
    444     minLength= outLength<localLength ? outLength : localLength;
    445 
    446     while(minLength>0) {
    447         c=(uint8_t)*outString++;
    448         if(UCHAR_IS_INVARIANT(c)) {
    449             c1=c;
    450         } else {
    451             c1=-1;
    452         }
    453 
    454         c2=*localString++;
    455         if(!UCHAR_IS_INVARIANT(c2)) {
    456             c2=-2;
    457         }
    458 
    459         if((c1-=c2)!=0) {
    460             return c1;
    461         }
    462 
    463         --minLength;
    464     }
    465 
    466     /* strings start with same prefix, compare lengths */
    467     return outLength-localLength;
    468 }
    469 
    470 U_CFUNC int32_t
    471 uprv_compareInvEbcdic(const UDataSwapper *ds,
    472                       const char *outString, int32_t outLength,
    473                       const UChar *localString, int32_t localLength) {
    474     int32_t minLength;
    475     UChar32 c1, c2;
    476     uint8_t c;
    477 
    478     if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
    479         return 0;
    480     }
    481 
    482     if(outLength<0) {
    483         outLength=(int32_t)uprv_strlen(outString);
    484     }
    485     if(localLength<0) {
    486         localLength=u_strlen(localString);
    487     }
    488 
    489     minLength= outLength<localLength ? outLength : localLength;
    490 
    491     while(minLength>0) {
    492         c=(uint8_t)*outString++;
    493         if(c==0) {
    494             c1=0;
    495         } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) {
    496             /* c1 is set */
    497         } else {
    498             c1=-1;
    499         }
    500 
    501         c2=*localString++;
    502         if(!UCHAR_IS_INVARIANT(c2)) {
    503             c2=-2;
    504         }
    505 
    506         if((c1-=c2)!=0) {
    507             return c1;
    508         }
    509 
    510         --minLength;
    511     }
    512 
    513     /* strings start with same prefix, compare lengths */
    514     return outLength-localLength;
    515 }
    516 
    517 U_CAPI int32_t U_EXPORT2
    518 uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) {
    519     int32_t c1, c2;
    520 
    521     for(;; ++s1, ++s2) {
    522         c1=(uint8_t)*s1;
    523         c2=(uint8_t)*s2;
    524         if(c1!=c2) {
    525             if(c1!=0 && ((c1=asciiFromEbcdic[c1])==0 || !UCHAR_IS_INVARIANT(c1))) {
    526                 c1=-(int32_t)(uint8_t)*s1;
    527             }
    528             if(c2!=0 && ((c2=asciiFromEbcdic[c2])==0 || !UCHAR_IS_INVARIANT(c2))) {
    529                 c2=-(int32_t)(uint8_t)*s2;
    530             }
    531             return c1-c2;
    532         } else if(c1==0) {
    533             return 0;
    534         }
    535     }
    536 }
    537