1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2006, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uinvchar.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:2 12 * 13 * created on: 2004sep14 14 * created by: Markus W. Scherer 15 * 16 * Functions for handling invariant characters, moved here from putil.c 17 * for better modularization. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "unicode/ustring.h" 22 #include "udataswp.h" 23 #include "cstring.h" 24 #include "cmemory.h" 25 #include "uassert.h" 26 #include "uinvchar.h" 27 28 /* invariant-character handling --------------------------------------------- */ 29 30 /* 31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h) 32 * appropriately for most EBCDIC codepages. 33 * 34 * They currently also map most other ASCII graphic characters, 35 * appropriately for codepages 37 and 1047. 36 * Exceptions: The characters for []^ have different codes in 37 & 1047. 37 * Both versions are mapped to ASCII. 38 * 39 * ASCII 37 1047 40 * [ 5B BA AD 41 * ] 5D BB BD 42 * ^ 5E B0 5F 43 * 44 * There are no mappings for variant characters from Unicode to EBCDIC. 45 * 46 * Currently, C0 control codes are also included in these maps. 47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other 48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A), 49 * but there is no mapping for ASCII LF back to EBCDIC. 50 * 51 * ASCII EBCDIC S/390-OE 52 * LF 0A 25 15 53 * NEL 85 15 25 54 * 55 * The maps below explicitly exclude the variant 56 * control and graphical characters that are in ASCII-based 57 * codepages at 0x80 and above. 58 * "No mapping" is expressed by mapping to a 00 byte. 59 * 60 * These tables do not establish a converter or a codepage. 
*/

/*
 * asciiFromEbcdic[e] is the ASCII code for the EBCDIC byte e.
 * An entry of 0x00 at any index other than 0 means "no mapping".
 *
 * The table carries both the codepage-37 and the codepage-1047 codes for
 * '[' (BA and AD), ']' (BB and BD) and '^' (B0 and 5F), and it maps both
 * 15 and 25 to ASCII LF (0A).
 * Each source row below covers 16 consecutive EBCDIC codes (00.., 10.., ...).
 */
static const uint8_t asciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,

    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,

    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,

    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

/*
 * ebcdicFromAscii[a] is the EBCDIC byte for the ASCII code a.
 * An entry of 0 at any index other than 0 means "no mapping"; this is the
 * case for LF (0A), for the graphic characters ! # $ @ [ \ ] ^ ` { | } ~
 * and for every code from 80 up.
 * Each source row below covers 16 consecutive ASCII codes (00.., 10.., ...).
 */
static const uint8_t ebcdicFromAscii[256]={
    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,

    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/*
 * Bit sets indicating which characters of the ASCII repertoire
 * (by ASCII/Unicode code) are "invariant".
 * See utypes.h for more details.
 *
 * As invariant are considered the characters of the ASCII repertoire except
 * for the following:
 *   21  '!'
<exclamation mark> 115 * 23 '#' <number sign> 116 * 24 '$' <dollar sign> 117 * 118 * 40 '@' <commercial at> 119 * 120 * 5b '[' <left bracket> 121 * 5c '\' <backslash> 122 * 5d ']' <right bracket> 123 * 5e '^' <circumflex> 124 * 125 * 60 '`' <grave accent> 126 * 127 * 7b '{' <left brace> 128 * 7c '|' <vertical line> 129 * 7d '}' <right brace> 130 * 7e '~' <tilde> 131 */ 132 static const uint32_t invariantChars[4]={ 133 0xfffffbff, /* 00..1f but not 0a */ 134 0xffffffe5, /* 20..3f but not 21 23 24 */ 135 0x87fffffe, /* 40..5f but not 40 5b..5e */ 136 0x87fffffe /* 60..7f but not 60 7b..7e */ 137 }; 138 139 /* 140 * test unsigned types (or values known to be non-negative) for invariant characters, 141 * tests ASCII-family character values 142 */ 143 #define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0) 144 145 /* test signed types for invariant characters, adds test for positive values */ 146 #define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c)) 147 148 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 149 #define CHAR_TO_UCHAR(c) c 150 #define UCHAR_TO_CHAR(c) c 151 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY 152 #define CHAR_TO_UCHAR(u) asciiFromEbcdic[u] 153 #define UCHAR_TO_CHAR(u) ebcdicFromAscii[u] 154 #else 155 # error U_CHARSET_FAMILY is not valid 156 #endif 157 158 159 U_CAPI void U_EXPORT2 160 u_charsToUChars(const char *cs, UChar *us, int32_t length) { 161 UChar u; 162 uint8_t c; 163 164 /* 165 * Allow the entire ASCII repertoire to be mapped _to_ Unicode. 166 * For EBCDIC systems, this works for characters with codes from 167 * codepages 37 and 1047 or compatible. 168 */ 169 while(length>0) { 170 c=(uint8_t)(*cs++); 171 u=(UChar)CHAR_TO_UCHAR(c); 172 U_ASSERT((u!=0 || c==0)); /* only invariant chars converted? 
*/ 173 *us++=u; 174 --length; 175 } 176 } 177 178 U_CAPI void U_EXPORT2 179 u_UCharsToChars(const UChar *us, char *cs, int32_t length) { 180 UChar u; 181 182 while(length>0) { 183 u=*us++; 184 if(!UCHAR_IS_INVARIANT(u)) { 185 U_ASSERT(FALSE); /* Variant characters were used. These are not portable in ICU. */ 186 u=0; 187 } 188 *cs++=(char)UCHAR_TO_CHAR(u); 189 --length; 190 } 191 } 192 193 U_CAPI UBool U_EXPORT2 194 uprv_isInvariantString(const char *s, int32_t length) { 195 uint8_t c; 196 197 for(;;) { 198 if(length<0) { 199 /* NUL-terminated */ 200 c=(uint8_t)*s++; 201 if(c==0) { 202 break; 203 } 204 } else { 205 /* count length */ 206 if(length==0) { 207 break; 208 } 209 --length; 210 c=(uint8_t)*s++; 211 if(c==0) { 212 continue; /* NUL is invariant */ 213 } 214 } 215 /* c!=0 now, one branch below checks c==0 for variant characters */ 216 217 /* 218 * no assertions here because these functions are legitimately called 219 * for strings with variant characters 220 */ 221 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 222 if(!UCHAR_IS_INVARIANT(c)) { 223 return FALSE; /* found a variant char */ 224 } 225 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY 226 c=CHAR_TO_UCHAR(c); 227 if(c==0 || !UCHAR_IS_INVARIANT(c)) { 228 return FALSE; /* found a variant char */ 229 } 230 #else 231 # error U_CHARSET_FAMILY is not valid 232 #endif 233 } 234 return TRUE; 235 } 236 237 U_CAPI UBool U_EXPORT2 238 uprv_isInvariantUString(const UChar *s, int32_t length) { 239 UChar c; 240 241 for(;;) { 242 if(length<0) { 243 /* NUL-terminated */ 244 c=*s++; 245 if(c==0) { 246 break; 247 } 248 } else { 249 /* count length */ 250 if(length==0) { 251 break; 252 } 253 --length; 254 c=*s++; 255 } 256 257 /* 258 * no assertions here because these functions are legitimately called 259 * for strings with variant characters 260 */ 261 if(!UCHAR_IS_INVARIANT(c)) { 262 return FALSE; /* found a variant char */ 263 } 264 } 265 return TRUE; 266 } 267 268 /* UDataSwapFn implementations used in udataswp.c ------- */ 269 270 
/* convert ASCII to EBCDIC and verify that all characters are invariant */ 271 U_CAPI int32_t U_EXPORT2 272 uprv_ebcdicFromAscii(const UDataSwapper *ds, 273 const void *inData, int32_t length, void *outData, 274 UErrorCode *pErrorCode) { 275 const uint8_t *s; 276 uint8_t *t; 277 uint8_t c; 278 279 int32_t count; 280 281 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 282 return 0; 283 } 284 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { 285 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 286 return 0; 287 } 288 289 /* setup and swapping */ 290 s=(const uint8_t *)inData; 291 t=(uint8_t *)outData; 292 count=length; 293 while(count>0) { 294 c=*s++; 295 if(!UCHAR_IS_INVARIANT(c)) { 296 udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n", 297 length, length-count); 298 *pErrorCode=U_INVALID_CHAR_FOUND; 299 return 0; 300 } 301 *t++=ebcdicFromAscii[c]; 302 --count; 303 } 304 305 return length; 306 } 307 308 /* this function only checks and copies ASCII strings without conversion */ 309 U_CFUNC int32_t 310 uprv_copyAscii(const UDataSwapper *ds, 311 const void *inData, int32_t length, void *outData, 312 UErrorCode *pErrorCode) { 313 const uint8_t *s; 314 uint8_t c; 315 316 int32_t count; 317 318 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 319 return 0; 320 } 321 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { 322 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 323 return 0; 324 } 325 326 /* setup and checking */ 327 s=(const uint8_t *)inData; 328 count=length; 329 while(count>0) { 330 c=*s++; 331 if(!UCHAR_IS_INVARIANT(c)) { 332 udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n", 333 length, length-count); 334 *pErrorCode=U_INVALID_CHAR_FOUND; 335 return 0; 336 } 337 --count; 338 } 339 340 if(length>0 && inData!=outData) { 341 uprv_memcpy(outData, inData, length); 342 } 343 344 return length; 345 } 346 347 /* convert EBCDIC to ASCII and 
verify that all characters are invariant */ 348 U_CFUNC int32_t 349 uprv_asciiFromEbcdic(const UDataSwapper *ds, 350 const void *inData, int32_t length, void *outData, 351 UErrorCode *pErrorCode) { 352 const uint8_t *s; 353 uint8_t *t; 354 uint8_t c; 355 356 int32_t count; 357 358 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 359 return 0; 360 } 361 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { 362 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 363 return 0; 364 } 365 366 /* setup and swapping */ 367 s=(const uint8_t *)inData; 368 t=(uint8_t *)outData; 369 count=length; 370 while(count>0) { 371 c=*s++; 372 if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) { 373 udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n", 374 length, length-count); 375 *pErrorCode=U_INVALID_CHAR_FOUND; 376 return 0; 377 } 378 *t++=c; 379 --count; 380 } 381 382 return length; 383 } 384 385 /* this function only checks and copies EBCDIC strings without conversion */ 386 U_CFUNC int32_t 387 uprv_copyEbcdic(const UDataSwapper *ds, 388 const void *inData, int32_t length, void *outData, 389 UErrorCode *pErrorCode) { 390 const uint8_t *s; 391 uint8_t c; 392 393 int32_t count; 394 395 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 396 return 0; 397 } 398 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) { 399 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 400 return 0; 401 } 402 403 /* setup and checking */ 404 s=(const uint8_t *)inData; 405 count=length; 406 while(count>0) { 407 c=*s++; 408 if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) { 409 udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n", 410 length, length-count); 411 *pErrorCode=U_INVALID_CHAR_FOUND; 412 return 0; 413 } 414 --count; 415 } 416 417 if(length>0 && inData!=outData) { 418 uprv_memcpy(outData, inData, length); 419 } 420 421 return length; 422 } 423 424 /* compare 
invariant strings; variant characters compare less than others and unlike each other */ 425 U_CFUNC int32_t 426 uprv_compareInvAscii(const UDataSwapper *ds, 427 const char *outString, int32_t outLength, 428 const UChar *localString, int32_t localLength) { 429 int32_t minLength; 430 UChar32 c1, c2; 431 uint8_t c; 432 433 if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) { 434 return 0; 435 } 436 437 if(outLength<0) { 438 outLength=(int32_t)uprv_strlen(outString); 439 } 440 if(localLength<0) { 441 localLength=u_strlen(localString); 442 } 443 444 minLength= outLength<localLength ? outLength : localLength; 445 446 while(minLength>0) { 447 c=(uint8_t)*outString++; 448 if(UCHAR_IS_INVARIANT(c)) { 449 c1=c; 450 } else { 451 c1=-1; 452 } 453 454 c2=*localString++; 455 if(!UCHAR_IS_INVARIANT(c2)) { 456 c1=-2; 457 } 458 459 if((c1-=c2)!=0) { 460 return c1; 461 } 462 463 --minLength; 464 } 465 466 /* strings start with same prefix, compare lengths */ 467 return outLength-localLength; 468 } 469 470 U_CFUNC int32_t 471 uprv_compareInvEbcdic(const UDataSwapper *ds, 472 const char *outString, int32_t outLength, 473 const UChar *localString, int32_t localLength) { 474 int32_t minLength; 475 UChar32 c1, c2; 476 uint8_t c; 477 478 if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) { 479 return 0; 480 } 481 482 if(outLength<0) { 483 outLength=(int32_t)uprv_strlen(outString); 484 } 485 if(localLength<0) { 486 localLength=u_strlen(localString); 487 } 488 489 minLength= outLength<localLength ? 
outLength : localLength; 490 491 while(minLength>0) { 492 c=(uint8_t)*outString++; 493 if(c==0) { 494 c1=0; 495 } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) { 496 /* c1 is set */ 497 } else { 498 c1=-1; 499 } 500 501 c2=*localString++; 502 if(!UCHAR_IS_INVARIANT(c2)) { 503 c1=-2; 504 } 505 506 if((c1-=c2)!=0) { 507 return c1; 508 } 509 510 --minLength; 511 } 512 513 /* strings start with same prefix, compare lengths */ 514 return outLength-localLength; 515 } 516