1 /* 2 * Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* Misc functions for conversion of Unicode and UTF-8 and platform encoding */ 27 28 #include <stdio.h> 29 #include <stddef.h> 30 #include <stdlib.h> 31 #include <stdarg.h> 32 #include <string.h> 33 #include <ctype.h> 34 35 #include "jni.h" 36 37 #include "utf.h" 38 39 /* 40 * Error handler 41 */ 42 void 43 utfError(char *file, int line, char *message) 44 { 45 (void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message); 46 abort(); 47 } 48 49 /* 50 * Convert UTF-8 to UTF-16 51 * Returns length or -1 if output overflows. 52 */ 53 int JNICALL 54 utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen) 55 { 56 int outputLen; 57 int i; 58 59 UTF_ASSERT(utf8); 60 UTF_ASSERT(len>=0); 61 UTF_ASSERT(output); 62 UTF_ASSERT(outputMaxLen>0); 63 64 i = 0; 65 outputLen = 0; 66 while ( i<len ) { 67 unsigned code, x, y, z; 68 69 if ( outputLen >= outputMaxLen ) { 70 return -1; 71 } 72 x = (unsigned char)utf8[i++]; 73 code = x; 74 if ( (x & 0xE0)==0xE0 ) { 75 y = (unsigned char)utf8[i++]; 76 z = (unsigned char)utf8[i++]; 77 code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F); 78 } else if ( (x & 0xC0)==0xC0 ) { 79 y = (unsigned char)utf8[i++]; 80 code = ((x & 0x1F)<<6) + (y & 0x3F); 81 } 82 output[outputLen++] = code; 83 } 84 return outputLen; 85 } 86 87 /* 88 * Convert UTF-16 to UTF-8 Modified 89 * Returns length or -1 if output overflows. 90 */ 91 int JNICALL 92 utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen) 93 { 94 int i; 95 int outputLen; 96 97 UTF_ASSERT(utf16); 98 UTF_ASSERT(len>=0); 99 UTF_ASSERT(output); 100 UTF_ASSERT(outputMaxLen>0); 101 102 outputLen = 0; 103 for (i = 0; i < len; i++) { 104 unsigned code; 105 106 code = utf16[i]; 107 if ( code >= 0x0001 && code <= 0x007F ) { 108 if ( outputLen + 1 >= outputMaxLen ) { 109 return -1; 110 } 111 output[outputLen++] = code; 112 } else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) { 113 if ( outputLen + 2 >= outputMaxLen ) { 114 return -1; 115 } 116 output[outputLen++] = ((code>>6) & 0x1F) | 0xC0; 117 output[outputLen++] = (code & 0x3F) | 0x80; 118 } else if ( code >= 0x0800 && code <= 0xFFFF ) { 119 if ( outputLen + 3 >= outputMaxLen ) { 120 return -1; 121 } 122 output[outputLen++] = ((code>>12) & 0x0F) | 0xE0; 123 output[outputLen++] = ((code>>6) & 0x3F) | 0x80; 124 output[outputLen++] = (code & 0x3F) | 0x80; 125 } 126 } 127 output[outputLen] = 0; 128 return outputLen; 129 } 130 131 int JNICALL 132 utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen) 133 { 134 return -1; /* FIXUP */ 135 } 136 137 /* Determine length of this Standard UTF-8 in Modified UTF-8. 138 * Validation is done of the basic UTF encoding rules, returns 139 * length (no change) when errors are detected in the UTF encoding. 140 * 141 * Note: Accepts Modified UTF-8 also, no verification on the 142 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok. 143 */ 144 int JNICALL 145 utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length) 146 { 147 int newLength; 148 int i; 149 150 newLength = 0; 151 for ( i = 0 ; i < length ; i++ ) { 152 unsigned byte; 153 154 byte = (unsigned char)string[i]; 155 if ( (byte & 0x80) == 0 ) { /* 1byte encoding */ 156 newLength++; 157 if ( byte == 0 ) { 158 newLength++; /* We gain one byte in length on NULL bytes */ 159 } 160 } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */ 161 /* Check encoding of following bytes */ 162 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) { 163 break; /* Error condition */ 164 } 165 i++; /* Skip next byte */ 166 newLength += 2; 167 } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */ 168 /* Check encoding of following bytes */ 169 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80 170 || (string[i+2] & 0xC0) != 0x80 ) { 171 break; /* Error condition */ 172 } 173 i += 2; /* Skip next two bytes */ 174 newLength += 3; 175 } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */ 176 /* Check encoding of following bytes */ 177 if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80 178 || (string[i+2] & 0xC0) != 0x80 179 || (string[i+3] & 0xC0) != 0x80 ) { 180 break; /* Error condition */ 181 } 182 i += 3; /* Skip next 3 bytes */ 183 newLength += 6; /* 4byte encoding turns into 2 3byte ones */ 184 } else { 185 break; /* Error condition */ 186 } 187 } 188 if ( i != length ) { 189 /* Error in finding new length, return old length so no conversion */ 190 /* FIXUP: ERROR_MESSAGE? */ 191 return length; 192 } 193 return newLength; 194 } 195 196 /* Convert Standard UTF-8 to Modified UTF-8. 197 * Assumes the UTF-8 encoding was validated by utf8mLength() above. 198 * 199 * Note: Accepts Modified UTF-8 also, no verification on the 200 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok. 201 */ 202 void JNICALL 203 utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength) 204 { 205 int i; 206 int j; 207 208 j = 0; 209 for ( i = 0 ; i < length ; i++ ) { 210 unsigned byte1; 211 212 byte1 = (unsigned char)string[i]; 213 214 /* NULL bytes and bytes starting with 11110xxx are special */ 215 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ 216 if ( byte1 == 0 ) { 217 /* Bits out: 11000000 10000000 */ 218 newString[j++] = (jbyte)0xC0; 219 newString[j++] = (jbyte)0x80; 220 } else { 221 /* Single byte */ 222 newString[j++] = byte1; 223 } 224 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ 225 newString[j++] = byte1; 226 newString[j++] = string[++i]; 227 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ 228 newString[j++] = byte1; 229 newString[j++] = string[++i]; 230 newString[j++] = string[++i]; 231 } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */ 232 /* Beginning of 4byte encoding, turn into 2 3byte encodings */ 233 unsigned byte2, byte3, byte4, u21; 234 235 /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 236 byte2 = (unsigned char)string[++i]; 237 byte3 = (unsigned char)string[++i]; 238 byte4 = (unsigned char)string[++i]; 239 /* Reconstruct full 21bit value */ 240 u21 = (byte1 & 0x07) << 18; 241 u21 += (byte2 & 0x3F) << 12; 242 u21 += (byte3 & 0x3F) << 6; 243 u21 += (byte4 & 0x3F); 244 /* Bits out: 11101101 1010xxxx 10xxxxxx */ 245 newString[j++] = (jbyte)0xED; 246 newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F)); 247 newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F)); 248 /* Bits out: 11101101 1011xxxx 10xxxxxx */ 249 newString[j++] = (jbyte)0xED; 250 newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F)); 251 newString[j++] = byte4; 252 } 253 } 254 UTF_ASSERT(i==length); 255 UTF_ASSERT(j==newLength); 256 newString[j] = (jbyte)0; 257 } 258 259 /* Given a Modified UTF-8 string, calculate the Standard UTF-8 length. 260 * Basic validation of the UTF encoding rules is done, and length is 261 * returned (no change) when errors are detected. 262 * 263 * Note: No validation is made that this is indeed Modified UTF-8 coming in. 264 * 265 */ 266 int JNICALL 267 utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length) 268 { 269 int newLength; 270 int i; 271 272 newLength = 0; 273 for ( i = 0 ; i < length ; i++ ) { 274 unsigned byte1, byte2, byte3, byte4, byte5, byte6; 275 276 byte1 = (unsigned char)string[i]; 277 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ 278 newLength++; 279 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ 280 /* Check encoding of following bytes */ 281 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) { 282 break; /* Error condition */ 283 } 284 byte2 = (unsigned char)string[++i]; 285 if ( byte1 != 0xC0 || byte2 != 0x80 ) { 286 newLength += 2; /* Normal 2byte encoding, not 0xC080 */ 287 } else { 288 newLength++; /* We will turn 0xC080 into 0 */ 289 } 290 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ 291 /* Check encoding of following bytes */ 292 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80 293 || (string[i+2] & 0xC0) != 0x80 ) { 294 break; /* Error condition */ 295 } 296 byte2 = (unsigned char)string[++i]; 297 byte3 = (unsigned char)string[++i]; 298 newLength += 3; 299 /* Possible process a second 3byte encoding */ 300 if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) { 301 /* See if this is a pair of 3byte encodings */ 302 byte4 = (unsigned char)string[i+1]; 303 byte5 = (unsigned char)string[i+2]; 304 byte6 = (unsigned char)string[i+3]; 305 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) { 306 /* Check encoding of 3rd byte */ 307 if ( (byte6 & 0xC0) != 0x80 ) { 308 break; /* Error condition */ 309 } 310 newLength++; /* New string will have 4byte encoding */ 311 i += 3; /* Skip next 3 bytes */ 312 } 313 } 314 } else { 315 break; /* Error condition */ 316 } 317 } 318 if ( i != length ) { 319 /* Error in UTF encoding */ 320 /* FIXUP: ERROR_MESSAGE()? */ 321 return length; 322 } 323 return newLength; 324 } 325 326 /* Convert a Modified UTF-8 string into a Standard UTF-8 string 327 * It is assumed that this string has been validated in terms of the 328 * basic UTF encoding rules by utf8Length() above. 329 * 330 * Note: No validation is made that this is indeed Modified UTF-8 coming in. 331 * 332 */ 333 void JNICALL 334 utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength) 335 { 336 int i; 337 int j; 338 339 j = 0; 340 for ( i = 0 ; i < length ; i++ ) { 341 unsigned byte1, byte2, byte3, byte4, byte5, byte6; 342 343 byte1 = (unsigned char)string[i]; 344 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ 345 /* Single byte */ 346 newString[j++] = byte1; 347 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ 348 byte2 = (unsigned char)string[++i]; 349 if ( byte1 != 0xC0 || byte2 != 0x80 ) { 350 newString[j++] = byte1; 351 newString[j++] = byte2; 352 } else { 353 newString[j++] = 0; 354 } 355 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ 356 byte2 = (unsigned char)string[++i]; 357 byte3 = (unsigned char)string[++i]; 358 if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) { 359 /* See if this is a pair of 3byte encodings */ 360 byte4 = (unsigned char)string[i+1]; 361 byte5 = (unsigned char)string[i+2]; 362 byte6 = (unsigned char)string[i+3]; 363 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) { 364 unsigned u21; 365 366 /* Bits in: 11101101 1010xxxx 10xxxxxx */ 367 /* Bits in: 11101101 1011xxxx 10xxxxxx */ 368 i += 3; 369 370 /* Reconstruct 21 bit code */ 371 u21 = ((byte2 & 0x0F) + 1) << 16; 372 u21 += (byte3 & 0x3F) << 10; 373 u21 += (byte5 & 0x0F) << 6; 374 u21 += (byte6 & 0x3F); 375 376 /* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 377 378 /* Convert to 4byte encoding */ 379 newString[j++] = 0xF0 + ((u21 >> 18) & 0x07); 380 newString[j++] = 0x80 + ((u21 >> 12) & 0x3F); 381 newString[j++] = 0x80 + ((u21 >> 6) & 0x3F); 382 newString[j++] = 0x80 + (u21 & 0x3F); 383 continue; 384 } 385 } 386 /* Normal 3byte encoding */ 387 newString[j++] = byte1; 388 newString[j++] = byte2; 389 newString[j++] = byte3; 390 } 391 } 392 UTF_ASSERT(i==length); 393 UTF_ASSERT(j==newLength); 394 newString[j] = 0; 395 } 396 397 /* ================================================================= */ 398 399 #ifdef COMPILE_WITH_UTF_TEST /* Test program */ 400 401 /* 402 * Convert any byte array into a printable string. 403 * Returns length or -1 if output overflows. 404 */ 405 static int 406 bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen) 407 { 408 int outputLen; 409 int i; 410 411 UTF_ASSERT(bytes); 412 UTF_ASSERT(len>=0); 413 UTF_ASSERT(output); 414 UTF_ASSERT(outputMaxLen>=0); 415 416 outputLen = 0; 417 for ( i=0; i<len ; i++ ) { 418 unsigned byte; 419 420 byte = bytes[i]; 421 if ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) { 422 if ( outputLen + 1 >= outputMaxLen ) { 423 return -1; 424 } 425 output[outputLen++] = (char)byte; 426 } else { 427 if ( outputLen + 4 >= outputMaxLen ) { 428 return -1; 429 } 430 (void)sprintf(output+outputLen,"\\x%02x",byte); 431 outputLen += 4; 432 } 433 } 434 output[outputLen] = 0; 435 return outputLen; 436 } 437 438 static void 439 test(void) 440 { 441 static char *strings[] = { 442 "characters", 443 "abcdefghijklmnopqrstuvwxyz", 444 "0123456789", 445 "!@#$%^&*()_+=-{}[]:;", 446 NULL }; 447 int i; 448 struct UtfInst *ui; 449 450 ui = utfInitialize(NULL); 451 452 i = 0; 453 while ( strings[i] != NULL ) { 454 char *str; 455 #define MAX 1024 456 char buf0[MAX]; 457 char buf1[MAX]; 458 char buf2[MAX]; 459 unsigned short buf3[MAX]; 460 int len1; 461 int len2; 462 int len3; 463 464 str = strings[i]; 465 466 (void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024); 467 468 len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024); 469 470 UTF_ASSERT(len1==(int)strlen(str)); 471 472 len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024); 473 474 UTF_ASSERT(len3==len1); 475 476 len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024); 477 478 UTF_ASSERT(len1==len3); 479 UTF_ASSERT(strcmp(str, buf1) == 0); 480 481 len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024); 482 483 UTF_ASSERT(len2==len1); 484 UTF_ASSERT(strcmp(str, buf2) == 0); 485 486 i++; 487 } 488 489 utfTerminate(ui, NULL); 490 491 } 492 493 int 494 main(int argc, char **argv) 495 { 496 test(); 497 return 0; 498 } 499 500 #endif 501