1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <objmng/drm_i18n.h> 18 19 #define IS_GB2312_HIGH_BYTE(c) ((c) >= 0xA1 && (c) <= 0xF7) 20 #define IS_GB2312_LOW_BYTE(c) ((c) >= 0xA1 && (c) <= 0xFE) 21 #define IS_GBK_HIGH_BYTE(c) ((c) >= 0x81 && (c) <= 0xFE) 22 #define IS_GBK_LOW_BYTE(c) ((c) >= 0x40 && (c) <= 0xFE && (c) != 0x7F) 23 #define IS_BIG5_HIGH_BYTE(c) ((c) >= 0xA1 && (c) <= 0xF9) 24 #define IS_BIG5_LOW_BYTE(c) (((c) >= 0x40 && (c) <= 0x7E) \ 25 || ((c) >= 0xA1 && (c) <= 0xFE)) 26 #define IS_ASCII(c) ((c) <= 127) 27 28 #define INVALID_UNICODE 0xFFFD 29 30 #define I18N_LATIN1_SUPPORT 31 #define I18N_UTF8_UTF16_SUPPORT 32 33 34 /** 35 * Simply convert ISO 8859-1 (latin1) to unicode 36 */ 37 static int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen, 38 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 39 int32_t *bytesConsumed); 40 41 /** 42 * Convert one unicode char to ISO 8859-1 (latin1) byte 43 */ 44 static int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize); 45 46 /** 47 * Convert UTF-8 to unicode 48 */ 49 static int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen, 50 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 51 int32_t *bytesConsumed); 52 53 /** 54 * Convert one unicode char to UTF-8 bytes 55 */ 56 static int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize); 57 58 /** 59 * Convert UTF-16 BE to unicode 60 */ 61 static int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen, 62 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 63 int32_t *bytesConsumed); 64 65 /** 66 * Convert one unicode char to UTF-16 BE bytes 67 */ 68 static int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize); 69 70 /** 71 * Convert UTF-16 LE to unicode 72 */ 73 static int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen, 74 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 75 int32_t *bytesConsumed); 76 77 /** 78 * Convert one unicode char to UTF-16 LE bytes 79 */ 80 static int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize); 81 82 /* 83 * see drm_i18n.h 84 */ 85 int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset, 86 const uint8_t *mbs, int32_t mbsLen, 87 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 88 int32_t *bytesConsumed) 89 { 90 switch (charset) 91 { 92 #ifdef I18N_GB2312_SUPPORT 93 case DRM_CHARSET_GB2312: 94 return gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 95 #endif 96 #ifdef I18N_GBK_SUPPORT 97 case DRM_CHARSET_GBK: 98 return gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 99 #endif 100 #ifdef I18N_BIG5_SUPPORT 101 case DRM_CHARSET_BIG5: 102 return big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 103 #endif 104 #ifdef I18N_LATIN1_SUPPORT 105 case DRM_CHARSET_LATIN1: 106 return latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 107 #endif 108 #ifdef I18N_ISO8859X_SUPPORT 109 case DRM_CHARSET_LATIN2: 110 case DRM_CHARSET_LATIN3: 111 case DRM_CHARSET_LATIN4: 112 case DRM_CHARSET_CYRILLIC: 113 case DRM_CHARSET_ARABIC: 114 case DRM_CHARSET_GREEK: 115 case DRM_CHARSET_HEBREW: 116 case DRM_CHARSET_LATIN5: 117 case DRM_CHARSET_LATIN6: 118 case DRM_CHARSET_THAI: 119 case DRM_CHARSET_LATIN7: 120 case DRM_CHARSET_LATIN8: 121 case DRM_CHARSET_LATIN9: 122 case DRM_CHARSET_LATIN10: 123 return iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 124 #endif 125 #ifdef I18N_UTF8_UTF16_SUPPORT 126 case DRM_CHARSET_UTF8: 127 return utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 128 case DRM_CHARSET_UTF16BE: 129 return utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 130 case DRM_CHARSET_UTF16LE: 131 return utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed); 132 #endif 133 default: 134 return -1; 135 } 136 } 137 138 /* 139 * see drm_i18n.h 140 */ 141 int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset, 142 const uint16_t *wcs, int32_t wcsLen, 143 uint8_t *mbsBuf, int32_t bufSizeInByte) 144 { 145 int32_t (* wcToMbFunc)(uint16_t, uint8_t *, int32_t); 146 int32_t charIndex = 0; 147 int32_t numMultiBytes = 0; 148 149 switch (charset) 150 { 151 #ifdef I18N_LATIN1_SUPPORT 152 case DRM_CHARSET_LATIN1: 153 wcToMbFunc = wcToLatin1; 154 break; 155 #endif 156 #ifdef I18N_UTF8_UTF16_SUPPORT 157 case DRM_CHARSET_UTF8: 158 wcToMbFunc = wcToUtf8; 159 break; 160 case DRM_CHARSET_UTF16BE: 161 wcToMbFunc = wcToUtf16be; 162 break; 163 case DRM_CHARSET_UTF16LE: 164 wcToMbFunc = wcToUtf16le; 165 break; 166 #endif 167 #ifdef I18N_ISO8859X_SUPPORT 168 case DRM_CHARSET_LATIN2: 169 case DRM_CHARSET_LATIN3: 170 case DRM_CHARSET_LATIN4: 171 case DRM_CHARSET_CYRILLIC: 172 case DRM_CHARSET_ARABIC: 173 case DRM_CHARSET_GREEK: 174 case DRM_CHARSET_HEBREW: 175 case DRM_CHARSET_LATIN5: 176 case DRM_CHARSET_LATIN6: 177 case DRM_CHARSET_THAI: 178 case DRM_CHARSET_LATIN7: 179 case DRM_CHARSET_LATIN8: 180 case DRM_CHARSET_LATIN9: 181 case DRM_CHARSET_LATIN10: 182 return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte); 183 #endif 184 default: 185 return -1; 186 } 187 188 if (mbsBuf) { 189 while (numMultiBytes < bufSizeInByte && charIndex < wcsLen) { 190 /* TODO: handle surrogate pair values here */ 191 int32_t mbLen = wcToMbFunc(wcs[charIndex], 192 &mbsBuf[numMultiBytes], bufSizeInByte - numMultiBytes); 193 194 if (numMultiBytes + mbLen > bufSizeInByte) { 195 /* Insufficient buffer. Don't update numMultiBytes */ 196 break; 197 } 198 charIndex++; 199 numMultiBytes += mbLen; 200 } 201 } else { 202 while (charIndex < wcsLen) { 203 /* TODO: handle surrogate pair values here */ 204 numMultiBytes += wcToMbFunc(wcs[charIndex], NULL, 0); 205 charIndex++; 206 } 207 } 208 209 return numMultiBytes; 210 } 211 212 213 #ifdef I18N_LATIN1_SUPPORT 214 215 int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen, 216 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 217 int32_t *bytesConsumed) 218 { 219 int32_t charsToConvert; 220 int32_t len; 221 222 if (wcsBuf == NULL) { 223 return mbsLen; 224 } 225 226 len = charsToConvert = mbsLen > bufSizeInWideChar ? bufSizeInWideChar : mbsLen; 227 if (len < 0) 228 return 0; 229 while (len--) { 230 *wcsBuf++ = *mbs++; 231 } 232 233 if (bytesConsumed) 234 *bytesConsumed = charsToConvert; 235 236 return charsToConvert; 237 } 238 239 int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize) 240 { 241 uint8_t ch; 242 243 if (wc < 0x100) { 244 ch = (uint8_t)(wc & 0xff); 245 } else { 246 ch = '?'; 247 } 248 if (mbs && bufSize > 0) 249 *mbs = ch; 250 return 1; 251 } 252 253 #endif /* I18N_LATIN1_SUPPORT */ 254 255 #ifdef I18N_UTF8_UTF16_SUPPORT 256 257 int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen, 258 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 259 int32_t *bytesConsumed) 260 { 261 int32_t charsConverted = 0; 262 int32_t i = 0; 263 int32_t wideChar; 264 265 if (wcsBuf == NULL) { 266 /* No conversion but we're still going to calculate bytesConsumed */ 267 bufSizeInWideChar = mbsLen * 2; 268 } 269 270 while((i < mbsLen) && (charsConverted < bufSizeInWideChar)) { 271 uint8_t ch = mbs[i]; 272 uint8_t ch2, ch3, ch4; 273 274 wideChar = -1; 275 276 if(IS_ASCII(ch)) { 277 wideChar = ch; 278 i++; 279 } else if ((ch & 0xc0) == 0xc0) { 280 int utfStart = i; 281 if ((ch & 0xe0) == 0xc0) { 282 /* 2 byte sequence */ 283 if (i + 1 < mbsLen && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80) { 284 wideChar = (uint16_t)(((ch & 0x1F) << 6) | (ch2 & 0x3F)); 285 i += 2; 286 } else { 287 /* skip incomplete sequence */ 288 i++; 289 } 290 } else if ((ch & 0xf0) == 0xe0) { 291 /* 3 byte sequence */ 292 if (i + 2 < mbsLen 293 && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80 294 && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80) { 295 wideChar = (uint16_t)(((ch & 0x0F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F)); 296 i += 3; 297 } else { 298 /* skip incomplete sequence (up to 2 bytes) */ 299 i++; 300 if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) 301 i++; 302 } 303 } else if ((ch & 0xf8) == 0xf0) { 304 /* 4 byte sequence */ 305 if (i + 3 < mbsLen 306 && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80 307 && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80 308 && ((ch4 = mbs[i + 3]) & 0xc0) == 0x80) { 309 /* FIXME: we do NOT support U+10000 - U+10FFFF for now. 310 * leave it as 0xFFFD. */ 311 wideChar = INVALID_UNICODE; 312 i += 4; 313 } else { 314 /* skip incomplete sequence (up to 3 bytes) */ 315 i++; 316 if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) { 317 i++; 318 if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) { 319 i++; 320 } 321 } 322 } 323 } else { 324 /* invalid */ 325 i++; 326 } 327 if (i >= mbsLen && wideChar == -1) { 328 /* Possible incomplete UTF-8 sequence at the end of mbs. 329 * Leave it to the caller. 330 */ 331 i = utfStart; 332 break; 333 } 334 } else { 335 /* invalid */ 336 i++; 337 } 338 if(wcsBuf) { 339 if (wideChar == -1) 340 wideChar = INVALID_UNICODE; 341 wcsBuf[charsConverted] = (uint16_t)wideChar; 342 } 343 charsConverted++; 344 } 345 346 if (bytesConsumed) 347 *bytesConsumed = i; 348 349 return charsConverted; 350 } 351 352 int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize) 353 { 354 if (wc <= 0x7f) { 355 if (mbs && (bufSize >= 1)) { 356 *mbs = (uint8_t)wc; 357 } 358 return 1; 359 } else if (wc <= 0x7ff) { 360 if (mbs && (bufSize >= 2)) { 361 *mbs++ = (uint8_t)((wc >> 6) | 0xc0); 362 *mbs = (uint8_t)((wc & 0x3f) | 0x80); 363 } 364 return 2; 365 } else { 366 if (mbs && (bufSize >= 3)) { 367 *mbs++ = (uint8_t)((wc >> 12) | 0xe0); 368 *mbs++ = (uint8_t)(((wc >> 6) & 0x3f)| 0x80); 369 *mbs = (uint8_t)((wc & 0x3f) | 0x80); 370 } 371 return 3; 372 } 373 } 374 375 int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen, 376 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 377 int32_t *bytesConsumed) 378 { 379 int32_t charsToConvert; 380 int32_t len; 381 382 if (wcsBuf == NULL) { 383 return mbsLen / 2; 384 } 385 386 len = charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2); 387 while (len--) { 388 /* TODO: handle surrogate pair values */ 389 *wcsBuf++ = (uint16_t)((*mbs << 8) | *(mbs + 1)); 390 mbs += 2; 391 } 392 393 if (bytesConsumed) 394 *bytesConsumed = charsToConvert * 2; 395 396 return charsToConvert; 397 } 398 399 int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize) 400 { 401 if (mbs && bufSize >= 2) { 402 /* TODO: handle surrogate pair values */ 403 *mbs = (uint8_t)(wc >> 8); 404 *(mbs + 1) = (uint8_t)(wc & 0xff); 405 } 406 return 2; 407 } 408 409 int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen, 410 uint16_t *wcsBuf, int32_t bufSizeInWideChar, 411 int32_t *bytesConsumed) 412 { 413 int32_t charsToConvert; 414 int32_t len; 415 416 if (wcsBuf == NULL) { 417 return mbsLen / 2; 418 } 419 420 len = charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2); 421 while (len--) { 422 /* TODO: handle surrogate pair values */ 423 *wcsBuf++ = (uint16_t)(*mbs | (*(mbs + 1) << 8)); 424 mbs += 2; 425 } 426 427 if (bytesConsumed) 428 *bytesConsumed = charsToConvert * 2; 429 430 return charsToConvert; 431 } 432 433 int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize) 434 { 435 if (mbs && bufSize >= 2) { 436 /* TODO: handle surrogate pair values */ 437 *mbs = (uint8_t)(wc & 0xff); 438 *(mbs + 1) = (uint8_t)(wc >> 8); 439 } 440 return 2; 441 } 442 443 #endif /* I18N_UTF8_UTF16_SUPPORT */ 444 445