Home | History | Annotate | Download | only in objmng
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <objmng/drm_i18n.h>
     18 
/*
 * Byte-range predicates. Each macro is true when c lies inside the listed
 * inclusive range(s). The argument is evaluated more than once, so never pass
 * an expression with side effects.
 *
 * NOTE(review): the GB2312 / GBK / BIG5 predicates are not referenced anywhere
 * in the visible part of this file; presumably they belong to converters that
 * are built only when the matching I18N_*_SUPPORT switch exists -- confirm
 * before removing them.
 */
#define IS_GB2312_HIGH_BYTE(c)  ((c) >= 0xA1 && (c) <= 0xF7)
#define IS_GB2312_LOW_BYTE(c)   ((c) >= 0xA1 && (c) <= 0xFE)
#define IS_GBK_HIGH_BYTE(c)     ((c) >= 0x81 && (c) <= 0xFE)
#define IS_GBK_LOW_BYTE(c)      ((c) >= 0x40 && (c) <= 0xFE && (c) != 0x7F)
#define IS_BIG5_HIGH_BYTE(c)    ((c) >= 0xA1 && (c) <= 0xF9)
#define IS_BIG5_LOW_BYTE(c)     (((c) >= 0x40 && (c) <= 0x7E) \
                                 || ((c) >= 0xA1 && (c) <= 0xFE))
/* True for values up to 127; used by the UTF-8 decoder on unsigned bytes. */
#define IS_ASCII(c)             ((c) <= 127)

/* Value stored by the UTF-8 decoder when a character cannot be decoded. */
#define INVALID_UNICODE         0xFFFD

/*
 * Build-time switches: each one enables the matching #ifdef sections in this
 * file (dispatch cases and converter definitions).
 */
#define I18N_LATIN1_SUPPORT
#define I18N_UTF8_UTF16_SUPPORT
     32 
     33 
/**
 * Convert an ISO 8859-1 (latin1) byte string to unicode: every byte becomes
 * the 16-bit character with the same value.
 * Returns the number of characters stored in wcsBuf, limited by
 * bufSizeInWideChar. When wcsBuf is NULL nothing is stored and the return
 * value is the number of characters the conversion needs.
 */
static int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to an ISO 8859-1 (latin1) byte.
 * Characters above 0xFF are replaced by '?'. The byte is stored only when
 * mbs is non-NULL and bufSize > 0; the return value is always 1.
 */
static int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-8 to unicode (16-bit characters).
 * Malformed input produces INVALID_UNICODE, as do 4-byte sequences
 * (U+10000 and above are not supported). An incomplete sequence at the very
 * end of mbs is left unconsumed so the caller can retry with more data.
 * Returns the number of characters produced; *bytesConsumed (if non-NULL)
 * receives the number of input bytes used.
 */
static int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-8 bytes.
 * Returns the number of bytes the character needs (1 to 3). The bytes are
 * stored only when mbs is non-NULL and bufSize is at least that large.
 */
static int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 BE to unicode: each byte pair is combined high byte first.
 * A trailing odd byte is not converted. Returns the number of characters
 * stored, or mbsLen / 2 based sizing when wcsBuf is NULL.
 */
static int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-16 BE bytes (high byte first).
 * Always returns 2; the bytes are stored only when mbs is non-NULL and
 * bufSize >= 2.
 */
static int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 LE to unicode: each byte pair is combined low byte first.
 * A trailing odd byte is not converted. Returns the number of characters
 * stored, or mbsLen / 2 based sizing when wcsBuf is NULL.
 */
static int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-16 LE bytes (low byte first).
 * Always returns 2; the bytes are stored only when mbs is non-NULL and
 * bufSize >= 2.
 */
static int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize);
     81 
     82 /*
     83  * see drm_i18n.h
     84  */
     85 int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset,
     86         const uint8_t *mbs, int32_t mbsLen,
     87         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
     88         int32_t *bytesConsumed)
     89 {
     90     switch (charset)
     91     {
     92 #ifdef I18N_GB2312_SUPPORT
     93         case DRM_CHARSET_GB2312:
     94             return gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
     95 #endif
     96 #ifdef I18N_GBK_SUPPORT
     97         case DRM_CHARSET_GBK:
     98             return gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
     99 #endif
    100 #ifdef I18N_BIG5_SUPPORT
    101         case DRM_CHARSET_BIG5:
    102             return big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
    103 #endif
    104 #ifdef I18N_LATIN1_SUPPORT
    105         case DRM_CHARSET_LATIN1:
    106             return latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
    107 #endif
    108 #ifdef I18N_ISO8859X_SUPPORT
    109         case DRM_CHARSET_LATIN2:
    110         case DRM_CHARSET_LATIN3:
    111         case DRM_CHARSET_LATIN4:
    112         case DRM_CHARSET_CYRILLIC:
    113         case DRM_CHARSET_ARABIC:
    114         case DRM_CHARSET_GREEK:
    115         case DRM_CHARSET_HEBREW:
    116         case DRM_CHARSET_LATIN5:
    117         case DRM_CHARSET_LATIN6:
    118         case DRM_CHARSET_THAI:
    119         case DRM_CHARSET_LATIN7:
    120         case DRM_CHARSET_LATIN8:
    121         case DRM_CHARSET_LATIN9:
    122         case DRM_CHARSET_LATIN10:
    123             return iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
    124 #endif
    125 #ifdef I18N_UTF8_UTF16_SUPPORT
    126         case DRM_CHARSET_UTF8:
    127             return utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
    128         case DRM_CHARSET_UTF16BE:
    129             return utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
    130         case DRM_CHARSET_UTF16LE:
    131             return utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
    132 #endif
    133         default:
    134             return -1;
    135     }
    136 }
    137 
    138 /*
    139  * see drm_i18n.h
    140  */
    141 int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset,
    142         const uint16_t *wcs, int32_t wcsLen,
    143         uint8_t *mbsBuf, int32_t bufSizeInByte)
    144 {
    145     int32_t (* wcToMbFunc)(uint16_t, uint8_t *, int32_t);
    146     int32_t charIndex = 0;
    147     int32_t numMultiBytes = 0;
    148 
    149     switch (charset)
    150     {
    151 #ifdef I18N_LATIN1_SUPPORT
    152         case DRM_CHARSET_LATIN1:
    153             wcToMbFunc = wcToLatin1;
    154             break;
    155 #endif
    156 #ifdef I18N_UTF8_UTF16_SUPPORT
    157         case DRM_CHARSET_UTF8:
    158             wcToMbFunc = wcToUtf8;
    159             break;
    160         case DRM_CHARSET_UTF16BE:
    161             wcToMbFunc = wcToUtf16be;
    162             break;
    163         case DRM_CHARSET_UTF16LE:
    164             wcToMbFunc = wcToUtf16le;
    165             break;
    166 #endif
    167 #ifdef I18N_ISO8859X_SUPPORT
    168         case DRM_CHARSET_LATIN2:
    169         case DRM_CHARSET_LATIN3:
    170         case DRM_CHARSET_LATIN4:
    171         case DRM_CHARSET_CYRILLIC:
    172         case DRM_CHARSET_ARABIC:
    173         case DRM_CHARSET_GREEK:
    174         case DRM_CHARSET_HEBREW:
    175         case DRM_CHARSET_LATIN5:
    176         case DRM_CHARSET_LATIN6:
    177         case DRM_CHARSET_THAI:
    178         case DRM_CHARSET_LATIN7:
    179         case DRM_CHARSET_LATIN8:
    180         case DRM_CHARSET_LATIN9:
    181         case DRM_CHARSET_LATIN10:
    182             return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte);
    183 #endif
    184         default:
    185             return -1;
    186     }
    187 
    188     if (mbsBuf) {
    189         while (numMultiBytes < bufSizeInByte && charIndex < wcsLen) {
    190             /* TODO: handle surrogate pair values here */
    191             int32_t mbLen = wcToMbFunc(wcs[charIndex],
    192                     &mbsBuf[numMultiBytes], bufSizeInByte - numMultiBytes);
    193 
    194             if (numMultiBytes + mbLen > bufSizeInByte) {
    195                 /* Insufficient buffer. Don't update numMultiBytes */
    196                 break;
    197             }
    198             charIndex++;
    199             numMultiBytes += mbLen;
    200         }
    201     } else {
    202         while (charIndex < wcsLen) {
    203             /* TODO: handle surrogate pair values here */
    204             numMultiBytes += wcToMbFunc(wcs[charIndex], NULL, 0);
    205             charIndex++;
    206         }
    207     }
    208 
    209     return numMultiBytes;
    210 }
    211 
    212 
    213 #ifdef I18N_LATIN1_SUPPORT
    214 
    215 int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
    216         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
    217         int32_t *bytesConsumed)
    218 {
    219     int32_t charsToConvert;
    220     int32_t len;
    221 
    222     if (wcsBuf == NULL) {
    223         return mbsLen;
    224     }
    225 
    226     len = charsToConvert = mbsLen > bufSizeInWideChar ? bufSizeInWideChar : mbsLen;
    227     if (len < 0)
    228         return 0;
    229     while (len--) {
    230         *wcsBuf++ = *mbs++;
    231     }
    232 
    233     if (bytesConsumed)
    234         *bytesConsumed = charsToConvert;
    235 
    236     return charsToConvert;
    237 }
    238 
    239 int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize)
    240 {
    241     uint8_t ch;
    242 
    243     if (wc < 0x100) {
    244         ch = (uint8_t)(wc & 0xff);
    245     } else {
    246         ch = '?';
    247     }
    248     if (mbs && bufSize > 0)
    249         *mbs = ch;
    250     return 1;
    251 }
    252 
    253 #endif /* I18N_LATIN1_SUPPORT */
    254 
    255 #ifdef I18N_UTF8_UTF16_SUPPORT
    256 
    257 int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
    258         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
    259         int32_t *bytesConsumed)
    260 {
    261     int32_t charsConverted = 0;
    262     int32_t i = 0;
    263     int32_t wideChar;
    264 
    265     if (wcsBuf == NULL) {
    266         /* No conversion but we're still going to calculate bytesConsumed */
    267         bufSizeInWideChar = mbsLen * 2;
    268     }
    269 
    270     while((i < mbsLen) && (charsConverted < bufSizeInWideChar)) {
    271         uint8_t ch = mbs[i];
    272         uint8_t ch2, ch3, ch4;
    273 
    274         wideChar = -1;
    275 
    276         if(IS_ASCII(ch)) {
    277             wideChar = ch;
    278             i++;
    279         } else if ((ch & 0xc0) == 0xc0) {
    280             int utfStart = i;
    281             if ((ch & 0xe0) == 0xc0) {
    282                 /* 2 byte sequence */
    283                 if (i + 1 < mbsLen && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80) {
    284                     wideChar = (uint16_t)(((ch & 0x1F) << 6) | (ch2 & 0x3F));
    285                     i += 2;
    286                 } else {
    287                     /* skip incomplete sequence */
    288                     i++;
    289                 }
    290             } else if ((ch & 0xf0) == 0xe0) {
    291                 /* 3 byte sequence */
    292                 if (i + 2 < mbsLen
    293                         && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
    294                         && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80) {
    295                     wideChar = (uint16_t)(((ch & 0x0F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F));
    296                     i += 3;
    297                 } else {
    298                     /* skip incomplete sequence (up to 2 bytes) */
    299                     i++;
    300                     if (i < mbsLen && (mbs[i] & 0xc0) == 0x80)
    301                         i++;
    302                 }
    303             } else if ((ch & 0xf8) == 0xf0) {
    304                 /* 4 byte sequence */
    305                 if (i + 3 < mbsLen
    306                         && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
    307                         && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80
    308                         && ((ch4 = mbs[i + 3]) & 0xc0) == 0x80) {
    309                     /* FIXME: we do NOT support U+10000 - U+10FFFF for now.
    310                      *        leave it as 0xFFFD. */
    311                     wideChar = INVALID_UNICODE;
    312                     i += 4;
    313                 } else {
    314                     /* skip incomplete sequence (up to 3 bytes) */
    315                     i++;
    316                     if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
    317                         i++;
    318                         if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
    319                             i++;
    320                         }
    321                     }
    322                 }
    323             } else {
    324                 /* invalid */
    325                 i++;
    326             }
    327             if (i >= mbsLen && wideChar == -1) {
    328                 /* Possible incomplete UTF-8 sequence at the end of mbs.
    329                  * Leave it to the caller.
    330                  */
    331                 i = utfStart;
    332                 break;
    333             }
    334         } else {
    335             /* invalid */
    336             i++;
    337         }
    338         if(wcsBuf) {
    339             if (wideChar == -1)
    340                 wideChar = INVALID_UNICODE;
    341             wcsBuf[charsConverted] = (uint16_t)wideChar;
    342         }
    343         charsConverted++;
    344     }
    345 
    346     if (bytesConsumed)
    347         *bytesConsumed = i;
    348 
    349     return charsConverted;
    350 }
    351 
    352 int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize)
    353 {
    354     if (wc <= 0x7f) {
    355         if (mbs && (bufSize >= 1)) {
    356             *mbs = (uint8_t)wc;
    357         }
    358         return 1;
    359     } else if (wc <= 0x7ff) {
    360         if (mbs && (bufSize >= 2)) {
    361             *mbs++ = (uint8_t)((wc >> 6) | 0xc0);
    362             *mbs = (uint8_t)((wc & 0x3f) | 0x80);
    363         }
    364         return 2;
    365     } else {
    366         if (mbs && (bufSize >= 3)) {
    367             *mbs++ = (uint8_t)((wc >> 12) | 0xe0);
    368             *mbs++ = (uint8_t)(((wc >> 6) & 0x3f)| 0x80);
    369             *mbs = (uint8_t)((wc & 0x3f) | 0x80);
    370         }
    371         return 3;
    372     }
    373 }
    374 
    375 int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
    376         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
    377         int32_t *bytesConsumed)
    378 {
    379     int32_t charsToConvert;
    380     int32_t len;
    381 
    382     if (wcsBuf == NULL) {
    383         return mbsLen / 2;
    384     }
    385 
    386     len = charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2);
    387     while (len--) {
    388         /* TODO: handle surrogate pair values */
    389         *wcsBuf++ = (uint16_t)((*mbs << 8) | *(mbs + 1));
    390         mbs += 2;
    391     }
    392 
    393     if (bytesConsumed)
    394         *bytesConsumed = charsToConvert * 2;
    395 
    396     return charsToConvert;
    397 }
    398 
    399 int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize)
    400 {
    401     if (mbs && bufSize >= 2) {
    402         /* TODO: handle surrogate pair values */
    403         *mbs = (uint8_t)(wc >> 8);
    404         *(mbs + 1) = (uint8_t)(wc & 0xff);
    405     }
    406     return 2;
    407 }
    408 
    409 int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
    410         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
    411         int32_t *bytesConsumed)
    412 {
    413     int32_t charsToConvert;
    414     int32_t len;
    415 
    416     if (wcsBuf == NULL) {
    417         return mbsLen / 2;
    418     }
    419 
    420     len = charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2);
    421     while (len--) {
    422         /* TODO: handle surrogate pair values */
    423         *wcsBuf++ = (uint16_t)(*mbs | (*(mbs + 1) << 8));
    424         mbs += 2;
    425     }
    426 
    427     if (bytesConsumed)
    428         *bytesConsumed = charsToConvert * 2;
    429 
    430     return charsToConvert;
    431 }
    432 
    433 int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize)
    434 {
    435     if (mbs && bufSize >= 2) {
    436         /* TODO: handle surrogate pair values */
    437         *mbs = (uint8_t)(wc & 0xff);
    438         *(mbs + 1) = (uint8_t)(wc >> 8);
    439     }
    440     return 2;
    441 }
    442 
    443 #endif /* I18N_UTF8_UTF16_SUPPORT */
    444 
    445