Home | History | Annotate | Download | only in bionic
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  *  * Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  *  * Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in
     12  *    the documentation and/or other materials provided with the
     13  *    distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
     22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 #include <iconv.h>
     30 
     31 #include <ctype.h>
     32 #include <endian.h>
     33 #include <errno.h>
     34 #include <stdlib.h>
     35 #include <string.h>
     36 #include <uchar.h>
     37 
     38 #include "private/bionic_mbstate.h"
     39 
     40 #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
     41 
     42 // Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
     43 // equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
     44 // here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
     45 enum Encoding {
     46   US_ASCII,
     47   UTF_8,
     48   UTF_16_LE,
     49   UTF_16_BE,
     50   UTF_32_LE,
     51   UTF_32_BE,
     52   WCHAR_T,
     53 };
     54 
     55 enum Mode {
     56   ERROR,
     57   IGNORE,
     58   TRANSLIT,
     59 };
     60 
     61 // This matching is strange but true.
     62 // See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
     63 static bool __match_encoding(const char* lhs, const char* rhs) {
     64   while (*lhs && *rhs) {
     65     // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
     66     // Also implement the "delete each 0 that is not preceded by a digit" rule.
     67     for (; *lhs; ++lhs) {
     68       if (isalnum(*lhs) && (*lhs != '0' || !isdigit(*(lhs + 1)))) break;
     69     }
     70     // Case doesn't matter either.
     71     if (tolower(*lhs) != tolower(*rhs)) break;
     72     ++lhs;
     73     ++rhs;
     74   }
     75   // As a special case we treat the GNU "//" extensions as end of string.
     76   if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true;
     77   return false;
     78 }
     79 
     80 static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
     81   const char* suffix = strstr(s, "//");
     82   if (suffix) {
     83     if (!mode) return false;
     84     if (strcmp(suffix, "//IGNORE") == 0) {
     85       *mode = IGNORE;
     86     } else if (strcmp(suffix, "//TRANSLIT") == 0) {
     87       *mode = TRANSLIT;
     88     } else {
     89       return false;
     90     }
     91   }
     92   if (__match_encoding(s, "utf8")) {
     93     *encoding = UTF_8;
     94   } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
     95     *encoding = US_ASCII;
     96   } else if (__match_encoding(s, "utf16le")) {
     97     *encoding = UTF_16_LE;
     98   } else if (__match_encoding(s, "utf16be")) {
     99     *encoding = UTF_16_BE;
    100   } else if (__match_encoding(s, "utf32le")) {
    101     *encoding = UTF_32_LE;
    102   } else if (__match_encoding(s, "utf32be")) {
    103     *encoding = UTF_32_BE;
    104   } else if (__match_encoding(s, "wchart")) {
    105     *encoding = WCHAR_T;
    106   } else {
    107     return false;
    108   }
    109   return true;
    110 }
    111 
    112 struct __iconv_t {
    113   Encoding src_encoding;
    114   Encoding dst_encoding;
    115   Mode mode;
    116 
    117   __iconv_t() : mode(ERROR) {
    118   }
    119 
    120   int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
    121     // Reset state.
    122     wc = 0;
    123     memset(&ps, 0, sizeof(ps));
    124     replacement_count = 0;
    125     ignored = false;
    126     src_buf = src_buf0;
    127     src_bytes_left = src_bytes_left0;
    128     dst_buf = dst_buf0;
    129     dst_bytes_left = dst_bytes_left0;
    130 
    131     while (*src_bytes_left > 0) {
    132       if (!GetNext() || !Convert()) return -1;
    133     }
    134     return Done();
    135   }
    136 
    137  private:
    138   char32_t wc;
    139   char buf[16];
    140   size_t src_bytes_used;
    141   size_t dst_bytes_used;
    142   mbstate_t ps;
    143 
    144   size_t replacement_count;
    145   bool ignored;
    146 
    147   char** src_buf;
    148   size_t* src_bytes_left;
    149   char** dst_buf;
    150   size_t* dst_bytes_left;
    151 
    152   bool GetNext() {
    153     errno = 0;
    154     switch (src_encoding) {
    155       case US_ASCII:
    156         wc = **src_buf;
    157         src_bytes_used = 1;
    158         if (wc > 0x7f) errno = EILSEQ;
    159         break;
    160 
    161       case UTF_8:
    162         src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
    163         if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
    164           break;  // EILSEQ already set.
    165         } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
    166           errno = EINVAL;
    167           return false;
    168         }
    169         break;
    170 
    171       case UTF_16_BE:
    172       case UTF_16_LE: {
    173         if (*src_bytes_left < 2) {
    174           errno = EINVAL;
    175           return false;
    176         }
    177         bool swap = (src_encoding == UTF_16_BE);
    178         wc = In16(*src_buf, swap);
    179         // 0xd800-0xdbff: high surrogates
    180         // 0xdc00-0xdfff: low surrogates
    181         if (wc >= 0xd800 && wc <= 0xdfff) {
    182           if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
    183             errno = EILSEQ;
    184             return false;
    185           }
    186           if (*src_bytes_left < 4) {
    187             errno = EINVAL;
    188             return false;
    189           }
    190           uint16_t hi = wc;
    191           uint16_t lo = In16(*src_buf + 2, swap);
    192           wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
    193           src_bytes_used = 4;
    194         }
    195         break;
    196       }
    197 
    198       case UTF_32_BE:
    199       case UTF_32_LE:
    200       case WCHAR_T:
    201         if (*src_bytes_left < 4) {
    202           errno = EINVAL;
    203           return false;
    204         }
    205         wc = In32(*src_buf, (src_encoding == UTF_32_BE));
    206         break;
    207     }
    208 
    209     if (errno == EILSEQ) {
    210       switch (mode) {
    211         case ERROR:
    212           return false;
    213         case IGNORE:
    214           *src_buf += src_bytes_used;
    215           *src_bytes_left -= src_bytes_used;
    216           ignored = true;
    217           return GetNext();
    218         case TRANSLIT:
    219           wc = '?';
    220           ++replacement_count;
    221           return true;
    222       }
    223     }
    224     return true;
    225   }
    226 
    227   bool Convert() {
    228     errno = 0;
    229     switch (dst_encoding) {
    230       case US_ASCII:
    231         buf[0] = wc;
    232         dst_bytes_used = 1;
    233         if (wc > 0x7f) errno = EILSEQ;
    234         break;
    235 
    236       case UTF_8:
    237         dst_bytes_used = c32rtomb(buf, wc, &ps);
    238         if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
    239           break;  // EILSEQ already set.
    240         } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
    241           errno = EINVAL;
    242           return false;
    243         }
    244         break;
    245 
    246       case UTF_16_BE:
    247       case UTF_16_LE: {
    248         bool swap = (dst_encoding == UTF_16_BE);
    249         if (wc < 0x10000) {  // BMP.
    250           Out16(buf, wc, swap);
    251         } else {  // Supplementary plane; output surrogate pair.
    252           wc -= 0x10000;
    253           char16_t hi = 0xd800 | (wc >> 10);
    254           char16_t lo = 0xdc00 | (wc & 0x3ff);
    255           Out16(buf + 0, hi, swap);
    256           Out16(buf + 2, lo, swap);
    257           dst_bytes_used = 4;
    258         }
    259       } break;
    260 
    261       case UTF_32_BE:
    262       case UTF_32_LE:
    263       case WCHAR_T:
    264         Out32(wc, (dst_encoding == UTF_32_BE));
    265         break;
    266     }
    267 
    268     if (errno == EILSEQ) {
    269       if (mode == IGNORE) {
    270         *src_buf += src_bytes_used;
    271         *src_bytes_left -= src_bytes_used;
    272         ignored = true;
    273         return true;
    274       } else if (mode == TRANSLIT) {
    275         wc = '?';
    276         ++replacement_count;
    277         return Convert();
    278       }
    279       return false;
    280     }
    281 
    282     return Emit();
    283   }
    284 
    285   uint16_t In16(const char* buf, bool swap) {
    286     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
    287     uint16_t wc = (src[0]) | (src[1] << 8);
    288     if (swap) wc = __swap16(wc);
    289     src_bytes_used = 2;
    290     return wc;
    291   }
    292 
    293   uint32_t In32(const char* buf, bool swap) {
    294     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
    295     uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
    296     if (swap) wc = __swap32(wc);
    297     src_bytes_used = 4;
    298     return wc;
    299   }
    300 
    301   void Out16(char* dst, char16_t ch, bool swap) {
    302     if (swap) ch = __swap16(ch);
    303     dst[0] = ch;
    304     dst[1] = ch >> 8;
    305     dst_bytes_used = 2;
    306   }
    307 
    308   void Out32(char32_t ch, bool swap) {
    309     if (swap) ch = __swap32(ch);
    310     buf[0] = ch;
    311     buf[1] = ch >> 8;
    312     buf[2] = ch >> 16;
    313     buf[3] = ch >> 24;
    314     dst_bytes_used = 4;
    315   }
    316 
    317   bool Emit() {
    318     if (dst_bytes_used > *dst_bytes_left) {
    319       errno = E2BIG;
    320       return false;
    321     }
    322 
    323     memcpy(*dst_buf, buf, dst_bytes_used);
    324     *src_buf += src_bytes_used;
    325     *src_bytes_left -= src_bytes_used;
    326     *dst_buf += dst_bytes_used;
    327     *dst_bytes_left -= dst_bytes_used;
    328     return true;
    329   }
    330 
    331   int Done() {
    332     if (mode == TRANSLIT) return replacement_count;
    333     if (ignored) {
    334       errno = EILSEQ;
    335       return -1;
    336     }
    337     return 0;
    338   }
    339 };
    340 
    341 iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
    342   iconv_t result = new __iconv_t;
    343   if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) ||
    344       !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
    345     delete result;
    346     errno = EINVAL;
    347     return INVALID_ICONV_T;
    348   }
    349   return result;
    350 }
    351 
    352 size_t iconv(iconv_t __converter,
    353              char** __src_buf, size_t* __src_bytes_left,
    354              char** __dst_buf, size_t* __dst_bytes_left) {
    355   if (__converter == INVALID_ICONV_T) {
    356     errno = EBADF;
    357     return -1;
    358   }
    359   return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
    360 }
    361 
    362 int iconv_close(iconv_t __converter) {
    363   if (__converter == INVALID_ICONV_T) {
    364     errno = EBADF;
    365     return -1;
    366   }
    367   delete __converter;
    368   return 0;
    369 }
    370