// libc/bionic/iconv.cpp

/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <iconv.h>

#include <ctype.h>
#include <endian.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <uchar.h>

#include "private/bionic_mbstate.h"

// The sentinel iconv_open() returns on failure. iconv() and iconv_close() reject it
// with EBADF rather than dereferencing it.
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)

// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
//
// The encodings a converter can read from or write to. The accepted spellings of each
// name are decided by __parse_encoding().
enum Encoding {
  US_ASCII,   // One byte per character; values above 0x7f are rejected with EILSEQ.
  UTF_8,      // Decoded with mbrtoc32() and encoded with c32rtomb().
  UTF_16_LE,  // 16-bit units, least significant byte first, with surrogate pairs.
  UTF_16_BE,  // 16-bit units, most significant byte first, with surrogate pairs.
  UTF_32_LE,  // One 32-bit unit per character, least significant byte first.
  UTF_32_BE,  // One 32-bit unit per character, most significant byte first.
  WCHAR_T,    // Read and written exactly like UTF_32_LE.
};

// What a converter does with a character that is invalid in the source encoding or
// cannot be represented in the destination encoding.
enum Mode {
  ERROR,     // The default: stop converting and fail with EILSEQ.
  IGNORE,    // "//IGNORE" suffix: drop the character and carry on; the call still ends with -1/EILSEQ.
  TRANSLIT,  // "//TRANSLIT" suffix: substitute '?' and report how many substitutions were made.
};

// Returns true if the caller-supplied encoding name `lhs` refers to the same encoding
// as `rhs`. `rhs` must already be in canonical form (lowercase letters and digits only,
// no redundant zeros); `lhs` may use any case and punctuation, and may be followed by a
// GNU-style "//" suffix.
//
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // True if the last character of lhs that took part in the comparison was a digit.
  // Only zeros that are *not* preceded by a digit are insignificant: "utf-08" is
  // "utf8", but "utf-106le" must not be mistaken for "utf16le".
  bool after_digit = false;
  while (*lhs && *rhs) {
    // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
    // Also implement the "delete each 0 that is not preceded by a digit" rule.
    for (; *lhs; ++lhs) {
      // The <ctype.h> functions are only defined for unsigned char values (and EOF),
      // so never hand them a possibly-negative plain char.
      const unsigned char ch = *lhs;
      if (!isalnum(ch)) {
        // A separator starts a fresh run of digits, so "8859-01" still drops its zero.
        after_digit = false;
        continue;
      }
      const unsigned char next = lhs[1];
      if (ch == '0' && !after_digit && isdigit(next)) continue;
      break;
    }
    // Case doesn't matter either.
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;
    after_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true;
  return false;
}

static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
  const char* suffix = strstr(s, "//");
  if (suffix) {
    if (!mode) return false;
    if (strcmp(suffix, "//IGNORE") == 0) {
      *mode = IGNORE;
    } else if (strcmp(suffix, "//TRANSLIT") == 0) {
      *mode = TRANSLIT;
    } else {
      return false;
    }
  }
  if (__match_encoding(s, "utf8")) {
    *encoding = UTF_8;
  } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
    *encoding = US_ASCII;
  } else if (__match_encoding(s, "utf16le")) {
    *encoding = UTF_16_LE;
  } else if (__match_encoding(s, "utf16be")) {
    *encoding = UTF_16_BE;
  } else if (__match_encoding(s, "utf32le")) {
    *encoding = UTF_32_LE;
  } else if (__match_encoding(s, "utf32be")) {
    *encoding = UTF_32_BE;
  } else if (__match_encoding(s, "wchart")) {
    *encoding = WCHAR_T;
  } else {
    return false;
  }
  return true;
}

// The object behind an iconv_t: the source/destination encodings and error-handling
// mode chosen by iconv_open(), plus the scratch state used while one iconv() call is
// in progress.
struct __iconv_t {
  Encoding src_encoding;
  Encoding dst_encoding;
  Mode mode;

  __iconv_t() : mode(ERROR) {
  }

  // Converts characters from *src_buf0 into *dst_buf0 until the source is exhausted or
  // an error occurs. Both pointers are advanced and both byte counts decremented for
  // every character that was consumed, so on failure they describe exactly how far the
  // conversion got.
  //
  // Returns -1 with errno set on failure: EILSEQ for an invalid or unrepresentable
  // character, EINVAL for a truncated character at the end of the source, E2BIG when
  // the destination is full. Otherwise returns the number of '?' substitutions made in
  // TRANSLIT mode, or 0. In IGNORE mode the whole input is processed, but the result is
  // still -1/EILSEQ if anything had to be dropped.
  int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
    // Reset state.
    wc = 0;
    have_wc = false;
    src_bytes_used = 0;
    dst_bytes_used = 0;
    memset(&ps, 0, sizeof(ps));
    replacement_count = 0;
    ignored = false;
    src_buf = src_buf0;
    src_bytes_left = src_bytes_left0;
    dst_buf = dst_buf0;
    dst_bytes_left = dst_bytes_left0;

    while (*src_bytes_left > 0) {
      if (!GetNext()) return -1;
      // In IGNORE mode GetNext() may have consumed an invalid sequence without
      // producing a character; go round again so the remaining length is re-checked
      // before anything else is read.
      if (have_wc && !Convert()) return -1;
    }
    return Done();
  }

 private:
  char32_t wc;            // The character currently being converted.
  bool have_wc;           // False if GetNext() skipped input instead of decoding into `wc`.
  char buf[16];           // The encoded form of `wc`, staged here until Emit() copies it out.
  size_t src_bytes_used;  // Length in bytes of the current character in the source.
  size_t dst_bytes_used;  // Number of valid bytes in `buf`.
  mbstate_t ps;           // Conversion state handed to mbrtoc32()/c32rtomb().

  size_t replacement_count;  // Number of '?' substitutions made (TRANSLIT mode).
  bool ignored;              // Whether any character was dropped (IGNORE mode).

  // The caller's buffers, updated in place as the conversion progresses.
  char** src_buf;
  size_t* src_bytes_left;
  char** dst_buf;
  size_t* dst_bytes_left;

  // Decodes the next source character into `wc` and records its length in
  // `src_bytes_used`. The source is not consumed here (Emit() does that) unless the
  // character is being dropped in IGNORE mode, in which case `have_wc` is cleared.
  // Requires *src_bytes_left > 0. Returns false with errno set on error.
  bool GetNext() {
    errno = 0;
    have_wc = true;
    switch (src_encoding) {
      case US_ASCII:
        wc = static_cast<uint8_t>(**src_buf);
        src_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
        if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          // EILSEQ already set. The return value is an error marker rather than a
          // length, so arrange for IGNORE/TRANSLIT to step over exactly the offending
          // byte, and start the next character from a clean conversion state.
          src_bytes_used = 1;
          memset(&ps, 0, sizeof(ps));
        } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        } else if (src_bytes_used == 0) {
          // mbrtoc32() returns 0, not 1, for the NUL character; it still occupies a byte.
          src_bytes_used = 1;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        if (*src_bytes_left < 2) {
          errno = EINVAL;
          return false;
        }
        bool swap = (src_encoding == UTF_16_BE);
        wc = In16(*src_buf, swap);
        // 0xd800-0xdbff: high surrogates
        // 0xdc00-0xdfff: low surrogates
        if (wc >= 0xd800 && wc <= 0xdfff) {
          if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
            errno = EILSEQ;
            return false;
          }
          if (*src_bytes_left < 4) {
            errno = EINVAL;
            return false;
          }
          uint16_t hi = wc;
          uint16_t lo = In16(*src_buf + 2, swap);
          if (lo < 0xdc00 || lo > 0xdfff) {  // High surrogate without a low surrogate.
            errno = EILSEQ;
            return false;
          }
          wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
          src_bytes_used = 4;
        }
        break;
      }

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        if (*src_bytes_left < 4) {
          errno = EINVAL;
          return false;
        }
        wc = In32(*src_buf, (src_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      switch (mode) {
        case ERROR:
          return false;
        case IGNORE:
          // Drop the bad input and let the caller's loop fetch the next character, if
          // there is one.
          *src_buf += src_bytes_used;
          *src_bytes_left -= src_bytes_used;
          ignored = true;
          have_wc = false;
          return true;
        case TRANSLIT:
          wc = '?';
          ++replacement_count;
          return true;
      }
    }
    return true;
  }

  // Encodes `wc` in the destination encoding and emits it. A character the destination
  // can't represent is dropped (IGNORE), replaced by '?' (TRANSLIT) or reported as
  // EILSEQ (ERROR). Returns false with errno set on error.
  bool Convert() {
    errno = 0;
    switch (dst_encoding) {
      case US_ASCII:
        buf[0] = wc;
        dst_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        dst_bytes_used = c32rtomb(buf, wc, &ps);
        if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          break;  // EILSEQ already set.
        } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        bool swap = (dst_encoding == UTF_16_BE);
        if (wc < 0x10000) {  // BMP.
          Out16(buf, wc, swap);
        } else {  // Supplementary plane; output surrogate pair.
          wc -= 0x10000;
          char16_t hi = 0xd800 | (wc >> 10);
          char16_t lo = 0xdc00 | (wc & 0x3ff);
          Out16(buf + 0, hi, swap);
          Out16(buf + 2, lo, swap);
          dst_bytes_used = 4;
        }
      } break;

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        Out32(wc, (dst_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      if (mode == IGNORE) {
        *src_buf += src_bytes_used;
        *src_bytes_left -= src_bytes_used;
        ignored = true;
        return true;
      } else if (mode == TRANSLIT) {
        // '?' is representable in every destination encoding, so this recurses once.
        wc = '?';
        ++replacement_count;
        return Convert();
      }
      return false;
    }

    return Emit();
  }

  // Reads a 16-bit unit stored least significant byte first at `p`, byte-swapping the
  // result if `swap` is set. Sets src_bytes_used to 2.
  uint16_t In16(const char* p, bool swap) {
    const uint8_t* src = reinterpret_cast<const uint8_t*>(p);
    uint16_t unit = (src[0]) | (src[1] << 8);
    if (swap) unit = __swap16(unit);
    src_bytes_used = 2;
    return unit;
  }

  // Reads a 32-bit unit stored least significant byte first at `p`, byte-swapping the
  // result if `swap` is set. Sets src_bytes_used to 4.
  uint32_t In32(const char* p, bool swap) {
    const uint8_t* src = reinterpret_cast<const uint8_t*>(p);
    // Widen before shifting: a promoted (signed) int can't hold src[3] << 24.
    uint32_t unit = static_cast<uint32_t>(src[0]) |
                    (static_cast<uint32_t>(src[1]) << 8) |
                    (static_cast<uint32_t>(src[2]) << 16) |
                    (static_cast<uint32_t>(src[3]) << 24);
    if (swap) unit = __swap32(unit);
    src_bytes_used = 4;
    return unit;
  }

  // Stores `ch` at `dst` least significant byte first, byte-swapping it first if `swap`
  // is set. Sets dst_bytes_used to 2.
  void Out16(char* dst, char16_t ch, bool swap) {
    if (swap) ch = __swap16(ch);
    dst[0] = ch;
    dst[1] = ch >> 8;
    dst_bytes_used = 2;
  }

  // Stores `ch` in `buf` least significant byte first, byte-swapping it first if `swap`
  // is set. Sets dst_bytes_used to 4.
  void Out32(char32_t ch, bool swap) {
    if (swap) ch = __swap32(ch);
    buf[0] = ch;
    buf[1] = ch >> 8;
    buf[2] = ch >> 16;
    buf[3] = ch >> 24;
    dst_bytes_used = 4;
  }

  // Copies the staged bytes to the destination and consumes the current character from
  // the source. Fails with E2BIG, leaving all four caller values untouched, if the
  // destination has too little room.
  bool Emit() {
    if (dst_bytes_used > *dst_bytes_left) {
      errno = E2BIG;
      return false;
    }

    memcpy(*dst_buf, buf, dst_bytes_used);
    *src_buf += src_bytes_used;
    *src_bytes_left -= src_bytes_used;
    *dst_buf += dst_bytes_used;
    *dst_bytes_left -= dst_bytes_used;
    return true;
  }

  // Computes the result of a conversion that consumed the whole source.
  int Done() {
    if (mode == TRANSLIT) return replacement_count;
    if (ignored) {
      errno = EILSEQ;
      return -1;
    }
    return 0;
  }
};

// Creates a converter from __src_encoding to __dst_encoding. Only the destination name
// may carry a "//IGNORE" or "//TRANSLIT" suffix. Returns INVALID_ICONV_T with errno set
// to EINVAL if either name is not recognized.
iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
  iconv_t converter = new __iconv_t;

  // The source is parsed first; the destination is only looked at if the source is valid.
  const bool src_ok = __parse_encoding(__src_encoding, &converter->src_encoding, nullptr);
  const bool both_ok =
      src_ok && __parse_encoding(__dst_encoding, &converter->dst_encoding, &converter->mode);
  if (both_ok) return converter;

  delete converter;
  errno = EINVAL;
  return INVALID_ICONV_T;
}

// Converts the text at *__src_buf into *__dst_buf using __converter, updating both
// pointers and both byte counts to reflect what was consumed and produced.
//
// Returns (size_t)-1 with errno set on failure (EBADF for an invalid converter,
// otherwise whatever the conversion reports), and the conversion's result on success.
// A null __src_buf or *__src_buf is a request to reset the converter, which succeeds
// with 0 without touching any buffer.
size_t iconv(iconv_t __converter,
             char** __src_buf, size_t* __src_bytes_left,
             char** __dst_buf, size_t* __dst_bytes_left) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return -1;
  }

  // POSIX lets callers pass a null source to reset or flush the conversion state.
  // Every conversion here starts from a freshly reset state and nothing is carried
  // over between calls, so there is nothing to do, and the null pointers must not be
  // dereferenced.
  if (__src_buf == nullptr || *__src_buf == nullptr) return 0;

  return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
}

// Destroys a converter created by iconv_open(). Returns 0 on success, or -1 with errno
// set to EBADF if handed the failure sentinel instead of a real converter.
int iconv_close(iconv_t __converter) {
  const bool is_real_converter = (__converter != INVALID_ICONV_T);
  if (is_real_converter) {
    delete __converter;
    return 0;
  }
  errno = EBADF;
  return -1;
}