Home | History | Annotate | Download | only in tests
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <gtest/gtest.h>
     18 
     19 #include <iconv.h>
     20 
// Sentinel the tests below compare against iconv_open() results (and pass to
// iconv()/iconv_close()) to represent a failed/invalid conversion descriptor.
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
     22 
     23 TEST(iconv, iconv_open_EINVAL) {
     24   errno = 0;
     25   ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "silly"));
     26   ASSERT_EQ(EINVAL, errno);
     27   errno = 0;
     28   ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "UTF-8"));
     29   ASSERT_EQ(EINVAL, errno);
     30   errno = 0;
     31   ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "silly"));
     32   ASSERT_EQ(EINVAL, errno);
     33 }
     34 
     35 TEST(iconv, iconv_open_comparator) {
     36   // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
     37   // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..."
     38   iconv_t c;
     39   ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "utf8"));
     40   ASSERT_EQ(0, iconv_close(c));
     41   ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "u.t.f-008"));
     42   ASSERT_EQ(0, iconv_close(c));
     43 
     44   // "...but not "utf-80" or "ut8"."
     45   errno = 0;
     46   ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "utf-80"));
     47   ASSERT_EQ(EINVAL, errno);
     48   errno = 0;
     49   ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut80"));
     50   ASSERT_EQ(EINVAL, errno);
     51 }
     52 
     53 TEST(iconv, iconv_smoke) {
     54   const char* utf8 = "a"; // U+0666  0xd9 0xa6 // U+1100  0xe1 0x84 0x80
     55   char buf[BUFSIZ] = {};
     56 
     57   iconv_t c = iconv_open("UTF-32LE", "UTF-8");
     58   ASSERT_NE(INVALID_ICONV_T, c);
     59 
     60   char* in = const_cast<char*>(utf8);
     61   size_t in_bytes = strlen(in);
     62 
     63   char* out = buf;
     64   size_t out_bytes = sizeof(buf);
     65 
     66   EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes));
     67 
     68   wchar_t* utf16 = reinterpret_cast<wchar_t*>(buf);
     69   EXPECT_EQ(L'a', utf16[0]);
     70   EXPECT_EQ(L'', utf16[1]);
     71   EXPECT_EQ(L'', utf16[2]);
     72   EXPECT_EQ(L'\0', utf16[3]);
     73   EXPECT_EQ(0U, in_bytes);
     74   EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), out_bytes);
     75 
     76   ASSERT_EQ(0, iconv_close(c));
     77 }
     78 
     79 TEST(iconv, iconv_lossy_TRANSLIT) {
     80   const char* utf8 = "az"; // U+0666  0xd9 0xa6 // U+1100  0xe1 0x84 0x80
     81   char buf[BUFSIZ] = {};
     82 
     83   iconv_t c = iconv_open("ASCII//TRANSLIT", "UTF-8");
     84   ASSERT_NE(INVALID_ICONV_T, c);
     85 
     86   char* in = const_cast<char*>(utf8);
     87   size_t in_bytes = strlen(in);
     88 
     89   char* out = buf;
     90   size_t out_bytes = sizeof(buf);
     91 
     92   // Two of the input characters (5 input bytes) aren't representable as ASCII.
     93   // With "//TRANSLIT", we use a replacement character, and report the number
     94   // of replacements.
     95   EXPECT_EQ(2U, iconv(c, &in, &in_bytes, &out, &out_bytes));
     96 
     97   EXPECT_EQ('a', buf[0]);
     98   EXPECT_EQ('?', buf[1]);
     99   EXPECT_EQ('?', buf[2]);
    100   EXPECT_EQ('z', buf[3]);
    101   EXPECT_EQ(0, buf[4]);
    102   EXPECT_EQ(0U, in_bytes);
    103   EXPECT_EQ(sizeof(buf) - 4, out_bytes);
    104 
    105   ASSERT_EQ(0, iconv_close(c));
    106 }
    107 
    108 TEST(iconv, iconv_lossy_IGNORE) {
    109   const char* utf8 = "az"; // U+0666  0xd9 0xa6 // U+1100  0xe1 0x84 0x80
    110   char buf[BUFSIZ] = {};
    111 
    112   iconv_t c = iconv_open("ASCII//IGNORE", "UTF-8");
    113   ASSERT_NE(INVALID_ICONV_T, c);
    114 
    115   char* in = const_cast<char*>(utf8);
    116   size_t in_bytes = strlen(in);
    117 
    118   char* out = buf;
    119   size_t out_bytes = sizeof(buf);
    120 
    121   // Two of the input characters (5 input bytes) aren't representable as ASCII.
    122   // With "//IGNORE", we just skip them (but return failure).
    123   errno = 0;
    124   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    125   EXPECT_EQ(EILSEQ, errno);
    126 
    127   EXPECT_EQ('a', buf[0]);
    128   EXPECT_EQ('z', buf[1]);
    129   EXPECT_EQ(0, buf[2]);
    130   EXPECT_EQ(0U, in_bytes);
    131   EXPECT_EQ(sizeof(buf) - 2, out_bytes);
    132 
    133   ASSERT_EQ(0, iconv_close(c));
    134 }
    135 
    136 TEST(iconv, iconv_lossy) {
    137   const char* utf8 = "az"; // U+0666  0xd9 0xa6 // U+1100  0xe1 0x84 0x80
    138   char buf[BUFSIZ] = {};
    139 
    140   iconv_t c = iconv_open("ASCII", "UTF-8");
    141   ASSERT_NE(INVALID_ICONV_T, c);
    142 
    143   char* in = const_cast<char*>(utf8);
    144   size_t in_bytes = strlen(in);
    145 
    146   char* out = buf;
    147   size_t out_bytes = sizeof(buf);
    148 
    149   // The second input character isn't representable as ASCII, so we stop there.
    150   errno = 0;
    151   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    152   EXPECT_EQ(EILSEQ, errno);
    153 
    154   EXPECT_EQ('a', buf[0]);
    155   EXPECT_EQ(0, buf[1]);
    156   EXPECT_EQ(6U, in_bytes); // Two bytes for , three bytes for , and one byte for z.
    157   EXPECT_EQ(sizeof(buf) - 1, out_bytes);
    158 
    159   ASSERT_EQ(0, iconv_close(c));
    160 }
    161 
    162 TEST(iconv, iconv_malformed_sequence_EILSEQ) {
    163   const char* utf8 = "a\xd9z"; // 0xd9 is the first byte of the two-byte U+0666 .
    164   char buf[BUFSIZ] = {};
    165 
    166   iconv_t c = iconv_open("UTF-8", "UTF-8");
    167   ASSERT_NE(INVALID_ICONV_T, c);
    168 
    169   char* in = const_cast<char*>(utf8);
    170   size_t in_bytes = strlen(in);
    171 
    172   char* out = buf;
    173   size_t out_bytes = sizeof(buf);
    174 
    175   // The second input byte is a malformed character, so we stop there.
    176   errno = 0;
    177   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    178   EXPECT_EQ(EILSEQ, errno);
    179   EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the invalid sequence.
    180   ++in;
    181   --in_bytes;
    182   errno = 0;
    183   EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes));
    184   EXPECT_EQ(0, errno);
    185 
    186   EXPECT_EQ('a', buf[0]);
    187   EXPECT_EQ('z', buf[1]);
    188   EXPECT_EQ(0, buf[2]);
    189   EXPECT_EQ(0U, in_bytes);
    190   EXPECT_EQ(sizeof(buf) - 2, out_bytes);
    191 
    192   ASSERT_EQ(0, iconv_close(c));
    193 }
    194 
    195 TEST(iconv, iconv_incomplete_sequence_EINVAL) {
    196   const char* utf8 = "a\xd9"; // 0xd9 is the first byte of the two-byte U+0666 .
    197   char buf[BUFSIZ] = {};
    198 
    199   iconv_t c = iconv_open("UTF-8", "UTF-8");
    200   ASSERT_NE(INVALID_ICONV_T, c);
    201 
    202   char* in = const_cast<char*>(utf8);
    203   size_t in_bytes = strlen(in);
    204 
    205   char* out = buf;
    206   size_t out_bytes = sizeof(buf);
    207 
    208   // The second input byte is just the start of a character, and we don't have any more bytes.
    209   errno = 0;
    210   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    211   EXPECT_EQ(EINVAL, errno);
    212   EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the incomplete sequence.
    213 
    214   EXPECT_EQ('a', buf[0]);
    215   EXPECT_EQ(0, buf[1]);
    216   EXPECT_EQ(1U, in_bytes);
    217   EXPECT_EQ(sizeof(buf) - 1, out_bytes);
    218 
    219   ASSERT_EQ(0, iconv_close(c));
    220 }
    221 
    222 TEST(iconv, iconv_E2BIG) {
    223   const char* utf8 = "abc";
    224   char buf[BUFSIZ] = {};
    225 
    226   iconv_t c = iconv_open("UTF-8", "UTF-8");
    227   ASSERT_NE(INVALID_ICONV_T, c);
    228 
    229   char* in = const_cast<char*>(utf8);
    230   size_t in_bytes = strlen(in);
    231 
    232   char* out = buf;
    233   size_t out_bytes = 1;
    234 
    235   // We need three bytes, so one isn't enough (but we will make progress).
    236   out_bytes = 1;
    237   errno = 0;
    238   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    239   EXPECT_EQ(E2BIG, errno);
    240   EXPECT_EQ(2U, in_bytes);
    241   EXPECT_EQ(0U, out_bytes);
    242 
    243   // Two bytes left, so zero isn't enough (and we can't even make progress).
    244   out_bytes = 0;
    245   errno = 0;
    246   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    247   EXPECT_EQ(E2BIG, errno);
    248   EXPECT_EQ(2U, in_bytes);
    249   EXPECT_EQ(0U, out_bytes);
    250 
    251   // Two bytes left, so one isn't enough (but we will make progress).
    252   out_bytes = 1;
    253   errno = 0;
    254   EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    255   EXPECT_EQ(E2BIG, errno);
    256   EXPECT_EQ(1U, in_bytes);
    257   EXPECT_EQ(0U, out_bytes);
    258 
    259   // One byte left, so one byte is now enough.
    260   out_bytes = 1;
    261   errno = 0;
    262   EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes));
    263   EXPECT_EQ(0, errno);
    264   EXPECT_EQ(0U, in_bytes);
    265   EXPECT_EQ(0U, out_bytes);
    266 
    267   EXPECT_EQ('a', buf[0]);
    268   EXPECT_EQ('b', buf[1]);
    269   EXPECT_EQ('c', buf[2]);
    270   EXPECT_EQ(0, buf[3]);
    271 
    272   ASSERT_EQ(0, iconv_close(c));
    273 }
    274 
    275 TEST(iconv, iconv_invalid_converter_EBADF) {
    276   char* in = nullptr;
    277   char* out = nullptr;
    278   size_t in_bytes = 0;
    279   size_t out_bytes = 0;
    280   errno = 0;
    281   ASSERT_EQ(static_cast<size_t>(-1), iconv(INVALID_ICONV_T, &in, &in_bytes, &out, &out_bytes));
    282   ASSERT_EQ(EBADF, errno);
    283 }
    284 
    285 TEST(iconv, iconv_close_invalid_converter_EBADF) {
    286   errno = 0;
    287   ASSERT_EQ(-1, iconv_close(INVALID_ICONV_T));
    288   ASSERT_EQ(EBADF, errno);
    289 }
    290 
    291 static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) {
    292   // Examples from https://en.wikipedia.org/wiki/UTF-16.
    293   const char* utf8 = "$"; // U+0024, U+20AC, U+10437.
    294 
    295   iconv_t c = iconv_open(dst_enc, "UTF-8");
    296   ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
    297 
    298   char* in = const_cast<char*>(utf8);
    299   size_t in_bytes = strlen(utf8);
    300   char buf[BUFSIZ] = {};
    301   char* out = buf;
    302   size_t out_bytes = sizeof(buf);
    303   size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes);
    304 
    305   // Check we got the bytes we were expecting.
    306   for (size_t i = 0; i < n; ++i) {
    307     EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc;
    308   }
    309 
    310   ASSERT_EQ(0, iconv_close(c));
    311 
    312   // We can't round-trip if there were replacements.
    313   if (strstr(dst_enc, "ascii")) {
    314     GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n";
    315     return;
    316   }
    317   ASSERT_EQ(0U, replacement_count);
    318 
    319   c = iconv_open("UTF-8", dst_enc);
    320   ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
    321 
    322   in = buf;
    323   in_bytes = n;
    324   char buf2[BUFSIZ] = {};
    325   out = buf2;
    326   out_bytes = sizeof(buf2);
    327   iconv(c, &in, &in_bytes, &out, &out_bytes);
    328 
    329   ASSERT_STREQ(utf8, buf2) << dst_enc;
    330 
    331   ASSERT_EQ(0, iconv_close(c));
    332 }
    333 
    334 TEST(iconv, iconv_round_trip_ascii) {
    335   RoundTrip("ascii//TRANSLIT", "$??", 3);
    336 }
    337 
    338 TEST(iconv, iconv_round_trip_utf8) {
    339   RoundTrip("utf8", "\x24\xe2\x82\xac\xf0\x90\x90\xb7", 8);
    340 }
    341 
    342 TEST(iconv, iconv_round_trip_utf16be) {
    343   RoundTrip("utf16be", "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37", 8);
    344 }
    345 
    346 TEST(iconv, iconv_round_trip_utf16le) {
    347   RoundTrip("utf16le", "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc", 8);
    348 }
    349 
    350 TEST(iconv, iconv_round_trip_utf32be) {
    351   RoundTrip("utf32be", "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37", 12);
    352 }
    353 
    354 TEST(iconv, iconv_round_trip_utf32le) {
    355   RoundTrip("utf32le", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12);
    356 }
    357 
    358 TEST(iconv, iconv_round_trip_wchar_t) {
    359   RoundTrip("wchar_t", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12);
    360 }
    361 
    362 static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) {
    363   iconv_t c = iconv_open("wchar_t", src_enc);
    364   char* in = const_cast<char*>(src);
    365   size_t in_bytes = n;
    366   wchar_t out_buf[16];
    367   size_t out_bytes = sizeof(out_buf);
    368   char* out = reinterpret_cast<char*>(out_buf);
    369   errno = 0;
    370   ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
    371   EXPECT_EQ(expected_errno, errno);
    372   EXPECT_EQ(0, iconv_close(c));
    373 }
    374 
    375 TEST(iconv, iconv_EILSEQ_ascii) {
    376   Check(EILSEQ, "ASCII", "\xac", 1); // > 0x7f, so not ASCII.
    377 }
    378 
    379 TEST(iconv, iconv_EILSEQ_utf8_initial) {
    380   Check(EILSEQ, "utf8", "\x82", 1); // Invalid initial byte.
    381 }
    382 
    383 TEST(iconv, iconv_EILSEQ_utf8_non_initial) {
    384   Check(EILSEQ, "utf8", "\xe2\xe2\x82", 3); // Invalid second byte.
    385 }
    386 
    387 TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) {
    388   Check(EILSEQ, "utf16be", "\xdc\x37" "\xd8\x01", 4);
    389 }
    390 
    391 TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) {
    392   Check(EILSEQ, "utf16le", "\x37\xdc" "\x01\xd8", 4);
    393 }
    394 
    395 TEST(iconv, iconv_EINVAL_utf8_short) {
    396   Check(EINVAL, "utf8", "\xe2\x82", 2); // Missing final byte of 3-byte sequence.
    397 }
    398 
    399 TEST(iconv, iconv_EINVAL_utf16be_short) {
    400   Check(EINVAL, "utf16be", "\x00", 1); // Missing second byte.
    401 }
    402 
    403 TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) {
    404   Check(EINVAL, "utf16be", "\xd8\x01", 2);
    405 }
    406 
    407 TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) {
    408   Check(EINVAL, "utf16be", "\xd8\x01\xdc", 3);
    409 }
    410 
    411 TEST(iconv, iconv_EINVAL_utf16le_short) {
    412   Check(EINVAL, "utf16le", "\x24", 1); // Missing second byte.
    413 }
    414 
    415 TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) {
    416   Check(EINVAL, "utf16le", "\x01\xd8", 2);
    417 }
    418 
    419 TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) {
    420   Check(EINVAL, "utf16le", "\x01\xd8\x37", 3);
    421 }
    422 
    423 TEST(iconv, iconv_EINVAL_utf32be_short) {
    424   Check(EINVAL, "utf32be", "\x00\x00\x00", 3); // Missing final byte.
    425 }
    426 
    427 TEST(iconv, iconv_EINVAL_utf32le_short) {
    428   Check(EINVAL, "utf32le", "\x24\x00\x00", 3); // Missing final byte.
    429 }
    430 
    431 TEST(iconv, iconv_initial_shift_state) {
    432   // POSIX: "For state-dependent encodings, the conversion descriptor
    433   // cd is placed into its initial shift state by a call for which inbuf
    434   // is a null pointer, or for which inbuf points to a null pointer."
    435   iconv_t c = iconv_open("utf8", "utf8");
    436   char* in = nullptr;
    437   size_t in_bytes = 0;
    438   wchar_t out_buf[16];
    439   size_t out_bytes = sizeof(out_buf);
    440   char* out = reinterpret_cast<char*>(out_buf);
    441 
    442   // Points to a null pointer...
    443   errno = 0;
    444   ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes));
    445   EXPECT_EQ(0, errno);
    446   EXPECT_EQ(sizeof(out_buf), out_bytes);
    447 
    448   // Is a null pointer...
    449   errno = 0;
    450   ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes));
    451   EXPECT_EQ(0, errno);
    452   EXPECT_EQ(sizeof(out_buf), out_bytes);
    453 
    454   EXPECT_EQ(0, iconv_close(c));
    455 }
    456