1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <gtest/gtest.h> 18 19 #include <iconv.h> 20 21 #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1) 22 23 TEST(iconv, iconv_open_EINVAL) { 24 errno = 0; 25 ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "silly")); 26 ASSERT_EQ(EINVAL, errno); 27 errno = 0; 28 ASSERT_EQ(INVALID_ICONV_T, iconv_open("silly", "UTF-8")); 29 ASSERT_EQ(EINVAL, errno); 30 errno = 0; 31 ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "silly")); 32 ASSERT_EQ(EINVAL, errno); 33 } 34 35 TEST(iconv, iconv_open_comparator) { 36 // Examples from http://www.unicode.org/reports/tr22/#Charset_Alias_Matching: 37 // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..." 38 iconv_t c; 39 ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "utf8")); 40 ASSERT_EQ(0, iconv_close(c)); 41 ASSERT_NE(INVALID_ICONV_T, c = iconv_open("UTF-8", "u.t.f-008")); 42 ASSERT_EQ(0, iconv_close(c)); 43 44 // "...but not "utf-80" or "ut8"." 45 errno = 0; 46 ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "utf-80")); 47 ASSERT_EQ(EINVAL, errno); 48 errno = 0; 49 ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", "ut80")); 50 ASSERT_EQ(EINVAL, errno); 51 } 52 53 TEST(iconv, iconv_smoke) { 54 const char* utf8 = "a"; // U+0666 0xd9 0xa6 // U+1100 0xe1 0x84 0x80 55 char buf[BUFSIZ] = {}; 56 57 iconv_t c = iconv_open("UTF-32LE", "UTF-8"); 58 ASSERT_NE(INVALID_ICONV_T, c); 59 60 char* in = const_cast<char*>(utf8); 61 size_t in_bytes = strlen(in); 62 63 char* out = buf; 64 size_t out_bytes = sizeof(buf); 65 66 EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 67 68 wchar_t* utf16 = reinterpret_cast<wchar_t*>(buf); 69 EXPECT_EQ(L'a', utf16[0]); 70 EXPECT_EQ(L'', utf16[1]); 71 EXPECT_EQ(L'', utf16[2]); 72 EXPECT_EQ(L'\0', utf16[3]); 73 EXPECT_EQ(0U, in_bytes); 74 EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), out_bytes); 75 76 ASSERT_EQ(0, iconv_close(c)); 77 } 78 79 TEST(iconv, iconv_lossy_TRANSLIT) { 80 const char* utf8 = "az"; // U+0666 0xd9 0xa6 // U+1100 0xe1 0x84 0x80 81 char buf[BUFSIZ] = {}; 82 83 iconv_t c = iconv_open("ASCII//TRANSLIT", "UTF-8"); 84 ASSERT_NE(INVALID_ICONV_T, c); 85 86 char* in = const_cast<char*>(utf8); 87 size_t in_bytes = strlen(in); 88 89 char* out = buf; 90 size_t out_bytes = sizeof(buf); 91 92 // Two of the input characters (5 input bytes) aren't representable as ASCII. 93 // With "//TRANSLIT", we use a replacement character, and report the number 94 // of replacements. 95 EXPECT_EQ(2U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 96 97 EXPECT_EQ('a', buf[0]); 98 EXPECT_EQ('?', buf[1]); 99 EXPECT_EQ('?', buf[2]); 100 EXPECT_EQ('z', buf[3]); 101 EXPECT_EQ(0, buf[4]); 102 EXPECT_EQ(0U, in_bytes); 103 EXPECT_EQ(sizeof(buf) - 4, out_bytes); 104 105 ASSERT_EQ(0, iconv_close(c)); 106 } 107 108 TEST(iconv, iconv_lossy_IGNORE) { 109 const char* utf8 = "az"; // U+0666 0xd9 0xa6 // U+1100 0xe1 0x84 0x80 110 char buf[BUFSIZ] = {}; 111 112 iconv_t c = iconv_open("ASCII//IGNORE", "UTF-8"); 113 ASSERT_NE(INVALID_ICONV_T, c); 114 115 char* in = const_cast<char*>(utf8); 116 size_t in_bytes = strlen(in); 117 118 char* out = buf; 119 size_t out_bytes = sizeof(buf); 120 121 // Two of the input characters (5 input bytes) aren't representable as ASCII. 122 // With "//IGNORE", we just skip them (but return failure). 123 errno = 0; 124 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 125 EXPECT_EQ(EILSEQ, errno); 126 127 EXPECT_EQ('a', buf[0]); 128 EXPECT_EQ('z', buf[1]); 129 EXPECT_EQ(0, buf[2]); 130 EXPECT_EQ(0U, in_bytes); 131 EXPECT_EQ(sizeof(buf) - 2, out_bytes); 132 133 ASSERT_EQ(0, iconv_close(c)); 134 } 135 136 TEST(iconv, iconv_lossy) { 137 const char* utf8 = "az"; // U+0666 0xd9 0xa6 // U+1100 0xe1 0x84 0x80 138 char buf[BUFSIZ] = {}; 139 140 iconv_t c = iconv_open("ASCII", "UTF-8"); 141 ASSERT_NE(INVALID_ICONV_T, c); 142 143 char* in = const_cast<char*>(utf8); 144 size_t in_bytes = strlen(in); 145 146 char* out = buf; 147 size_t out_bytes = sizeof(buf); 148 149 // The second input character isn't representable as ASCII, so we stop there. 150 errno = 0; 151 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 152 EXPECT_EQ(EILSEQ, errno); 153 154 EXPECT_EQ('a', buf[0]); 155 EXPECT_EQ(0, buf[1]); 156 EXPECT_EQ(6U, in_bytes); // Two bytes for , three bytes for , and one byte for z. 157 EXPECT_EQ(sizeof(buf) - 1, out_bytes); 158 159 ASSERT_EQ(0, iconv_close(c)); 160 } 161 162 TEST(iconv, iconv_malformed_sequence_EILSEQ) { 163 const char* utf8 = "a\xd9z"; // 0xd9 is the first byte of the two-byte U+0666 . 164 char buf[BUFSIZ] = {}; 165 166 iconv_t c = iconv_open("UTF-8", "UTF-8"); 167 ASSERT_NE(INVALID_ICONV_T, c); 168 169 char* in = const_cast<char*>(utf8); 170 size_t in_bytes = strlen(in); 171 172 char* out = buf; 173 size_t out_bytes = sizeof(buf); 174 175 // The second input byte is a malformed character, so we stop there. 176 errno = 0; 177 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 178 EXPECT_EQ(EILSEQ, errno); 179 EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the invalid sequence. 180 ++in; 181 --in_bytes; 182 errno = 0; 183 EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 184 EXPECT_EQ(0, errno); 185 186 EXPECT_EQ('a', buf[0]); 187 EXPECT_EQ('z', buf[1]); 188 EXPECT_EQ(0, buf[2]); 189 EXPECT_EQ(0U, in_bytes); 190 EXPECT_EQ(sizeof(buf) - 2, out_bytes); 191 192 ASSERT_EQ(0, iconv_close(c)); 193 } 194 195 TEST(iconv, iconv_incomplete_sequence_EINVAL) { 196 const char* utf8 = "a\xd9"; // 0xd9 is the first byte of the two-byte U+0666 . 197 char buf[BUFSIZ] = {}; 198 199 iconv_t c = iconv_open("UTF-8", "UTF-8"); 200 ASSERT_NE(INVALID_ICONV_T, c); 201 202 char* in = const_cast<char*>(utf8); 203 size_t in_bytes = strlen(in); 204 205 char* out = buf; 206 size_t out_bytes = sizeof(buf); 207 208 // The second input byte is just the start of a character, and we don't have any more bytes. 209 errno = 0; 210 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 211 EXPECT_EQ(EINVAL, errno); 212 EXPECT_EQ('\xd9', *in); // *in is left pointing to the start of the incomplete sequence. 213 214 EXPECT_EQ('a', buf[0]); 215 EXPECT_EQ(0, buf[1]); 216 EXPECT_EQ(1U, in_bytes); 217 EXPECT_EQ(sizeof(buf) - 1, out_bytes); 218 219 ASSERT_EQ(0, iconv_close(c)); 220 } 221 222 TEST(iconv, iconv_E2BIG) { 223 const char* utf8 = "abc"; 224 char buf[BUFSIZ] = {}; 225 226 iconv_t c = iconv_open("UTF-8", "UTF-8"); 227 ASSERT_NE(INVALID_ICONV_T, c); 228 229 char* in = const_cast<char*>(utf8); 230 size_t in_bytes = strlen(in); 231 232 char* out = buf; 233 size_t out_bytes = 1; 234 235 // We need three bytes, so one isn't enough (but we will make progress). 236 out_bytes = 1; 237 errno = 0; 238 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 239 EXPECT_EQ(E2BIG, errno); 240 EXPECT_EQ(2U, in_bytes); 241 EXPECT_EQ(0U, out_bytes); 242 243 // Two bytes left, so zero isn't enough (and we can't even make progress). 244 out_bytes = 0; 245 errno = 0; 246 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 247 EXPECT_EQ(E2BIG, errno); 248 EXPECT_EQ(2U, in_bytes); 249 EXPECT_EQ(0U, out_bytes); 250 251 // Two bytes left, so one isn't enough (but we will make progress). 252 out_bytes = 1; 253 errno = 0; 254 EXPECT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 255 EXPECT_EQ(E2BIG, errno); 256 EXPECT_EQ(1U, in_bytes); 257 EXPECT_EQ(0U, out_bytes); 258 259 // One byte left, so one byte is now enough. 260 out_bytes = 1; 261 errno = 0; 262 EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes)); 263 EXPECT_EQ(0, errno); 264 EXPECT_EQ(0U, in_bytes); 265 EXPECT_EQ(0U, out_bytes); 266 267 EXPECT_EQ('a', buf[0]); 268 EXPECT_EQ('b', buf[1]); 269 EXPECT_EQ('c', buf[2]); 270 EXPECT_EQ(0, buf[3]); 271 272 ASSERT_EQ(0, iconv_close(c)); 273 } 274 275 TEST(iconv, iconv_invalid_converter_EBADF) { 276 char* in = nullptr; 277 char* out = nullptr; 278 size_t in_bytes = 0; 279 size_t out_bytes = 0; 280 errno = 0; 281 ASSERT_EQ(static_cast<size_t>(-1), iconv(INVALID_ICONV_T, &in, &in_bytes, &out, &out_bytes)); 282 ASSERT_EQ(EBADF, errno); 283 } 284 285 TEST(iconv, iconv_close_invalid_converter_EBADF) { 286 errno = 0; 287 ASSERT_EQ(-1, iconv_close(INVALID_ICONV_T)); 288 ASSERT_EQ(EBADF, errno); 289 } 290 291 static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) { 292 // Examples from https://en.wikipedia.org/wiki/UTF-16. 293 const char* utf8 = "$"; // U+0024, U+20AC, U+10437. 294 295 iconv_t c = iconv_open(dst_enc, "UTF-8"); 296 ASSERT_NE(INVALID_ICONV_T, c) << dst_enc; 297 298 char* in = const_cast<char*>(utf8); 299 size_t in_bytes = strlen(utf8); 300 char buf[BUFSIZ] = {}; 301 char* out = buf; 302 size_t out_bytes = sizeof(buf); 303 size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes); 304 305 // Check we got the bytes we were expecting. 306 for (size_t i = 0; i < n; ++i) { 307 EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc; 308 } 309 310 ASSERT_EQ(0, iconv_close(c)); 311 312 // We can't round-trip if there were replacements. 313 if (strstr(dst_enc, "ascii")) { 314 GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n"; 315 return; 316 } 317 ASSERT_EQ(0U, replacement_count); 318 319 c = iconv_open("UTF-8", dst_enc); 320 ASSERT_NE(INVALID_ICONV_T, c) << dst_enc; 321 322 in = buf; 323 in_bytes = n; 324 char buf2[BUFSIZ] = {}; 325 out = buf2; 326 out_bytes = sizeof(buf2); 327 iconv(c, &in, &in_bytes, &out, &out_bytes); 328 329 ASSERT_STREQ(utf8, buf2) << dst_enc; 330 331 ASSERT_EQ(0, iconv_close(c)); 332 } 333 334 TEST(iconv, iconv_round_trip_ascii) { 335 RoundTrip("ascii//TRANSLIT", "$??", 3); 336 } 337 338 TEST(iconv, iconv_round_trip_utf8) { 339 RoundTrip("utf8", "\x24\xe2\x82\xac\xf0\x90\x90\xb7", 8); 340 } 341 342 TEST(iconv, iconv_round_trip_utf16be) { 343 RoundTrip("utf16be", "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37", 8); 344 } 345 346 TEST(iconv, iconv_round_trip_utf16le) { 347 RoundTrip("utf16le", "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc", 8); 348 } 349 350 TEST(iconv, iconv_round_trip_utf32be) { 351 RoundTrip("utf32be", "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37", 12); 352 } 353 354 TEST(iconv, iconv_round_trip_utf32le) { 355 RoundTrip("utf32le", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12); 356 } 357 358 TEST(iconv, iconv_round_trip_wchar_t) { 359 RoundTrip("wchar_t", "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00", 12); 360 } 361 362 static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) { 363 iconv_t c = iconv_open("wchar_t", src_enc); 364 char* in = const_cast<char*>(src); 365 size_t in_bytes = n; 366 wchar_t out_buf[16]; 367 size_t out_bytes = sizeof(out_buf); 368 char* out = reinterpret_cast<char*>(out_buf); 369 errno = 0; 370 ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes)); 371 EXPECT_EQ(expected_errno, errno); 372 EXPECT_EQ(0, iconv_close(c)); 373 } 374 375 TEST(iconv, iconv_EILSEQ_ascii) { 376 Check(EILSEQ, "ASCII", "\xac", 1); // > 0x7f, so not ASCII. 377 } 378 379 TEST(iconv, iconv_EILSEQ_utf8_initial) { 380 Check(EILSEQ, "utf8", "\x82", 1); // Invalid initial byte. 381 } 382 383 TEST(iconv, iconv_EILSEQ_utf8_non_initial) { 384 Check(EILSEQ, "utf8", "\xe2\xe2\x82", 3); // Invalid second byte. 385 } 386 387 TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) { 388 Check(EILSEQ, "utf16be", "\xdc\x37" "\xd8\x01", 4); 389 } 390 391 TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) { 392 Check(EILSEQ, "utf16le", "\x37\xdc" "\x01\xd8", 4); 393 } 394 395 TEST(iconv, iconv_EINVAL_utf8_short) { 396 Check(EINVAL, "utf8", "\xe2\x82", 2); // Missing final byte of 3-byte sequence. 397 } 398 399 TEST(iconv, iconv_EINVAL_utf16be_short) { 400 Check(EINVAL, "utf16be", "\x00", 1); // Missing second byte. 401 } 402 403 TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) { 404 Check(EINVAL, "utf16be", "\xd8\x01", 2); 405 } 406 407 TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) { 408 Check(EINVAL, "utf16be", "\xd8\x01\xdc", 3); 409 } 410 411 TEST(iconv, iconv_EINVAL_utf16le_short) { 412 Check(EINVAL, "utf16le", "\x24", 1); // Missing second byte. 413 } 414 415 TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) { 416 Check(EINVAL, "utf16le", "\x01\xd8", 2); 417 } 418 419 TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) { 420 Check(EINVAL, "utf16le", "\x01\xd8\x37", 3); 421 } 422 423 TEST(iconv, iconv_EINVAL_utf32be_short) { 424 Check(EINVAL, "utf32be", "\x00\x00\x00", 3); // Missing final byte. 425 } 426 427 TEST(iconv, iconv_EINVAL_utf32le_short) { 428 Check(EINVAL, "utf32le", "\x24\x00\x00", 3); // Missing final byte. 429 } 430 431 TEST(iconv, iconv_initial_shift_state) { 432 // POSIX: "For state-dependent encodings, the conversion descriptor 433 // cd is placed into its initial shift state by a call for which inbuf 434 // is a null pointer, or for which inbuf points to a null pointer." 435 iconv_t c = iconv_open("utf8", "utf8"); 436 char* in = nullptr; 437 size_t in_bytes = 0; 438 wchar_t out_buf[16]; 439 size_t out_bytes = sizeof(out_buf); 440 char* out = reinterpret_cast<char*>(out_buf); 441 442 // Points to a null pointer... 443 errno = 0; 444 ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes)); 445 EXPECT_EQ(0, errno); 446 EXPECT_EQ(sizeof(out_buf), out_bytes); 447 448 // Is a null pointer... 449 errno = 0; 450 ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes)); 451 EXPECT_EQ(0, errno); 452 EXPECT_EQ(sizeof(out_buf), out_bytes); 453 454 EXPECT_EQ(0, iconv_close(c)); 455 } 456