1 // Locale support (codecvt) -*- C++ -*- 2 3 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009 4 // Free Software Foundation, Inc. 5 // 6 // This file is part of the GNU ISO C++ Library. This library is free 7 // software; you can redistribute it and/or modify it under the 8 // terms of the GNU General Public License as published by the 9 // Free Software Foundation; either version 3, or (at your option) 10 // any later version. 11 12 // This library is distributed in the hope that it will be useful, 13 // but WITHOUT ANY WARRANTY; without even the implied warranty of 14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 // GNU General Public License for more details. 16 17 // Under Section 7 of GPL version 3, you are granted additional 18 // permissions described in the GCC Runtime Library Exception, version 19 // 3.1, as published by the Free Software Foundation. 20 21 // You should have received a copy of the GNU General Public License and 22 // a copy of the GCC Runtime Library Exception along with this program; 23 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24 // <http://www.gnu.org/licenses/>. 25 26 // 27 // ISO C++ 14882: 22.2.1.5 Template class codecvt 28 // 29 30 // Written by Benjamin Kosnik <bkoz (at) redhat.com> 31 32 /** @file ext/codecvt_specializations.h 33 * This file is a GNU extension to the Standard C++ Library. 34 */ 35 36 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H 37 #define _EXT_CODECVT_SPECIALIZATIONS_H 1 38 39 #include <bits/c++config.h> 40 #include <locale> 41 #include <iconv.h> 42 43 _GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx) 44 45 /// Extension to use iconv for dealing with character encodings. 46 // This includes conversions and comparisons between various character 47 // sets. This object encapsulates data that may need to be shared between 48 // char_traits, codecvt and ctype. 49 class encoding_state 50 { 51 public: 52 // Types: 53 // NB: A conversion descriptor subsumes and enhances the 54 // functionality of a simple state type such as mbstate_t. 55 typedef iconv_t descriptor_type; 56 57 protected: 58 // Name of internal character set encoding. 59 std::string _M_int_enc; 60 61 // Name of external character set encoding. 62 std::string _M_ext_enc; 63 64 // Conversion descriptor between external encoding to internal encoding. 65 descriptor_type _M_in_desc; 66 67 // Conversion descriptor between internal encoding to external encoding. 68 descriptor_type _M_out_desc; 69 70 // The byte-order marker for the external encoding, if necessary. 71 int _M_ext_bom; 72 73 // The byte-order marker for the internal encoding, if necessary. 74 int _M_int_bom; 75 76 // Number of external bytes needed to construct one complete 77 // character in the internal encoding. 78 // NB: -1 indicates variable, or stateful, encodings. 79 int _M_bytes; 80 81 public: 82 explicit 83 encoding_state() 84 : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0) 85 { } 86 87 explicit 88 encoding_state(const char* __int, const char* __ext, 89 int __ibom = 0, int __ebom = 0, int __bytes = 1) 90 : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0), 91 _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes) 92 { init(); } 93 94 // 21.1.2 traits typedefs 95 // p4 96 // typedef STATE_T state_type 97 // requires: state_type shall meet the requirements of 98 // CopyConstructible types (20.1.3) 99 // NB: This does not preserve the actual state of the conversion 100 // descriptor member, but it does duplicate the encoding 101 // information. 102 encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0) 103 { construct(__obj); } 104 105 // Need assignment operator as well. 106 encoding_state& 107 operator=(const encoding_state& __obj) 108 { 109 construct(__obj); 110 return *this; 111 } 112 113 ~encoding_state() 114 { destroy(); } 115 116 bool 117 good() const throw() 118 { 119 const descriptor_type __err = (iconv_t)(-1); 120 bool __test = _M_in_desc && _M_in_desc != __err; 121 __test &= _M_out_desc && _M_out_desc != __err; 122 return __test; 123 } 124 125 int 126 character_ratio() const 127 { return _M_bytes; } 128 129 const std::string 130 internal_encoding() const 131 { return _M_int_enc; } 132 133 int 134 internal_bom() const 135 { return _M_int_bom; } 136 137 const std::string 138 external_encoding() const 139 { return _M_ext_enc; } 140 141 int 142 external_bom() const 143 { return _M_ext_bom; } 144 145 const descriptor_type& 146 in_descriptor() const 147 { return _M_in_desc; } 148 149 const descriptor_type& 150 out_descriptor() const 151 { return _M_out_desc; } 152 153 protected: 154 void 155 init() 156 { 157 const descriptor_type __err = (iconv_t)(-1); 158 const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size(); 159 if (!_M_in_desc && __have_encodings) 160 { 161 _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str()); 162 if (_M_in_desc == __err) 163 std::__throw_runtime_error(__N("encoding_state::_M_init " 164 "creating iconv input descriptor failed")); 165 } 166 if (!_M_out_desc && __have_encodings) 167 { 168 _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str()); 169 if (_M_out_desc == __err) 170 std::__throw_runtime_error(__N("encoding_state::_M_init " 171 "creating iconv output descriptor failed")); 172 } 173 } 174 175 void 176 construct(const encoding_state& __obj) 177 { 178 destroy(); 179 _M_int_enc = __obj._M_int_enc; 180 _M_ext_enc = __obj._M_ext_enc; 181 _M_ext_bom = __obj._M_ext_bom; 182 _M_int_bom = __obj._M_int_bom; 183 _M_bytes = __obj._M_bytes; 184 init(); 185 } 186 187 void 188 destroy() throw() 189 { 190 const descriptor_type __err = (iconv_t)(-1); 191 if (_M_in_desc && _M_in_desc != __err) 192 { 193 iconv_close(_M_in_desc); 194 _M_in_desc = 0; 195 } 196 if (_M_out_desc && _M_out_desc != __err) 197 { 198 iconv_close(_M_out_desc); 199 _M_out_desc = 0; 200 } 201 } 202 }; 203 204 /// encoding_char_traits 205 // Custom traits type with encoding_state for the state type, and the 206 // associated fpos<encoding_state> for the position type, all other 207 // bits equivalent to the required char_traits instantiations. 208 template<typename _CharT> 209 struct encoding_char_traits : public std::char_traits<_CharT> 210 { 211 typedef encoding_state state_type; 212 typedef typename std::fpos<state_type> pos_type; 213 }; 214 215 _GLIBCXX_END_NAMESPACE 216 217 218 _GLIBCXX_BEGIN_NAMESPACE(std) 219 220 using __gnu_cxx::encoding_state; 221 222 /// codecvt<InternT, _ExternT, encoding_state> specialization. 223 // This partial specialization takes advantage of iconv to provide 224 // code conversions between a large number of character encodings. 225 template<typename _InternT, typename _ExternT> 226 class codecvt<_InternT, _ExternT, encoding_state> 227 : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state> 228 { 229 public: 230 // Types: 231 typedef codecvt_base::result result; 232 typedef _InternT intern_type; 233 typedef _ExternT extern_type; 234 typedef __gnu_cxx::encoding_state state_type; 235 typedef state_type::descriptor_type descriptor_type; 236 237 // Data Members: 238 static locale::id id; 239 240 explicit 241 codecvt(size_t __refs = 0) 242 : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs) 243 { } 244 245 explicit 246 codecvt(state_type& __enc, size_t __refs = 0) 247 : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs) 248 { } 249 250 protected: 251 virtual 252 ~codecvt() { } 253 254 virtual result 255 do_out(state_type& __state, const intern_type* __from, 256 const intern_type* __from_end, const intern_type*& __from_next, 257 extern_type* __to, extern_type* __to_end, 258 extern_type*& __to_next) const; 259 260 virtual result 261 do_unshift(state_type& __state, extern_type* __to, 262 extern_type* __to_end, extern_type*& __to_next) const; 263 264 virtual result 265 do_in(state_type& __state, const extern_type* __from, 266 const extern_type* __from_end, const extern_type*& __from_next, 267 intern_type* __to, intern_type* __to_end, 268 intern_type*& __to_next) const; 269 270 virtual int 271 do_encoding() const throw(); 272 273 virtual bool 274 do_always_noconv() const throw(); 275 276 virtual int 277 do_length(state_type&, const extern_type* __from, 278 const extern_type* __end, size_t __max) const; 279 280 virtual int 281 do_max_length() const throw(); 282 }; 283 284 template<typename _InternT, typename _ExternT> 285 locale::id 286 codecvt<_InternT, _ExternT, encoding_state>::id; 287 288 // This adaptor works around the signature problems of the second 289 // argument to iconv(): SUSv2 and others use 'const char**', but glibc 2.2 290 // uses 'char**', which matches the POSIX 1003.1-2001 standard. 291 // Using this adaptor, g++ will do the work for us. 292 template<typename _Tp> 293 inline size_t 294 __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*), 295 iconv_t __cd, char** __inbuf, size_t* __inbytes, 296 char** __outbuf, size_t* __outbytes) 297 { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); } 298 299 template<typename _InternT, typename _ExternT> 300 codecvt_base::result 301 codecvt<_InternT, _ExternT, encoding_state>:: 302 do_out(state_type& __state, const intern_type* __from, 303 const intern_type* __from_end, const intern_type*& __from_next, 304 extern_type* __to, extern_type* __to_end, 305 extern_type*& __to_next) const 306 { 307 result __ret = codecvt_base::error; 308 if (__state.good()) 309 { 310 const descriptor_type& __desc = __state.out_descriptor(); 311 const size_t __fmultiple = sizeof(intern_type); 312 size_t __fbytes = __fmultiple * (__from_end - __from); 313 const size_t __tmultiple = sizeof(extern_type); 314 size_t __tbytes = __tmultiple * (__to_end - __to); 315 316 // Argument list for iconv specifies a byte sequence. Thus, 317 // all to/from arrays must be brutally casted to char*. 318 char* __cto = reinterpret_cast<char*>(__to); 319 char* __cfrom; 320 size_t __conv; 321 322 // Some encodings need a byte order marker as the first item 323 // in the byte stream, to designate endian-ness. The default 324 // value for the byte order marker is NULL, so if this is 325 // the case, it's not necessary and we can just go on our 326 // merry way. 327 int __int_bom = __state.internal_bom(); 328 if (__int_bom) 329 { 330 size_t __size = __from_end - __from; 331 intern_type* __cfixed = static_cast<intern_type*> 332 (__builtin_alloca(sizeof(intern_type) * (__size + 1))); 333 __cfixed[0] = static_cast<intern_type>(__int_bom); 334 char_traits<intern_type>::copy(__cfixed + 1, __from, __size); 335 __cfrom = reinterpret_cast<char*>(__cfixed); 336 __conv = __iconv_adaptor(iconv, __desc, &__cfrom, 337 &__fbytes, &__cto, &__tbytes); 338 } 339 else 340 { 341 intern_type* __cfixed = const_cast<intern_type*>(__from); 342 __cfrom = reinterpret_cast<char*>(__cfixed); 343 __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes, 344 &__cto, &__tbytes); 345 } 346 347 if (__conv != size_t(-1)) 348 { 349 __from_next = reinterpret_cast<const intern_type*>(__cfrom); 350 __to_next = reinterpret_cast<extern_type*>(__cto); 351 __ret = codecvt_base::ok; 352 } 353 else 354 { 355 if (__fbytes < __fmultiple * (__from_end - __from)) 356 { 357 __from_next = reinterpret_cast<const intern_type*>(__cfrom); 358 __to_next = reinterpret_cast<extern_type*>(__cto); 359 __ret = codecvt_base::partial; 360 } 361 else 362 __ret = codecvt_base::error; 363 } 364 } 365 return __ret; 366 } 367 368 template<typename _InternT, typename _ExternT> 369 codecvt_base::result 370 codecvt<_InternT, _ExternT, encoding_state>:: 371 do_unshift(state_type& __state, extern_type* __to, 372 extern_type* __to_end, extern_type*& __to_next) const 373 { 374 result __ret = codecvt_base::error; 375 if (__state.good()) 376 { 377 const descriptor_type& __desc = __state.in_descriptor(); 378 const size_t __tmultiple = sizeof(intern_type); 379 size_t __tlen = __tmultiple * (__to_end - __to); 380 381 // Argument list for iconv specifies a byte sequence. Thus, 382 // all to/from arrays must be brutally casted to char*. 383 char* __cto = reinterpret_cast<char*>(__to); 384 size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL, 385 &__cto, &__tlen); 386 387 if (__conv != size_t(-1)) 388 { 389 __to_next = reinterpret_cast<extern_type*>(__cto); 390 if (__tlen == __tmultiple * (__to_end - __to)) 391 __ret = codecvt_base::noconv; 392 else if (__tlen == 0) 393 __ret = codecvt_base::ok; 394 else 395 __ret = codecvt_base::partial; 396 } 397 else 398 __ret = codecvt_base::error; 399 } 400 return __ret; 401 } 402 403 template<typename _InternT, typename _ExternT> 404 codecvt_base::result 405 codecvt<_InternT, _ExternT, encoding_state>:: 406 do_in(state_type& __state, const extern_type* __from, 407 const extern_type* __from_end, const extern_type*& __from_next, 408 intern_type* __to, intern_type* __to_end, 409 intern_type*& __to_next) const 410 { 411 result __ret = codecvt_base::error; 412 if (__state.good()) 413 { 414 const descriptor_type& __desc = __state.in_descriptor(); 415 const size_t __fmultiple = sizeof(extern_type); 416 size_t __flen = __fmultiple * (__from_end - __from); 417 const size_t __tmultiple = sizeof(intern_type); 418 size_t __tlen = __tmultiple * (__to_end - __to); 419 420 // Argument list for iconv specifies a byte sequence. Thus, 421 // all to/from arrays must be brutally casted to char*. 422 char* __cto = reinterpret_cast<char*>(__to); 423 char* __cfrom; 424 size_t __conv; 425 426 // Some encodings need a byte order marker as the first item 427 // in the byte stream, to designate endian-ness. The default 428 // value for the byte order marker is NULL, so if this is 429 // the case, it's not necessary and we can just go on our 430 // merry way. 431 int __ext_bom = __state.external_bom(); 432 if (__ext_bom) 433 { 434 size_t __size = __from_end - __from; 435 extern_type* __cfixed = static_cast<extern_type*> 436 (__builtin_alloca(sizeof(extern_type) * (__size + 1))); 437 __cfixed[0] = static_cast<extern_type>(__ext_bom); 438 char_traits<extern_type>::copy(__cfixed + 1, __from, __size); 439 __cfrom = reinterpret_cast<char*>(__cfixed); 440 __conv = __iconv_adaptor(iconv, __desc, &__cfrom, 441 &__flen, &__cto, &__tlen); 442 } 443 else 444 { 445 extern_type* __cfixed = const_cast<extern_type*>(__from); 446 __cfrom = reinterpret_cast<char*>(__cfixed); 447 __conv = __iconv_adaptor(iconv, __desc, &__cfrom, 448 &__flen, &__cto, &__tlen); 449 } 450 451 452 if (__conv != size_t(-1)) 453 { 454 __from_next = reinterpret_cast<const extern_type*>(__cfrom); 455 __to_next = reinterpret_cast<intern_type*>(__cto); 456 __ret = codecvt_base::ok; 457 } 458 else 459 { 460 if (__flen < static_cast<size_t>(__from_end - __from)) 461 { 462 __from_next = reinterpret_cast<const extern_type*>(__cfrom); 463 __to_next = reinterpret_cast<intern_type*>(__cto); 464 __ret = codecvt_base::partial; 465 } 466 else 467 __ret = codecvt_base::error; 468 } 469 } 470 return __ret; 471 } 472 473 template<typename _InternT, typename _ExternT> 474 int 475 codecvt<_InternT, _ExternT, encoding_state>:: 476 do_encoding() const throw() 477 { 478 int __ret = 0; 479 if (sizeof(_ExternT) <= sizeof(_InternT)) 480 __ret = sizeof(_InternT) / sizeof(_ExternT); 481 return __ret; 482 } 483 484 template<typename _InternT, typename _ExternT> 485 bool 486 codecvt<_InternT, _ExternT, encoding_state>:: 487 do_always_noconv() const throw() 488 { return false; } 489 490 template<typename _InternT, typename _ExternT> 491 int 492 codecvt<_InternT, _ExternT, encoding_state>:: 493 do_length(state_type&, const extern_type* __from, 494 const extern_type* __end, size_t __max) const 495 { return std::min(__max, static_cast<size_t>(__end - __from)); } 496 497 // _GLIBCXX_RESOLVE_LIB_DEFECTS 498 // 74. Garbled text for codecvt::do_max_length 499 template<typename _InternT, typename _ExternT> 500 int 501 codecvt<_InternT, _ExternT, encoding_state>:: 502 do_max_length() const throw() 503 { return 1; } 504 505 _GLIBCXX_END_NAMESPACE 506 507 #endif 508