Home | History | Annotate | Download | only in ext
      1 // Locale support (codecvt) -*- C++ -*-
      2 
      3 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
      4 // 2008, 2009, 2010
      5 // Free Software Foundation, Inc.
      6 //
      7 // This file is part of the GNU ISO C++ Library.  This library is free
      8 // software; you can redistribute it and/or modify it under the
      9 // terms of the GNU General Public License as published by the
     10 // Free Software Foundation; either version 3, or (at your option)
     11 // any later version.
     12 
     13 // This library is distributed in the hope that it will be useful,
     14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 // GNU General Public License for more details.
     17 
     18 // Under Section 7 of GPL version 3, you are granted additional
     19 // permissions described in the GCC Runtime Library Exception, version
     20 // 3.1, as published by the Free Software Foundation.
     21 
     22 // You should have received a copy of the GNU General Public License and
     23 // a copy of the GCC Runtime Library Exception along with this program;
     24 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     25 // <http://www.gnu.org/licenses/>.
     26 
     27 //
     28 // ISO C++ 14882: 22.2.1.5 Template class codecvt
     29 //
     30 
     31 // Written by Benjamin Kosnik <bkoz (at) redhat.com>
     32 
     33 /** @file ext/codecvt_specializations.h
     34  *  This file is a GNU extension to the Standard C++ Library.
     35  */
     36 
     37 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
     38 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
     39 
     40 #include <bits/c++config.h>
     41 #include <locale>
     42 #include <iconv.h>
     43 
     44 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
     45 {
     46 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     47 
     48   /// Extension to use iconv for dealing with character encodings.
     49   // This includes conversions and comparisons between various character
     50   // sets.  This object encapsulates data that may need to be shared between
     51   // char_traits, codecvt and ctype.
     52   class encoding_state
     53   {
     54   public:
     55     // Types:
     56     // NB: A conversion descriptor subsumes and enhances the
     57     // functionality of a simple state type such as mbstate_t.
     58     typedef iconv_t	descriptor_type;
     59 
     60   protected:
     61     // Name of internal character set encoding.
     62     std::string	       	_M_int_enc;
     63 
     64     // Name of external character set encoding.
     65     std::string  	_M_ext_enc;
     66 
     67     // Conversion descriptor between external encoding to internal encoding.
     68     descriptor_type	_M_in_desc;
     69 
     70     // Conversion descriptor between internal encoding to external encoding.
     71     descriptor_type	_M_out_desc;
     72 
     73     // The byte-order marker for the external encoding, if necessary.
     74     int			_M_ext_bom;
     75 
     76     // The byte-order marker for the internal encoding, if necessary.
     77     int			_M_int_bom;
     78 
     79     // Number of external bytes needed to construct one complete
     80     // character in the internal encoding.
     81     // NB: -1 indicates variable, or stateful, encodings.
     82     int 		_M_bytes;
     83 
     84   public:
     85     explicit
     86     encoding_state()
     87     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
     88     { }
     89 
     90     explicit
     91     encoding_state(const char* __int, const char* __ext,
     92 		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
     93     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
     94       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
     95     { init(); }
     96 
     97     // 21.1.2 traits typedefs
     98     // p4
     99     // typedef STATE_T state_type
    100     // requires: state_type shall meet the requirements of
    101     // CopyConstructible types (20.1.3)
    102     // NB: This does not preserve the actual state of the conversion
    103     // descriptor member, but it does duplicate the encoding
    104     // information.
    105     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
    106     { construct(__obj); }
    107 
    108     // Need assignment operator as well.
    109     encoding_state&
    110     operator=(const encoding_state& __obj)
    111     {
    112       construct(__obj);
    113       return *this;
    114     }
    115 
    116     ~encoding_state()
    117     { destroy(); }
    118 
    119     bool
    120     good() const throw()
    121     {
    122       const descriptor_type __err = (iconv_t)(-1);
    123       bool __test = _M_in_desc && _M_in_desc != __err;
    124       __test &=  _M_out_desc && _M_out_desc != __err;
    125       return __test;
    126     }
    127 
    128     int
    129     character_ratio() const
    130     { return _M_bytes; }
    131 
    132     const std::string
    133     internal_encoding() const
    134     { return _M_int_enc; }
    135 
    136     int
    137     internal_bom() const
    138     { return _M_int_bom; }
    139 
    140     const std::string
    141     external_encoding() const
    142     { return _M_ext_enc; }
    143 
    144     int
    145     external_bom() const
    146     { return _M_ext_bom; }
    147 
    148     const descriptor_type&
    149     in_descriptor() const
    150     { return _M_in_desc; }
    151 
    152     const descriptor_type&
    153     out_descriptor() const
    154     { return _M_out_desc; }
    155 
    156   protected:
    157     void
    158     init()
    159     {
    160       const descriptor_type __err = (iconv_t)(-1);
    161       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    162       if (!_M_in_desc && __have_encodings)
    163 	{
    164 	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
    165 	  if (_M_in_desc == __err)
    166 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
    167 				    "creating iconv input descriptor failed"));
    168 	}
    169       if (!_M_out_desc && __have_encodings)
    170 	{
    171 	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
    172 	  if (_M_out_desc == __err)
    173 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
    174 				  "creating iconv output descriptor failed"));
    175 	}
    176     }
    177 
    178     void
    179     construct(const encoding_state& __obj)
    180     {
    181       destroy();
    182       _M_int_enc = __obj._M_int_enc;
    183       _M_ext_enc = __obj._M_ext_enc;
    184       _M_ext_bom = __obj._M_ext_bom;
    185       _M_int_bom = __obj._M_int_bom;
    186       _M_bytes = __obj._M_bytes;
    187       init();
    188     }
    189 
    190     void
    191     destroy() throw()
    192     {
    193       const descriptor_type __err = (iconv_t)(-1);
    194       if (_M_in_desc && _M_in_desc != __err)
    195 	{
    196 	  iconv_close(_M_in_desc);
    197 	  _M_in_desc = 0;
    198 	}
    199       if (_M_out_desc && _M_out_desc != __err)
    200 	{
    201 	  iconv_close(_M_out_desc);
    202 	  _M_out_desc = 0;
    203 	}
    204     }
    205   };
    206 
    207   /// encoding_char_traits
    208   // Custom traits type with encoding_state for the state type, and the
    209   // associated fpos<encoding_state> for the position type, all other
    210   // bits equivalent to the required char_traits instantiations.
    211   template<typename _CharT>
    212     struct encoding_char_traits : public std::char_traits<_CharT>
    213     {
    214       typedef encoding_state				state_type;
    215       typedef typename std::fpos<state_type>		pos_type;
    216     };
    217 
    218 _GLIBCXX_END_NAMESPACE_VERSION
    219 } // namespace
    220 
    221 
    222 namespace std _GLIBCXX_VISIBILITY(default)
    223 {
    224 _GLIBCXX_BEGIN_NAMESPACE_VERSION
    225 
    226   using __gnu_cxx::encoding_state;
    227 
    228   /// codecvt<InternT, _ExternT, encoding_state> specialization.
    229   // This partial specialization takes advantage of iconv to provide
    230   // code conversions between a large number of character encodings.
    231   template<typename _InternT, typename _ExternT>
    232     class codecvt<_InternT, _ExternT, encoding_state>
    233     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
    234     {
    235     public:
    236       // Types:
    237       typedef codecvt_base::result			result;
    238       typedef _InternT 					intern_type;
    239       typedef _ExternT 					extern_type;
    240       typedef __gnu_cxx::encoding_state 		state_type;
    241       typedef state_type::descriptor_type 		descriptor_type;
    242 
    243       // Data Members:
    244       static locale::id 		id;
    245 
    246       explicit
    247       codecvt(size_t __refs = 0)
    248       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
    249       { }
    250 
    251       explicit
    252       codecvt(state_type& __enc, size_t __refs = 0)
    253       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
    254       { }
    255 
    256      protected:
    257       virtual
    258       ~codecvt() { }
    259 
    260       virtual result
    261       do_out(state_type& __state, const intern_type* __from,
    262 	     const intern_type* __from_end, const intern_type*& __from_next,
    263 	     extern_type* __to, extern_type* __to_end,
    264 	     extern_type*& __to_next) const;
    265 
    266       virtual result
    267       do_unshift(state_type& __state, extern_type* __to,
    268 		 extern_type* __to_end, extern_type*& __to_next) const;
    269 
    270       virtual result
    271       do_in(state_type& __state, const extern_type* __from,
    272 	    const extern_type* __from_end, const extern_type*& __from_next,
    273 	    intern_type* __to, intern_type* __to_end,
    274 	    intern_type*& __to_next) const;
    275 
    276       virtual int
    277       do_encoding() const throw();
    278 
    279       virtual bool
    280       do_always_noconv() const throw();
    281 
    282       virtual int
    283       do_length(state_type&, const extern_type* __from,
    284 		const extern_type* __end, size_t __max) const;
    285 
    286       virtual int
    287       do_max_length() const throw();
    288     };
    289 
    290   template<typename _InternT, typename _ExternT>
    291     locale::id
    292     codecvt<_InternT, _ExternT, encoding_state>::id;
    293 
    294   // This adaptor works around the signature problems of the second
    295   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
    296   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
    297   // Using this adaptor, g++ will do the work for us.
    298   template<typename _Tp>
    299     inline size_t
    300     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
    301                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
    302                     char** __outbuf, size_t* __outbytes)
    303     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
    304 
    305   template<typename _InternT, typename _ExternT>
    306     codecvt_base::result
    307     codecvt<_InternT, _ExternT, encoding_state>::
    308     do_out(state_type& __state, const intern_type* __from,
    309 	   const intern_type* __from_end, const intern_type*& __from_next,
    310 	   extern_type* __to, extern_type* __to_end,
    311 	   extern_type*& __to_next) const
    312     {
    313       result __ret = codecvt_base::error;
    314       if (__state.good())
    315 	{
    316 	  const descriptor_type& __desc = __state.out_descriptor();
    317 	  const size_t __fmultiple = sizeof(intern_type);
    318 	  size_t __fbytes = __fmultiple * (__from_end - __from);
    319 	  const size_t __tmultiple = sizeof(extern_type);
    320 	  size_t __tbytes = __tmultiple * (__to_end - __to);
    321 
    322 	  // Argument list for iconv specifies a byte sequence. Thus,
    323 	  // all to/from arrays must be brutally casted to char*.
    324 	  char* __cto = reinterpret_cast<char*>(__to);
    325 	  char* __cfrom;
    326 	  size_t __conv;
    327 
    328 	  // Some encodings need a byte order marker as the first item
    329 	  // in the byte stream, to designate endian-ness. The default
    330 	  // value for the byte order marker is NULL, so if this is
    331 	  // the case, it's not necessary and we can just go on our
    332 	  // merry way.
    333 	  int __int_bom = __state.internal_bom();
    334 	  if (__int_bom)
    335 	    {
    336 	      size_t __size = __from_end - __from;
    337 	      intern_type* __cfixed = static_cast<intern_type*>
    338 		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
    339 	      __cfixed[0] = static_cast<intern_type>(__int_bom);
    340 	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
    341 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    342 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    343                                         &__fbytes, &__cto, &__tbytes);
    344 	    }
    345 	  else
    346 	    {
    347 	      intern_type* __cfixed = const_cast<intern_type*>(__from);
    348 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    349 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
    350 				       &__cto, &__tbytes);
    351 	    }
    352 
    353 	  if (__conv != size_t(-1))
    354 	    {
    355 	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
    356 	      __to_next = reinterpret_cast<extern_type*>(__cto);
    357 	      __ret = codecvt_base::ok;
    358 	    }
    359 	  else
    360 	    {
    361 	      if (__fbytes < __fmultiple * (__from_end - __from))
    362 		{
    363 		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
    364 		  __to_next = reinterpret_cast<extern_type*>(__cto);
    365 		  __ret = codecvt_base::partial;
    366 		}
    367 	      else
    368 		__ret = codecvt_base::error;
    369 	    }
    370 	}
    371       return __ret;
    372     }
    373 
    374   template<typename _InternT, typename _ExternT>
    375     codecvt_base::result
    376     codecvt<_InternT, _ExternT, encoding_state>::
    377     do_unshift(state_type& __state, extern_type* __to,
    378 	       extern_type* __to_end, extern_type*& __to_next) const
    379     {
    380       result __ret = codecvt_base::error;
    381       if (__state.good())
    382 	{
    383 	  const descriptor_type& __desc = __state.in_descriptor();
    384 	  const size_t __tmultiple = sizeof(intern_type);
    385 	  size_t __tlen = __tmultiple * (__to_end - __to);
    386 
    387 	  // Argument list for iconv specifies a byte sequence. Thus,
    388 	  // all to/from arrays must be brutally casted to char*.
    389 	  char* __cto = reinterpret_cast<char*>(__to);
    390 	  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
    391                                           &__cto, &__tlen);
    392 
    393 	  if (__conv != size_t(-1))
    394 	    {
    395 	      __to_next = reinterpret_cast<extern_type*>(__cto);
    396 	      if (__tlen == __tmultiple * (__to_end - __to))
    397 		__ret = codecvt_base::noconv;
    398 	      else if (__tlen == 0)
    399 		__ret = codecvt_base::ok;
    400 	      else
    401 		__ret = codecvt_base::partial;
    402 	    }
    403 	  else
    404 	    __ret = codecvt_base::error;
    405 	}
    406       return __ret;
    407     }
    408 
    409   template<typename _InternT, typename _ExternT>
    410     codecvt_base::result
    411     codecvt<_InternT, _ExternT, encoding_state>::
    412     do_in(state_type& __state, const extern_type* __from,
    413 	  const extern_type* __from_end, const extern_type*& __from_next,
    414 	  intern_type* __to, intern_type* __to_end,
    415 	  intern_type*& __to_next) const
    416     {
    417       result __ret = codecvt_base::error;
    418       if (__state.good())
    419 	{
    420 	  const descriptor_type& __desc = __state.in_descriptor();
    421 	  const size_t __fmultiple = sizeof(extern_type);
    422 	  size_t __flen = __fmultiple * (__from_end - __from);
    423 	  const size_t __tmultiple = sizeof(intern_type);
    424 	  size_t __tlen = __tmultiple * (__to_end - __to);
    425 
    426 	  // Argument list for iconv specifies a byte sequence. Thus,
    427 	  // all to/from arrays must be brutally casted to char*.
    428 	  char* __cto = reinterpret_cast<char*>(__to);
    429 	  char* __cfrom;
    430 	  size_t __conv;
    431 
    432 	  // Some encodings need a byte order marker as the first item
    433 	  // in the byte stream, to designate endian-ness. The default
    434 	  // value for the byte order marker is NULL, so if this is
    435 	  // the case, it's not necessary and we can just go on our
    436 	  // merry way.
    437 	  int __ext_bom = __state.external_bom();
    438 	  if (__ext_bom)
    439 	    {
    440 	      size_t __size = __from_end - __from;
    441 	      extern_type* __cfixed =  static_cast<extern_type*>
    442 		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
    443 	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
    444 	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
    445 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    446 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    447                                        &__flen, &__cto, &__tlen);
    448 	    }
    449 	  else
    450 	    {
    451 	      extern_type* __cfixed = const_cast<extern_type*>(__from);
    452 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    453 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    454                                        &__flen, &__cto, &__tlen);
    455 	    }
    456 
    457 
    458 	  if (__conv != size_t(-1))
    459 	    {
    460 	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
    461 	      __to_next = reinterpret_cast<intern_type*>(__cto);
    462 	      __ret = codecvt_base::ok;
    463 	    }
    464 	  else
    465 	    {
    466 	      if (__flen < static_cast<size_t>(__from_end - __from))
    467 		{
    468 		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
    469 		  __to_next = reinterpret_cast<intern_type*>(__cto);
    470 		  __ret = codecvt_base::partial;
    471 		}
    472 	      else
    473 		__ret = codecvt_base::error;
    474 	    }
    475 	}
    476       return __ret;
    477     }
    478 
    479   template<typename _InternT, typename _ExternT>
    480     int
    481     codecvt<_InternT, _ExternT, encoding_state>::
    482     do_encoding() const throw()
    483     {
    484       int __ret = 0;
    485       if (sizeof(_ExternT) <= sizeof(_InternT))
    486 	__ret = sizeof(_InternT) / sizeof(_ExternT);
    487       return __ret;
    488     }
    489 
    490   template<typename _InternT, typename _ExternT>
    491     bool
    492     codecvt<_InternT, _ExternT, encoding_state>::
    493     do_always_noconv() const throw()
    494     { return false; }
    495 
    496   template<typename _InternT, typename _ExternT>
    497     int
    498     codecvt<_InternT, _ExternT, encoding_state>::
    499     do_length(state_type&, const extern_type* __from,
    500 	      const extern_type* __end, size_t __max) const
    501     { return std::min(__max, static_cast<size_t>(__end - __from)); }
    502 
    503   // _GLIBCXX_RESOLVE_LIB_DEFECTS
    504   // 74.  Garbled text for codecvt::do_max_length
    505   template<typename _InternT, typename _ExternT>
    506     int
    507     codecvt<_InternT, _ExternT, encoding_state>::
    508     do_max_length() const throw()
    509     { return 1; }
    510 
    511 _GLIBCXX_END_NAMESPACE_VERSION
    512 } // namespace
    513 
    514 #endif
    515