Home | History | Annotate | Download | only in ext
      1 // Locale support (codecvt) -*- C++ -*-
      2 
      3 // Copyright (C) 2000-2013 Free Software Foundation, Inc.
      4 //
      5 // This file is part of the GNU ISO C++ Library.  This library is free
      6 // software; you can redistribute it and/or modify it under the
      7 // terms of the GNU General Public License as published by the
      8 // Free Software Foundation; either version 3, or (at your option)
      9 // any later version.
     10 
     11 // This library is distributed in the hope that it will be useful,
     12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 // GNU General Public License for more details.
     15 
     16 // Under Section 7 of GPL version 3, you are granted additional
     17 // permissions described in the GCC Runtime Library Exception, version
     18 // 3.1, as published by the Free Software Foundation.
     19 
     20 // You should have received a copy of the GNU General Public License and
     21 // a copy of the GCC Runtime Library Exception along with this program;
     22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23 // <http://www.gnu.org/licenses/>.
     24 
     25 //
     26 // ISO C++ 14882: 22.2.1.5 Template class codecvt
     27 //
     28 
     29 // Written by Benjamin Kosnik <bkoz (at) redhat.com>
     30 
     31 /** @file ext/codecvt_specializations.h
     32  *  This file is a GNU extension to the Standard C++ Library.
     33  */
     34 
     35 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
     36 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
     37 
     38 #include <bits/c++config.h>
     39 #include <locale>
     40 #include <iconv.h>
     41 
     42 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
     43 {
     44 _GLIBCXX_BEGIN_NAMESPACE_VERSION
     45 
     46   /// Extension to use iconv for dealing with character encodings.
     47   // This includes conversions and comparisons between various character
     48   // sets.  This object encapsulates data that may need to be shared between
     49   // char_traits, codecvt and ctype.
     50   class encoding_state
     51   {
     52   public:
     53     // Types:
     54     // NB: A conversion descriptor subsumes and enhances the
     55     // functionality of a simple state type such as mbstate_t.
     56     typedef iconv_t	descriptor_type;
     57 
     58   protected:
     59     // Name of internal character set encoding.
     60     std::string	       	_M_int_enc;
     61 
     62     // Name of external character set encoding.
     63     std::string  	_M_ext_enc;
     64 
     65     // Conversion descriptor between external encoding to internal encoding.
     66     descriptor_type	_M_in_desc;
     67 
     68     // Conversion descriptor between internal encoding to external encoding.
     69     descriptor_type	_M_out_desc;
     70 
     71     // The byte-order marker for the external encoding, if necessary.
     72     int			_M_ext_bom;
     73 
     74     // The byte-order marker for the internal encoding, if necessary.
     75     int			_M_int_bom;
     76 
     77     // Number of external bytes needed to construct one complete
     78     // character in the internal encoding.
     79     // NB: -1 indicates variable, or stateful, encodings.
     80     int 		_M_bytes;
     81 
     82   public:
     83     explicit
     84     encoding_state()
     85     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
     86     { }
     87 
     88     explicit
     89     encoding_state(const char* __int, const char* __ext,
     90 		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
     91     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
     92       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
     93     { init(); }
     94 
     95     // 21.1.2 traits typedefs
     96     // p4
     97     // typedef STATE_T state_type
     98     // requires: state_type shall meet the requirements of
     99     // CopyConstructible types (20.1.3)
    100     // NB: This does not preserve the actual state of the conversion
    101     // descriptor member, but it does duplicate the encoding
    102     // information.
    103     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
    104     { construct(__obj); }
    105 
    106     // Need assignment operator as well.
    107     encoding_state&
    108     operator=(const encoding_state& __obj)
    109     {
    110       construct(__obj);
    111       return *this;
    112     }
    113 
    114     ~encoding_state()
    115     { destroy(); }
    116 
    117     bool
    118     good() const throw()
    119     {
    120       const descriptor_type __err = (iconv_t)(-1);
    121       bool __test = _M_in_desc && _M_in_desc != __err;
    122       __test &=  _M_out_desc && _M_out_desc != __err;
    123       return __test;
    124     }
    125 
    126     int
    127     character_ratio() const
    128     { return _M_bytes; }
    129 
    130     const std::string
    131     internal_encoding() const
    132     { return _M_int_enc; }
    133 
    134     int
    135     internal_bom() const
    136     { return _M_int_bom; }
    137 
    138     const std::string
    139     external_encoding() const
    140     { return _M_ext_enc; }
    141 
    142     int
    143     external_bom() const
    144     { return _M_ext_bom; }
    145 
    146     const descriptor_type&
    147     in_descriptor() const
    148     { return _M_in_desc; }
    149 
    150     const descriptor_type&
    151     out_descriptor() const
    152     { return _M_out_desc; }
    153 
    154   protected:
    155     void
    156     init()
    157     {
    158       const descriptor_type __err = (iconv_t)(-1);
    159       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    160       if (!_M_in_desc && __have_encodings)
    161 	{
    162 	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
    163 	  if (_M_in_desc == __err)
    164 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
    165 				    "creating iconv input descriptor failed"));
    166 	}
    167       if (!_M_out_desc && __have_encodings)
    168 	{
    169 	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
    170 	  if (_M_out_desc == __err)
    171 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
    172 				  "creating iconv output descriptor failed"));
    173 	}
    174     }
    175 
    176     void
    177     construct(const encoding_state& __obj)
    178     {
    179       destroy();
    180       _M_int_enc = __obj._M_int_enc;
    181       _M_ext_enc = __obj._M_ext_enc;
    182       _M_ext_bom = __obj._M_ext_bom;
    183       _M_int_bom = __obj._M_int_bom;
    184       _M_bytes = __obj._M_bytes;
    185       init();
    186     }
    187 
    188     void
    189     destroy() throw()
    190     {
    191       const descriptor_type __err = (iconv_t)(-1);
    192       if (_M_in_desc && _M_in_desc != __err)
    193 	{
    194 	  iconv_close(_M_in_desc);
    195 	  _M_in_desc = 0;
    196 	}
    197       if (_M_out_desc && _M_out_desc != __err)
    198 	{
    199 	  iconv_close(_M_out_desc);
    200 	  _M_out_desc = 0;
    201 	}
    202     }
    203   };
    204 
    205   /// encoding_char_traits
    206   // Custom traits type with encoding_state for the state type, and the
    207   // associated fpos<encoding_state> for the position type, all other
    208   // bits equivalent to the required char_traits instantiations.
    209   template<typename _CharT>
    210     struct encoding_char_traits : public std::char_traits<_CharT>
    211     {
    212       typedef encoding_state				state_type;
    213       typedef typename std::fpos<state_type>		pos_type;
    214     };
    215 
    216 _GLIBCXX_END_NAMESPACE_VERSION
    217 } // namespace
    218 
    219 
    220 namespace std _GLIBCXX_VISIBILITY(default)
    221 {
    222 _GLIBCXX_BEGIN_NAMESPACE_VERSION
    223 
    224   using __gnu_cxx::encoding_state;
    225 
    226   /// codecvt<InternT, _ExternT, encoding_state> specialization.
    227   // This partial specialization takes advantage of iconv to provide
    228   // code conversions between a large number of character encodings.
    229   template<typename _InternT, typename _ExternT>
    230     class codecvt<_InternT, _ExternT, encoding_state>
    231     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
    232     {
    233     public:
    234       // Types:
    235       typedef codecvt_base::result			result;
    236       typedef _InternT 					intern_type;
    237       typedef _ExternT 					extern_type;
    238       typedef __gnu_cxx::encoding_state 		state_type;
    239       typedef state_type::descriptor_type 		descriptor_type;
    240 
    241       // Data Members:
    242       static locale::id 		id;
    243 
    244       explicit
    245       codecvt(size_t __refs = 0)
    246       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
    247       { }
    248 
    249       explicit
    250       codecvt(state_type& __enc, size_t __refs = 0)
    251       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
    252       { }
    253 
    254      protected:
    255       virtual
    256       ~codecvt() { }
    257 
    258       virtual result
    259       do_out(state_type& __state, const intern_type* __from,
    260 	     const intern_type* __from_end, const intern_type*& __from_next,
    261 	     extern_type* __to, extern_type* __to_end,
    262 	     extern_type*& __to_next) const;
    263 
    264       virtual result
    265       do_unshift(state_type& __state, extern_type* __to,
    266 		 extern_type* __to_end, extern_type*& __to_next) const;
    267 
    268       virtual result
    269       do_in(state_type& __state, const extern_type* __from,
    270 	    const extern_type* __from_end, const extern_type*& __from_next,
    271 	    intern_type* __to, intern_type* __to_end,
    272 	    intern_type*& __to_next) const;
    273 
    274       virtual int
    275       do_encoding() const throw();
    276 
    277       virtual bool
    278       do_always_noconv() const throw();
    279 
    280       virtual int
    281       do_length(state_type&, const extern_type* __from,
    282 		const extern_type* __end, size_t __max) const;
    283 
    284       virtual int
    285       do_max_length() const throw();
    286     };
    287 
    288   template<typename _InternT, typename _ExternT>
    289     locale::id
    290     codecvt<_InternT, _ExternT, encoding_state>::id;
    291 
    292   // This adaptor works around the signature problems of the second
    293   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
    294   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
    295   // Using this adaptor, g++ will do the work for us.
    296   template<typename _Tp>
    297     inline size_t
    298     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
    299                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
    300                     char** __outbuf, size_t* __outbytes)
    301     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
    302 
    303   template<typename _InternT, typename _ExternT>
    304     codecvt_base::result
    305     codecvt<_InternT, _ExternT, encoding_state>::
    306     do_out(state_type& __state, const intern_type* __from,
    307 	   const intern_type* __from_end, const intern_type*& __from_next,
    308 	   extern_type* __to, extern_type* __to_end,
    309 	   extern_type*& __to_next) const
    310     {
    311       result __ret = codecvt_base::error;
    312       if (__state.good())
    313 	{
    314 	  const descriptor_type& __desc = __state.out_descriptor();
    315 	  const size_t __fmultiple = sizeof(intern_type);
    316 	  size_t __fbytes = __fmultiple * (__from_end - __from);
    317 	  const size_t __tmultiple = sizeof(extern_type);
    318 	  size_t __tbytes = __tmultiple * (__to_end - __to);
    319 
    320 	  // Argument list for iconv specifies a byte sequence. Thus,
    321 	  // all to/from arrays must be brutally casted to char*.
    322 	  char* __cto = reinterpret_cast<char*>(__to);
    323 	  char* __cfrom;
    324 	  size_t __conv;
    325 
    326 	  // Some encodings need a byte order marker as the first item
    327 	  // in the byte stream, to designate endian-ness. The default
    328 	  // value for the byte order marker is NULL, so if this is
    329 	  // the case, it's not necessary and we can just go on our
    330 	  // merry way.
    331 	  int __int_bom = __state.internal_bom();
    332 	  if (__int_bom)
    333 	    {
    334 	      size_t __size = __from_end - __from;
    335 	      intern_type* __cfixed = static_cast<intern_type*>
    336 		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
    337 	      __cfixed[0] = static_cast<intern_type>(__int_bom);
    338 	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
    339 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    340 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    341                                         &__fbytes, &__cto, &__tbytes);
    342 	    }
    343 	  else
    344 	    {
    345 	      intern_type* __cfixed = const_cast<intern_type*>(__from);
    346 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    347 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
    348 				       &__cto, &__tbytes);
    349 	    }
    350 
    351 	  if (__conv != size_t(-1))
    352 	    {
    353 	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
    354 	      __to_next = reinterpret_cast<extern_type*>(__cto);
    355 	      __ret = codecvt_base::ok;
    356 	    }
    357 	  else
    358 	    {
    359 	      if (__fbytes < __fmultiple * (__from_end - __from))
    360 		{
    361 		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
    362 		  __to_next = reinterpret_cast<extern_type*>(__cto);
    363 		  __ret = codecvt_base::partial;
    364 		}
    365 	      else
    366 		__ret = codecvt_base::error;
    367 	    }
    368 	}
    369       return __ret;
    370     }
    371 
    372   template<typename _InternT, typename _ExternT>
    373     codecvt_base::result
    374     codecvt<_InternT, _ExternT, encoding_state>::
    375     do_unshift(state_type& __state, extern_type* __to,
    376 	       extern_type* __to_end, extern_type*& __to_next) const
    377     {
    378       result __ret = codecvt_base::error;
    379       if (__state.good())
    380 	{
    381 	  const descriptor_type& __desc = __state.in_descriptor();
    382 	  const size_t __tmultiple = sizeof(intern_type);
    383 	  size_t __tlen = __tmultiple * (__to_end - __to);
    384 
    385 	  // Argument list for iconv specifies a byte sequence. Thus,
    386 	  // all to/from arrays must be brutally casted to char*.
    387 	  char* __cto = reinterpret_cast<char*>(__to);
    388 	  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
    389                                           &__cto, &__tlen);
    390 
    391 	  if (__conv != size_t(-1))
    392 	    {
    393 	      __to_next = reinterpret_cast<extern_type*>(__cto);
    394 	      if (__tlen == __tmultiple * (__to_end - __to))
    395 		__ret = codecvt_base::noconv;
    396 	      else if (__tlen == 0)
    397 		__ret = codecvt_base::ok;
    398 	      else
    399 		__ret = codecvt_base::partial;
    400 	    }
    401 	  else
    402 	    __ret = codecvt_base::error;
    403 	}
    404       return __ret;
    405     }
    406 
    407   template<typename _InternT, typename _ExternT>
    408     codecvt_base::result
    409     codecvt<_InternT, _ExternT, encoding_state>::
    410     do_in(state_type& __state, const extern_type* __from,
    411 	  const extern_type* __from_end, const extern_type*& __from_next,
    412 	  intern_type* __to, intern_type* __to_end,
    413 	  intern_type*& __to_next) const
    414     {
    415       result __ret = codecvt_base::error;
    416       if (__state.good())
    417 	{
    418 	  const descriptor_type& __desc = __state.in_descriptor();
    419 	  const size_t __fmultiple = sizeof(extern_type);
    420 	  size_t __flen = __fmultiple * (__from_end - __from);
    421 	  const size_t __tmultiple = sizeof(intern_type);
    422 	  size_t __tlen = __tmultiple * (__to_end - __to);
    423 
    424 	  // Argument list for iconv specifies a byte sequence. Thus,
    425 	  // all to/from arrays must be brutally casted to char*.
    426 	  char* __cto = reinterpret_cast<char*>(__to);
    427 	  char* __cfrom;
    428 	  size_t __conv;
    429 
    430 	  // Some encodings need a byte order marker as the first item
    431 	  // in the byte stream, to designate endian-ness. The default
    432 	  // value for the byte order marker is NULL, so if this is
    433 	  // the case, it's not necessary and we can just go on our
    434 	  // merry way.
    435 	  int __ext_bom = __state.external_bom();
    436 	  if (__ext_bom)
    437 	    {
    438 	      size_t __size = __from_end - __from;
    439 	      extern_type* __cfixed =  static_cast<extern_type*>
    440 		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
    441 	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
    442 	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
    443 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    444 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    445                                        &__flen, &__cto, &__tlen);
    446 	    }
    447 	  else
    448 	    {
    449 	      extern_type* __cfixed = const_cast<extern_type*>(__from);
    450 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    451 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    452                                        &__flen, &__cto, &__tlen);
    453 	    }
    454 
    455 
    456 	  if (__conv != size_t(-1))
    457 	    {
    458 	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
    459 	      __to_next = reinterpret_cast<intern_type*>(__cto);
    460 	      __ret = codecvt_base::ok;
    461 	    }
    462 	  else
    463 	    {
    464 	      if (__flen < static_cast<size_t>(__from_end - __from))
    465 		{
    466 		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
    467 		  __to_next = reinterpret_cast<intern_type*>(__cto);
    468 		  __ret = codecvt_base::partial;
    469 		}
    470 	      else
    471 		__ret = codecvt_base::error;
    472 	    }
    473 	}
    474       return __ret;
    475     }
    476 
    477   template<typename _InternT, typename _ExternT>
    478     int
    479     codecvt<_InternT, _ExternT, encoding_state>::
    480     do_encoding() const throw()
    481     {
    482       int __ret = 0;
    483       if (sizeof(_ExternT) <= sizeof(_InternT))
    484 	__ret = sizeof(_InternT) / sizeof(_ExternT);
    485       return __ret;
    486     }
    487 
    488   template<typename _InternT, typename _ExternT>
    489     bool
    490     codecvt<_InternT, _ExternT, encoding_state>::
    491     do_always_noconv() const throw()
    492     { return false; }
    493 
    494   template<typename _InternT, typename _ExternT>
    495     int
    496     codecvt<_InternT, _ExternT, encoding_state>::
    497     do_length(state_type&, const extern_type* __from,
    498 	      const extern_type* __end, size_t __max) const
    499     { return std::min(__max, static_cast<size_t>(__end - __from)); }
    500 
    501   // _GLIBCXX_RESOLVE_LIB_DEFECTS
    502   // 74.  Garbled text for codecvt::do_max_length
    503   template<typename _InternT, typename _ExternT>
    504     int
    505     codecvt<_InternT, _ExternT, encoding_state>::
    506     do_max_length() const throw()
    507     { return 1; }
    508 
    509 _GLIBCXX_END_NAMESPACE_VERSION
    510 } // namespace
    511 
    512 #endif
    513