Home | History | Annotate | Download | only in ext
      1 // Locale support (codecvt) -*- C++ -*-
      2 
      3 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
      4 //  Free Software Foundation, Inc.
      5 //
      6 // This file is part of the GNU ISO C++ Library.  This library is free
      7 // software; you can redistribute it and/or modify it under the
      8 // terms of the GNU General Public License as published by the
      9 // Free Software Foundation; either version 3, or (at your option)
     10 // any later version.
     11 
     12 // This library is distributed in the hope that it will be useful,
     13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 // GNU General Public License for more details.
     16 
     17 // Under Section 7 of GPL version 3, you are granted additional
     18 // permissions described in the GCC Runtime Library Exception, version
     19 // 3.1, as published by the Free Software Foundation.
     20 
     21 // You should have received a copy of the GNU General Public License and
     22 // a copy of the GCC Runtime Library Exception along with this program;
     23 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     24 // <http://www.gnu.org/licenses/>.
     25 
     26 //
     27 // ISO C++ 14882: 22.2.1.5 Template class codecvt
     28 //
     29 
     30 // Written by Benjamin Kosnik <bkoz (at) redhat.com>
     31 
     32 /** @file ext/codecvt_specializations.h
     33  *  This file is a GNU extension to the Standard C++ Library.
     34  */
     35 
     36 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
     37 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
     38 
     39 #include <bits/c++config.h>
     40 #include <locale>
     41 #include <iconv.h>
     42 
     43 _GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx)
     44 
     45   /// Extension to use iconv for dealing with character encodings.
     46   // This includes conversions and comparisons between various character
     47   // sets.  This object encapsulates data that may need to be shared between
     48   // char_traits, codecvt and ctype.
     49   class encoding_state
     50   {
     51   public:
     52     // Types:
     53     // NB: A conversion descriptor subsumes and enhances the
     54     // functionality of a simple state type such as mbstate_t.
     55     typedef iconv_t	descriptor_type;
     56 
     57   protected:
     58     // Name of internal character set encoding.
     59     std::string	       	_M_int_enc;
     60 
     61     // Name of external character set encoding.
     62     std::string  	_M_ext_enc;
     63 
     64     // Conversion descriptor between external encoding to internal encoding.
     65     descriptor_type	_M_in_desc;
     66 
     67     // Conversion descriptor between internal encoding to external encoding.
     68     descriptor_type	_M_out_desc;
     69 
     70     // The byte-order marker for the external encoding, if necessary.
     71     int			_M_ext_bom;
     72 
     73     // The byte-order marker for the internal encoding, if necessary.
     74     int			_M_int_bom;
     75 
     76     // Number of external bytes needed to construct one complete
     77     // character in the internal encoding.
     78     // NB: -1 indicates variable, or stateful, encodings.
     79     int 		_M_bytes;
     80 
     81   public:
     82     explicit
     83     encoding_state()
     84     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
     85     { }
     86 
     87     explicit
     88     encoding_state(const char* __int, const char* __ext,
     89 		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
     90     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
     91       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
     92     { init(); }
     93 
     94     // 21.1.2 traits typedefs
     95     // p4
     96     // typedef STATE_T state_type
     97     // requires: state_type shall meet the requirements of
     98     // CopyConstructible types (20.1.3)
     99     // NB: This does not preserve the actual state of the conversion
    100     // descriptor member, but it does duplicate the encoding
    101     // information.
    102     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
    103     { construct(__obj); }
    104 
    105     // Need assignment operator as well.
    106     encoding_state&
    107     operator=(const encoding_state& __obj)
    108     {
    109       construct(__obj);
    110       return *this;
    111     }
    112 
    113     ~encoding_state()
    114     { destroy(); }
    115 
    116     bool
    117     good() const throw()
    118     {
    119       const descriptor_type __err = (iconv_t)(-1);
    120       bool __test = _M_in_desc && _M_in_desc != __err;
    121       __test &=  _M_out_desc && _M_out_desc != __err;
    122       return __test;
    123     }
    124 
    125     int
    126     character_ratio() const
    127     { return _M_bytes; }
    128 
    129     const std::string
    130     internal_encoding() const
    131     { return _M_int_enc; }
    132 
    133     int
    134     internal_bom() const
    135     { return _M_int_bom; }
    136 
    137     const std::string
    138     external_encoding() const
    139     { return _M_ext_enc; }
    140 
    141     int
    142     external_bom() const
    143     { return _M_ext_bom; }
    144 
    145     const descriptor_type&
    146     in_descriptor() const
    147     { return _M_in_desc; }
    148 
    149     const descriptor_type&
    150     out_descriptor() const
    151     { return _M_out_desc; }
    152 
    153   protected:
    154     void
    155     init()
    156     {
    157       const descriptor_type __err = (iconv_t)(-1);
    158       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    159       if (!_M_in_desc && __have_encodings)
    160 	{
    161 	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
    162 	  if (_M_in_desc == __err)
    163 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
    164 				    "creating iconv input descriptor failed"));
    165 	}
    166       if (!_M_out_desc && __have_encodings)
    167 	{
    168 	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
    169 	  if (_M_out_desc == __err)
    170 	    std::__throw_runtime_error(__N("encoding_state::_M_init "
    171 				  "creating iconv output descriptor failed"));
    172 	}
    173     }
    174 
    175     void
    176     construct(const encoding_state& __obj)
    177     {
    178       destroy();
    179       _M_int_enc = __obj._M_int_enc;
    180       _M_ext_enc = __obj._M_ext_enc;
    181       _M_ext_bom = __obj._M_ext_bom;
    182       _M_int_bom = __obj._M_int_bom;
    183       _M_bytes = __obj._M_bytes;
    184       init();
    185     }
    186 
    187     void
    188     destroy() throw()
    189     {
    190       const descriptor_type __err = (iconv_t)(-1);
    191       if (_M_in_desc && _M_in_desc != __err)
    192 	{
    193 	  iconv_close(_M_in_desc);
    194 	  _M_in_desc = 0;
    195 	}
    196       if (_M_out_desc && _M_out_desc != __err)
    197 	{
    198 	  iconv_close(_M_out_desc);
    199 	  _M_out_desc = 0;
    200 	}
    201     }
    202   };
    203 
    204   /// encoding_char_traits
    205   // Custom traits type with encoding_state for the state type, and the
    206   // associated fpos<encoding_state> for the position type, all other
    207   // bits equivalent to the required char_traits instantiations.
    208   template<typename _CharT>
    209     struct encoding_char_traits : public std::char_traits<_CharT>
    210     {
    211       typedef encoding_state				state_type;
    212       typedef typename std::fpos<state_type>		pos_type;
    213     };
    214 
    215 _GLIBCXX_END_NAMESPACE
    216 
    217 
    218 _GLIBCXX_BEGIN_NAMESPACE(std)
    219 
    220   using __gnu_cxx::encoding_state;
    221 
    222   /// codecvt<InternT, _ExternT, encoding_state> specialization.
    223   // This partial specialization takes advantage of iconv to provide
    224   // code conversions between a large number of character encodings.
    225   template<typename _InternT, typename _ExternT>
    226     class codecvt<_InternT, _ExternT, encoding_state>
    227     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
    228     {
    229     public:
    230       // Types:
    231       typedef codecvt_base::result			result;
    232       typedef _InternT 					intern_type;
    233       typedef _ExternT 					extern_type;
    234       typedef __gnu_cxx::encoding_state 		state_type;
    235       typedef state_type::descriptor_type 		descriptor_type;
    236 
    237       // Data Members:
    238       static locale::id 		id;
    239 
    240       explicit
    241       codecvt(size_t __refs = 0)
    242       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
    243       { }
    244 
    245       explicit
    246       codecvt(state_type& __enc, size_t __refs = 0)
    247       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
    248       { }
    249 
    250      protected:
    251       virtual
    252       ~codecvt() { }
    253 
    254       virtual result
    255       do_out(state_type& __state, const intern_type* __from,
    256 	     const intern_type* __from_end, const intern_type*& __from_next,
    257 	     extern_type* __to, extern_type* __to_end,
    258 	     extern_type*& __to_next) const;
    259 
    260       virtual result
    261       do_unshift(state_type& __state, extern_type* __to,
    262 		 extern_type* __to_end, extern_type*& __to_next) const;
    263 
    264       virtual result
    265       do_in(state_type& __state, const extern_type* __from,
    266 	    const extern_type* __from_end, const extern_type*& __from_next,
    267 	    intern_type* __to, intern_type* __to_end,
    268 	    intern_type*& __to_next) const;
    269 
    270       virtual int
    271       do_encoding() const throw();
    272 
    273       virtual bool
    274       do_always_noconv() const throw();
    275 
    276       virtual int
    277       do_length(state_type&, const extern_type* __from,
    278 		const extern_type* __end, size_t __max) const;
    279 
    280       virtual int
    281       do_max_length() const throw();
    282     };
    283 
    284   template<typename _InternT, typename _ExternT>
    285     locale::id
    286     codecvt<_InternT, _ExternT, encoding_state>::id;
    287 
    288   // This adaptor works around the signature problems of the second
    289   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
    290   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
    291   // Using this adaptor, g++ will do the work for us.
    292   template<typename _Tp>
    293     inline size_t
    294     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
    295                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
    296                     char** __outbuf, size_t* __outbytes)
    297     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
    298 
    299   template<typename _InternT, typename _ExternT>
    300     codecvt_base::result
    301     codecvt<_InternT, _ExternT, encoding_state>::
    302     do_out(state_type& __state, const intern_type* __from,
    303 	   const intern_type* __from_end, const intern_type*& __from_next,
    304 	   extern_type* __to, extern_type* __to_end,
    305 	   extern_type*& __to_next) const
    306     {
    307       result __ret = codecvt_base::error;
    308       if (__state.good())
    309 	{
    310 	  const descriptor_type& __desc = __state.out_descriptor();
    311 	  const size_t __fmultiple = sizeof(intern_type);
    312 	  size_t __fbytes = __fmultiple * (__from_end - __from);
    313 	  const size_t __tmultiple = sizeof(extern_type);
    314 	  size_t __tbytes = __tmultiple * (__to_end - __to);
    315 
    316 	  // Argument list for iconv specifies a byte sequence. Thus,
    317 	  // all to/from arrays must be brutally casted to char*.
    318 	  char* __cto = reinterpret_cast<char*>(__to);
    319 	  char* __cfrom;
    320 	  size_t __conv;
    321 
    322 	  // Some encodings need a byte order marker as the first item
    323 	  // in the byte stream, to designate endian-ness. The default
    324 	  // value for the byte order marker is NULL, so if this is
    325 	  // the case, it's not necessary and we can just go on our
    326 	  // merry way.
    327 	  int __int_bom = __state.internal_bom();
    328 	  if (__int_bom)
    329 	    {
    330 	      size_t __size = __from_end - __from;
    331 	      intern_type* __cfixed = static_cast<intern_type*>
    332 		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
    333 	      __cfixed[0] = static_cast<intern_type>(__int_bom);
    334 	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
    335 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    336 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    337                                         &__fbytes, &__cto, &__tbytes);
    338 	    }
    339 	  else
    340 	    {
    341 	      intern_type* __cfixed = const_cast<intern_type*>(__from);
    342 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    343 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
    344 				       &__cto, &__tbytes);
    345 	    }
    346 
    347 	  if (__conv != size_t(-1))
    348 	    {
    349 	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
    350 	      __to_next = reinterpret_cast<extern_type*>(__cto);
    351 	      __ret = codecvt_base::ok;
    352 	    }
    353 	  else
    354 	    {
    355 	      if (__fbytes < __fmultiple * (__from_end - __from))
    356 		{
    357 		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
    358 		  __to_next = reinterpret_cast<extern_type*>(__cto);
    359 		  __ret = codecvt_base::partial;
    360 		}
    361 	      else
    362 		__ret = codecvt_base::error;
    363 	    }
    364 	}
    365       return __ret;
    366     }
    367 
    368   template<typename _InternT, typename _ExternT>
    369     codecvt_base::result
    370     codecvt<_InternT, _ExternT, encoding_state>::
    371     do_unshift(state_type& __state, extern_type* __to,
    372 	       extern_type* __to_end, extern_type*& __to_next) const
    373     {
    374       result __ret = codecvt_base::error;
    375       if (__state.good())
    376 	{
    377 	  const descriptor_type& __desc = __state.in_descriptor();
    378 	  const size_t __tmultiple = sizeof(intern_type);
    379 	  size_t __tlen = __tmultiple * (__to_end - __to);
    380 
    381 	  // Argument list for iconv specifies a byte sequence. Thus,
    382 	  // all to/from arrays must be brutally casted to char*.
    383 	  char* __cto = reinterpret_cast<char*>(__to);
    384 	  size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL,
    385                                           &__cto, &__tlen);
    386 
    387 	  if (__conv != size_t(-1))
    388 	    {
    389 	      __to_next = reinterpret_cast<extern_type*>(__cto);
    390 	      if (__tlen == __tmultiple * (__to_end - __to))
    391 		__ret = codecvt_base::noconv;
    392 	      else if (__tlen == 0)
    393 		__ret = codecvt_base::ok;
    394 	      else
    395 		__ret = codecvt_base::partial;
    396 	    }
    397 	  else
    398 	    __ret = codecvt_base::error;
    399 	}
    400       return __ret;
    401     }
    402 
    403   template<typename _InternT, typename _ExternT>
    404     codecvt_base::result
    405     codecvt<_InternT, _ExternT, encoding_state>::
    406     do_in(state_type& __state, const extern_type* __from,
    407 	  const extern_type* __from_end, const extern_type*& __from_next,
    408 	  intern_type* __to, intern_type* __to_end,
    409 	  intern_type*& __to_next) const
    410     {
    411       result __ret = codecvt_base::error;
    412       if (__state.good())
    413 	{
    414 	  const descriptor_type& __desc = __state.in_descriptor();
    415 	  const size_t __fmultiple = sizeof(extern_type);
    416 	  size_t __flen = __fmultiple * (__from_end - __from);
    417 	  const size_t __tmultiple = sizeof(intern_type);
    418 	  size_t __tlen = __tmultiple * (__to_end - __to);
    419 
    420 	  // Argument list for iconv specifies a byte sequence. Thus,
    421 	  // all to/from arrays must be brutally casted to char*.
    422 	  char* __cto = reinterpret_cast<char*>(__to);
    423 	  char* __cfrom;
    424 	  size_t __conv;
    425 
    426 	  // Some encodings need a byte order marker as the first item
    427 	  // in the byte stream, to designate endian-ness. The default
    428 	  // value for the byte order marker is NULL, so if this is
    429 	  // the case, it's not necessary and we can just go on our
    430 	  // merry way.
    431 	  int __ext_bom = __state.external_bom();
    432 	  if (__ext_bom)
    433 	    {
    434 	      size_t __size = __from_end - __from;
    435 	      extern_type* __cfixed =  static_cast<extern_type*>
    436 		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
    437 	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
    438 	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
    439 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    440 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    441                                        &__flen, &__cto, &__tlen);
    442 	    }
    443 	  else
    444 	    {
    445 	      extern_type* __cfixed = const_cast<extern_type*>(__from);
    446 	      __cfrom = reinterpret_cast<char*>(__cfixed);
    447 	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
    448                                        &__flen, &__cto, &__tlen);
    449 	    }
    450 
    451 
    452 	  if (__conv != size_t(-1))
    453 	    {
    454 	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
    455 	      __to_next = reinterpret_cast<intern_type*>(__cto);
    456 	      __ret = codecvt_base::ok;
    457 	    }
    458 	  else
    459 	    {
    460 	      if (__flen < static_cast<size_t>(__from_end - __from))
    461 		{
    462 		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
    463 		  __to_next = reinterpret_cast<intern_type*>(__cto);
    464 		  __ret = codecvt_base::partial;
    465 		}
    466 	      else
    467 		__ret = codecvt_base::error;
    468 	    }
    469 	}
    470       return __ret;
    471     }
    472 
    473   template<typename _InternT, typename _ExternT>
    474     int
    475     codecvt<_InternT, _ExternT, encoding_state>::
    476     do_encoding() const throw()
    477     {
    478       int __ret = 0;
    479       if (sizeof(_ExternT) <= sizeof(_InternT))
    480 	__ret = sizeof(_InternT) / sizeof(_ExternT);
    481       return __ret;
    482     }
    483 
    484   template<typename _InternT, typename _ExternT>
    485     bool
    486     codecvt<_InternT, _ExternT, encoding_state>::
    487     do_always_noconv() const throw()
    488     { return false; }
    489 
    490   template<typename _InternT, typename _ExternT>
    491     int
    492     codecvt<_InternT, _ExternT, encoding_state>::
    493     do_length(state_type&, const extern_type* __from,
    494 	      const extern_type* __end, size_t __max) const
    495     { return std::min(__max, static_cast<size_t>(__end - __from)); }
    496 
    497   // _GLIBCXX_RESOLVE_LIB_DEFECTS
    498   // 74.  Garbled text for codecvt::do_max_length
    499   template<typename _InternT, typename _ExternT>
    500     int
    501     codecvt<_InternT, _ExternT, encoding_state>::
    502     do_max_length() const throw()
    503     { return 1; }
    504 
    505 _GLIBCXX_END_NAMESPACE
    506 
    507 #endif
    508