1 /* 2 * Summary: interface for the encoding conversion functions 3 * Description: interface for the encoding conversion functions needed for 4 * XML basic encoding and iconv() support. 5 * 6 * Related specs are 7 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies 8 * [ISO-10646] UTF-8 and UTF-16 in Annexes 9 * [ISO-8859-1] ISO Latin-1 characters codes. 10 * [UNICODE] The Unicode Consortium, "The Unicode Standard -- 11 * Worldwide Character Encoding -- Version 1.0", Addison- 12 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is 13 * described in Unicode Technical Report #4. 14 * [US-ASCII] Coded Character Set--7-bit American Standard Code for 15 * Information Interchange, ANSI X3.4-1986. 16 * 17 * Copy: See Copyright for the status of this software. 18 * 19 * Author: Daniel Veillard 20 */ 21 22 #ifndef __XML_CHAR_ENCODING_H__ 23 #define __XML_CHAR_ENCODING_H__ 24 25 #include <libxml/xmlversion.h> 26 27 #ifdef LIBXML_ICONV_ENABLED 28 #include <iconv.h> 29 #else 30 #ifdef LIBXML_ICU_ENABLED 31 #include <unicode/ucnv.h> 32 #if 0 33 /* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> 34 * to prevent unwanted ICU symbols being exposed to users of libxml2. 35 * One particular case is Qt4 conflicting on UChar32. 36 */ 37 #include <stdint.h> 38 struct UConverter; 39 typedef struct UConverter UConverter; 40 #ifdef _MSC_VER 41 typedef wchar_t UChar; 42 #else 43 typedef uint16_t UChar; 44 #endif 45 #endif 46 #endif 47 #endif 48 #ifdef __cplusplus 49 extern "C" { 50 #endif 51 52 /* 53 * xmlCharEncoding: 54 * 55 * Predefined values for some standard encodings. 56 * Libxml does not do beforehand translation on UTF8 and ISOLatinX. 57 * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default. 58 * 59 * Anything else would have to be translated to UTF8 before being 60 * given to the parser itself. The BOM for UTF16 and the encoding 61 * declaration are looked at and a converter is looked for at that 62 * point. If not found the parser stops here as asked by the XML REC. A 63 * converter can be registered by the user using xmlRegisterCharEncodingHandler 64 * but the current form doesn't allow stateful transcoding (a serious 65 * problem agreed !). If iconv has been found it will be used 66 * automatically and allow stateful transcoding, the simplest is then 67 * to be sure to enable iconv and to provide iconv libs for the encoding 68 * support needed. 69 * 70 * Note that the generic "UTF-16" is not a predefined value. Instead, only 71 * the specific UTF-16LE and UTF-16BE are present. 72 */ 73 typedef enum { 74 XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ 75 XML_CHAR_ENCODING_NONE= 0, /* No char encoding detected */ 76 XML_CHAR_ENCODING_UTF8= 1, /* UTF-8 */ 77 XML_CHAR_ENCODING_UTF16LE= 2, /* UTF-16 little endian */ 78 XML_CHAR_ENCODING_UTF16BE= 3, /* UTF-16 big endian */ 79 XML_CHAR_ENCODING_UCS4LE= 4, /* UCS-4 little endian */ 80 XML_CHAR_ENCODING_UCS4BE= 5, /* UCS-4 big endian */ 81 XML_CHAR_ENCODING_EBCDIC= 6, /* EBCDIC uh! */ 82 XML_CHAR_ENCODING_UCS4_2143=7, /* UCS-4 unusual ordering */ 83 XML_CHAR_ENCODING_UCS4_3412=8, /* UCS-4 unusual ordering */ 84 XML_CHAR_ENCODING_UCS2= 9, /* UCS-2 */ 85 XML_CHAR_ENCODING_8859_1= 10,/* ISO-8859-1 ISO Latin 1 */ 86 XML_CHAR_ENCODING_8859_2= 11,/* ISO-8859-2 ISO Latin 2 */ 87 XML_CHAR_ENCODING_8859_3= 12,/* ISO-8859-3 */ 88 XML_CHAR_ENCODING_8859_4= 13,/* ISO-8859-4 */ 89 XML_CHAR_ENCODING_8859_5= 14,/* ISO-8859-5 */ 90 XML_CHAR_ENCODING_8859_6= 15,/* ISO-8859-6 */ 91 XML_CHAR_ENCODING_8859_7= 16,/* ISO-8859-7 */ 92 XML_CHAR_ENCODING_8859_8= 17,/* ISO-8859-8 */ 93 XML_CHAR_ENCODING_8859_9= 18,/* ISO-8859-9 */ 94 XML_CHAR_ENCODING_2022_JP= 19,/* ISO-2022-JP */ 95 XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */ 96 XML_CHAR_ENCODING_EUC_JP= 21,/* EUC-JP */ 97 XML_CHAR_ENCODING_ASCII= 22 /* pure ASCII */ 98 } xmlCharEncoding; 99 100 /** 101 * xmlCharEncodingInputFunc: 102 * @out: a pointer to an array of bytes to store the UTF-8 result 103 * @outlen: the length of @out 104 * @in: a pointer to an array of chars in the original encoding 105 * @inlen: the length of @in 106 * 107 * Take a block of chars in the original encoding and try to convert 108 * it to an UTF-8 block of chars out. 109 * 110 * Returns the number of bytes written, -1 if lack of space, or -2 111 * if the transcoding failed. 112 * The value of @inlen after return is the number of octets consumed 113 * if the return value is positive, else unpredictiable. 114 * The value of @outlen after return is the number of octets consumed. 115 */ 116 typedef int (* xmlCharEncodingInputFunc)(unsigned char *out, int *outlen, 117 const unsigned char *in, int *inlen); 118 119 120 /** 121 * xmlCharEncodingOutputFunc: 122 * @out: a pointer to an array of bytes to store the result 123 * @outlen: the length of @out 124 * @in: a pointer to an array of UTF-8 chars 125 * @inlen: the length of @in 126 * 127 * Take a block of UTF-8 chars in and try to convert it to another 128 * encoding. 129 * Note: a first call designed to produce heading info is called with 130 * in = NULL. If stateful this should also initialize the encoder state. 131 * 132 * Returns the number of bytes written, -1 if lack of space, or -2 133 * if the transcoding failed. 134 * The value of @inlen after return is the number of octets consumed 135 * if the return value is positive, else unpredictiable. 136 * The value of @outlen after return is the number of octets produced. 137 */ 138 typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, 139 const unsigned char *in, int *inlen); 140 141 142 /* 143 * Block defining the handlers for non UTF-8 encodings. 144 * If iconv is supported, there are two extra fields. 145 */ 146 #ifdef LIBXML_ICU_ENABLED 147 struct _uconv_t { 148 UConverter *uconv; /* for conversion between an encoding and UTF-16 */ 149 UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ 150 }; 151 typedef struct _uconv_t uconv_t; 152 #endif 153 154 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; 155 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; 156 struct _xmlCharEncodingHandler { 157 char *name; 158 xmlCharEncodingInputFunc input; 159 xmlCharEncodingOutputFunc output; 160 #ifdef LIBXML_ICONV_ENABLED 161 iconv_t iconv_in; 162 iconv_t iconv_out; 163 #endif /* LIBXML_ICONV_ENABLED */ 164 #ifdef LIBXML_ICU_ENABLED 165 uconv_t *uconv_in; 166 uconv_t *uconv_out; 167 #endif /* LIBXML_ICU_ENABLED */ 168 }; 169 170 #ifdef __cplusplus 171 } 172 #endif 173 #include <libxml/tree.h> 174 #ifdef __cplusplus 175 extern "C" { 176 #endif 177 178 /* 179 * Interfaces for encoding handlers. 180 */ 181 XMLPUBFUN void XMLCALL 182 xmlInitCharEncodingHandlers (void); 183 XMLPUBFUN void XMLCALL 184 xmlCleanupCharEncodingHandlers (void); 185 XMLPUBFUN void XMLCALL 186 xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); 187 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 188 xmlGetCharEncodingHandler (xmlCharEncoding enc); 189 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 190 xmlFindCharEncodingHandler (const char *name); 191 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 192 xmlNewCharEncodingHandler (const char *name, 193 xmlCharEncodingInputFunc input, 194 xmlCharEncodingOutputFunc output); 195 196 /* 197 * Interfaces for encoding names and aliases. 198 */ 199 XMLPUBFUN int XMLCALL 200 xmlAddEncodingAlias (const char *name, 201 const char *alias); 202 XMLPUBFUN int XMLCALL 203 xmlDelEncodingAlias (const char *alias); 204 XMLPUBFUN const char * XMLCALL 205 xmlGetEncodingAlias (const char *alias); 206 XMLPUBFUN void XMLCALL 207 xmlCleanupEncodingAliases (void); 208 XMLPUBFUN xmlCharEncoding XMLCALL 209 xmlParseCharEncoding (const char *name); 210 XMLPUBFUN const char * XMLCALL 211 xmlGetCharEncodingName (xmlCharEncoding enc); 212 213 /* 214 * Interfaces directly used by the parsers. 215 */ 216 XMLPUBFUN xmlCharEncoding XMLCALL 217 xmlDetectCharEncoding (const unsigned char *in, 218 int len); 219 220 XMLPUBFUN int XMLCALL 221 xmlCharEncOutFunc (xmlCharEncodingHandler *handler, 222 xmlBufferPtr out, 223 xmlBufferPtr in); 224 225 XMLPUBFUN int XMLCALL 226 xmlCharEncInFunc (xmlCharEncodingHandler *handler, 227 xmlBufferPtr out, 228 xmlBufferPtr in); 229 XMLPUBFUN int XMLCALL 230 xmlCharEncFirstLine (xmlCharEncodingHandler *handler, 231 xmlBufferPtr out, 232 xmlBufferPtr in); 233 XMLPUBFUN int XMLCALL 234 xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); 235 236 /* 237 * Export a few useful functions 238 */ 239 #ifdef LIBXML_OUTPUT_ENABLED 240 XMLPUBFUN int XMLCALL 241 UTF8Toisolat1 (unsigned char *out, 242 int *outlen, 243 const unsigned char *in, 244 int *inlen); 245 #endif /* LIBXML_OUTPUT_ENABLED */ 246 XMLPUBFUN int XMLCALL 247 isolat1ToUTF8 (unsigned char *out, 248 int *outlen, 249 const unsigned char *in, 250 int *inlen); 251 #ifdef __cplusplus 252 } 253 #endif 254 255 #endif /* __XML_CHAR_ENCODING_H__ */ 256