1 // UTFConvert.cpp 2 3 #include "StdAfx.h" 4 5 #include "MyTypes.h" 6 #include "UTFConvert.h" 7 8 #ifdef _WIN32 9 #define _WCHART_IS_16BIT 1 10 #endif 11 12 /* 13 _UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte 14 15 n : _UTF8_START(n) : Bits of code point 16 17 0 : 0x80 : : unused 18 1 : 0xC0 : 11 : 19 2 : 0xE0 : 16 : Basic Multilingual Plane 20 3 : 0xF0 : 21 : Unicode space 21 3 : 0xF8 : 26 : 22 5 : 0xFC : 31 : UCS-4 23 6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value 24 7 : 0xFF : 25 */ 26 27 #define _UTF8_START(n) (0x100 - (1 << (7 - (n)))) 28 29 #define _UTF8_HEAD_PARSE2(n) if (c < _UTF8_START((n) + 1)) { numBytes = (n); c -= _UTF8_START(n); } 30 31 #define _UTF8_HEAD_PARSE \ 32 _UTF8_HEAD_PARSE2(1) \ 33 else _UTF8_HEAD_PARSE2(2) \ 34 else _UTF8_HEAD_PARSE2(3) \ 35 else _UTF8_HEAD_PARSE2(4) \ 36 else _UTF8_HEAD_PARSE2(5) \ 37 38 // else _UTF8_HEAD_PARSE2(6) 39 40 bool CheckUTF8(const char *src, bool allowReduced) throw() 41 { 42 for (;;) 43 { 44 Byte c = *src++; 45 if (c == 0) 46 return true; 47 48 if (c < 0x80) 49 continue; 50 if (c < 0xC0) // (c < 0xC0 + 2) // if we support only optimal encoding chars 51 return false; 52 53 unsigned numBytes; 54 _UTF8_HEAD_PARSE 55 else 56 return false; 57 58 UInt32 val = c; 59 60 do 61 { 62 Byte c2 = *src++; 63 if (c2 < 0x80 || c2 >= 0xC0) 64 return allowReduced && c2 == 0; 65 val <<= 6; 66 val |= (c2 - 0x80); 67 } 68 while (--numBytes); 69 70 if (val >= 0x110000) 71 return false; 72 } 73 } 74 75 76 #define _ERROR_UTF8 \ 77 { if (dest) dest[destPos] = (wchar_t)0xFFFD; destPos++; ok = false; continue; } 78 79 static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw() 80 { 81 size_t destPos = 0; 82 bool ok = true; 83 84 for (;;) 85 { 86 Byte c; 87 if (src == srcLim) 88 { 89 *destLen = destPos; 90 return ok; 91 } 92 c = *src++; 93 94 if (c < 0x80) 95 { 96 if (dest) 97 dest[destPos] = (wchar_t)c; 98 destPos++; 99 continue; 100 } 101 if (c < 0xC0) 102 _ERROR_UTF8 103 104 unsigned numBytes; 105 _UTF8_HEAD_PARSE 106 else 107 _ERROR_UTF8 108 109 UInt32 val = c; 110 111 do 112 { 113 Byte c2; 114 if (src == srcLim) 115 break; 116 c2 = *src; 117 if (c2 < 0x80 || c2 >= 0xC0) 118 break; 119 src++; 120 val <<= 6; 121 val |= (c2 - 0x80); 122 } 123 while (--numBytes); 124 125 if (numBytes != 0) 126 _ERROR_UTF8 127 128 if (val < 0x10000) 129 { 130 if (dest) 131 dest[destPos] = (wchar_t)val; 132 destPos++; 133 } 134 else 135 { 136 val -= 0x10000; 137 if (val >= 0x100000) 138 _ERROR_UTF8 139 if (dest) 140 { 141 dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10)); 142 dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF)); 143 } 144 destPos += 2; 145 } 146 } 147 } 148 149 #define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6)) 150 151 #define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + (val >> (6 * (n))))) 152 #define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F))) 153 154 static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim) 155 { 156 size_t size = srcLim - src; 157 for (;;) 158 { 159 if (src == srcLim) 160 return size; 161 162 UInt32 val = *src++; 163 164 if (val < 0x80) 165 continue; 166 167 if (val < _UTF8_RANGE(1)) 168 { 169 size++; 170 continue; 171 } 172 173 if (val >= 0xD800 && val < 0xDC00 && src != srcLim) 174 { 175 UInt32 c2 = *src; 176 if (c2 >= 0xDC00 && c2 < 0xE000) 177 { 178 src++; 179 size += 2; 180 continue; 181 } 182 } 183 184 #ifdef _WCHART_IS_16BIT 185 186 size += 2; 187 188 #else 189 190 if (val < _UTF8_RANGE(2)) size += 2; 191 else if (val < _UTF8_RANGE(3)) size += 3; 192 else if (val < _UTF8_RANGE(4)) size += 4; 193 else if (val < _UTF8_RANGE(5)) size += 5; 194 else size += 6; 195 196 #endif 197 } 198 } 199 200 static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim) 201 { 202 for (;;) 203 { 204 if (src == srcLim) 205 return dest; 206 207 UInt32 val = *src++; 208 209 if (val < 0x80) 210 { 211 *dest++ = (char)val; 212 continue; 213 } 214 215 if (val < _UTF8_RANGE(1)) 216 { 217 dest[0] = _UTF8_HEAD(1, val); 218 dest[1] = _UTF8_CHAR(0, val); 219 dest += 2; 220 continue; 221 } 222 223 if (val >= 0xD800 && val < 0xDC00 && src != srcLim) 224 { 225 UInt32 c2 = *src; 226 if (c2 >= 0xDC00 && c2 < 0xE000) 227 { 228 src++; 229 val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; 230 dest[0] = _UTF8_HEAD(3, val); 231 dest[1] = _UTF8_CHAR(2, val); 232 dest[2] = _UTF8_CHAR(1, val); 233 dest[3] = _UTF8_CHAR(0, val); 234 dest += 4; 235 continue; 236 } 237 } 238 239 #ifndef _WCHART_IS_16BIT 240 if (val < _UTF8_RANGE(2)) 241 #endif 242 { 243 dest[0] = _UTF8_HEAD(2, val); 244 dest[1] = _UTF8_CHAR(1, val); 245 dest[2] = _UTF8_CHAR(0, val); 246 dest += 3; 247 continue; 248 } 249 250 #ifndef _WCHART_IS_16BIT 251 252 UInt32 b; 253 unsigned numBits; 254 if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); } 255 else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); } 256 else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); } 257 else { numBits = 6 * 6; b = _UTF8_START(6); } 258 259 *dest++ = (Byte)b; 260 261 do 262 { 263 numBits -= 6; 264 *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F)); 265 } 266 while (numBits != 0); 267 268 #endif 269 } 270 } 271 272 bool ConvertUTF8ToUnicode(const AString &src, UString &dest) 273 { 274 dest.Empty(); 275 size_t destLen = 0; 276 Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len())); 277 bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len())); 278 dest.ReleaseBuf_SetEnd((unsigned)destLen); 279 return res; 280 } 281 282 void ConvertUnicodeToUTF8(const UString &src, AString &dest) 283 { 284 dest.Empty(); 285 size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len())); 286 Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len())); 287 dest.ReleaseBuf_SetEnd((unsigned)destLen); 288 } 289