1 /* 2 * Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved. 3 * Copyright (C) 2010-2011 Patrick Gansterer <paroga (at) paroga.com> 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * This library is distributed in the hope that i will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Library General Public License for more details. 18 * 19 * You should have received a copy of the GNU Library General Public License 20 * along with this library; see the file COPYING.LIB. If not, write to 21 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 22 * Boston, MA 02110-1301, USA. 23 */ 24 25 #include "config.h" 26 #include "TextCodecWinCE.h" 27 28 #include "FontCache.h" 29 #include "PlatformString.h" 30 #include <mlang.h> 31 #include <winbase.h> 32 #include <winnls.h> 33 #include <wtf/HashMap.h> 34 #include <wtf/HashSet.h> 35 #include <wtf/text/CString.h> 36 #include <wtf/text/StringConcatenate.h> 37 #include <wtf/text/StringHash.h> 38 39 namespace WebCore { 40 41 struct CharsetInfo { 42 CString m_name; 43 String m_friendlyName; 44 UINT m_codePage; 45 Vector<CString> m_aliases; 46 }; 47 48 class LanguageManager { 49 private: 50 LanguageManager(); 51 52 friend LanguageManager& languageManager(); 53 }; 54 55 // Usage: a lookup table used to get CharsetInfo with code page ID. 56 // Key: code page ID. Value: charset information. 57 static HashMap<UINT, CString>& codePageCharsets() 58 { 59 static HashMap<UINT, CString> cc; 60 return cc; 61 } 62 63 static HashMap<String, CharsetInfo>& knownCharsets() 64 { 65 static HashMap<String, CharsetInfo> kc; 66 return kc; 67 } 68 69 // Usage: a map that stores charsets that are supported by system. Sorted by name. 70 // Key: charset. Value: code page ID. 71 typedef HashSet<String> CharsetSet; 72 static CharsetSet& supportedCharsets() 73 { 74 static CharsetSet sl; 75 return sl; 76 } 77 78 static LanguageManager& languageManager() 79 { 80 static LanguageManager lm; 81 return lm; 82 } 83 84 LanguageManager::LanguageManager() 85 { 86 IEnumCodePage* enumInterface; 87 IMultiLanguage* mli = FontCache::getMultiLanguageInterface(); 88 if (mli && S_OK == mli->EnumCodePages(MIMECONTF_BROWSER, &enumInterface)) { 89 MIMECPINFO cpInfo; 90 ULONG ccpInfo; 91 while (S_OK == enumInterface->Next(1, &cpInfo, &ccpInfo) && ccpInfo) { 92 if (!IsValidCodePage(cpInfo.uiCodePage)) 93 continue; 94 95 HashMap<UINT, CString>::iterator i = codePageCharsets().find(cpInfo.uiCodePage); 96 97 CString name(String(cpInfo.wszWebCharset).latin1()); 98 if (i == codePageCharsets().end()) { 99 CharsetInfo info; 100 info.m_codePage = cpInfo.uiCodePage; 101 knownCharsets().set(name.data(), info); 102 i = codePageCharsets().set(cpInfo.uiCodePage, name).first; 103 } 104 if (i != codePageCharsets().end()) { 105 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(String(i->second.data(), i->second.length())); 106 ASSERT(j != knownCharsets().end()); 107 CharsetInfo& info = j->second; 108 info.m_name = i->second.data(); 109 info.m_friendlyName = cpInfo.wszDescription; 110 info.m_aliases.append(name); 111 info.m_aliases.append(String(cpInfo.wszHeaderCharset).latin1()); 112 info.m_aliases.append(String(cpInfo.wszBodyCharset).latin1()); 113 String cpName = makeString("cp", String::number(cpInfo.uiCodePage)); 114 info.m_aliases.append(cpName.latin1()); 115 supportedCharsets().add(i->second.data()); 116 } 117 } 118 enumInterface->Release(); 119 } 120 } 121 122 static UINT getCodePage(const char* name) 123 { 124 // Explicitly use a "const" reference to fix the silly VS build error 125 // saying "==" is not found for const_iterator and iterator 126 const HashMap<String, CharsetInfo>& charsets = knownCharsets(); 127 HashMap<String, CharsetInfo>::const_iterator i = charsets.find(name); 128 return i == charsets.end() ? CP_ACP : i->second.m_codePage; 129 } 130 131 static PassOwnPtr<TextCodec> newTextCodecWinCE(const TextEncoding& encoding, const void*) 132 { 133 return new TextCodecWinCE(getCodePage(encoding.name())); 134 } 135 136 TextCodecWinCE::TextCodecWinCE(UINT codePage) 137 : m_codePage(codePage) 138 { 139 } 140 141 TextCodecWinCE::~TextCodecWinCE() 142 { 143 } 144 145 void TextCodecWinCE::registerExtendedEncodingNames(EncodingNameRegistrar registrar) 146 { 147 languageManager(); 148 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { 149 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); 150 if (j != knownCharsets().end()) { 151 registrar(j->second.m_name.data(), j->second.m_name.data()); 152 for (Vector<CString>::const_iterator alias = j->second.m_aliases.begin(); alias != j->second.m_aliases.end(); ++alias) 153 registrar(alias->data(), j->second.m_name.data()); 154 } 155 } 156 } 157 158 void TextCodecWinCE::registerExtendedCodecs(TextCodecRegistrar registrar) 159 { 160 languageManager(); 161 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { 162 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); 163 if (j != knownCharsets().end()) 164 registrar(j->second.m_name.data(), newTextCodecWinCE, 0); 165 } 166 } 167 168 static DWORD getCodePageFlags(UINT codePage) 169 { 170 if (codePage == 42) // Symbol 171 return 0; 172 173 // Microsoft says the flag must be 0 for the following code pages 174 if (codePage > 50000) { 175 if ((codePage >= 50220 && codePage <= 50222) 176 || codePage == 50225 177 || codePage == 50227 178 || codePage == 50229 179 || codePage == 52936 180 || codePage == 54936 181 || (codePage >= 57002 && codePage <= 57001) 182 || codePage == 65000 // UTF-7 183 ) 184 return 0; 185 } 186 187 return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS; 188 } 189 190 static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length) 191 { 192 for (const char* bytesEnd = bytes + length; bytes < bytesEnd; ++bytes) { 193 if (*bytes & 0x80) 194 break; 195 } 196 return bytes; 197 } 198 199 static void decodeInternal(Vector<UChar, 8192>& result, UINT codePage, const char* bytes, size_t length, size_t* left) 200 { 201 *left = length; 202 if (!bytes || !length) 203 return; 204 205 DWORD flags = getCodePageFlags(codePage); 206 207 int testLength = length; 208 int untestedLength = length; 209 for (;;) { 210 int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLength, 0, 0); 211 212 if (resultLength > 0) { 213 int oldSize = result.size(); 214 result.resize(oldSize + resultLength); 215 216 MultiByteToWideChar(codePage, flags, bytes, testLength, result.data() + oldSize, resultLength); 217 218 if (testLength == untestedLength) { 219 *left = length - testLength; 220 break; 221 } 222 untestedLength -= testLength; 223 length -= testLength; 224 bytes += testLength; 225 } else { 226 untestedLength = testLength - 1; 227 if (!untestedLength) { 228 *left = length; 229 break; 230 } 231 } 232 testLength = (untestedLength + 1) / 2; 233 } 234 } 235 236 String TextCodecWinCE::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 237 { 238 if (!m_decodeBuffer.isEmpty()) { 239 m_decodeBuffer.append(bytes, length); 240 bytes = m_decodeBuffer.data(); 241 length = m_decodeBuffer.size(); 242 } 243 244 size_t left; 245 Vector<UChar, 8192> result; 246 for (;;) { 247 decodeInternal(result, m_codePage, bytes, length, &left); 248 if (!left) 249 break; 250 251 if (!flush && left < 16) 252 break; 253 254 result.append(L'?'); 255 sawError = true; 256 if (stopOnError) 257 return String::adopt(result); 258 259 if (left == 1) 260 break; 261 262 bytes += length - left + 1; 263 length = left - 1; 264 } 265 if (left && !flush) { 266 if (m_decodeBuffer.isEmpty()) 267 m_decodeBuffer.append(bytes + length - left, left); 268 else { 269 memmove(m_decodeBuffer.data(), bytes + length - left, left); 270 m_decodeBuffer.resize(left); 271 } 272 } else 273 m_decodeBuffer.clear(); 274 275 return String::adopt(result); 276 } 277 278 CString TextCodecWinCE::encode(const UChar* characters, size_t length, UnencodableHandling) 279 { 280 if (!characters || !length) 281 return CString(); 282 283 int resultLength = WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, 0, 0, 0, 0); 284 285 // FIXME: We need to implement UnencodableHandling: QuestionMarksForUnencodables, EntitiesForUnencodables, and URLEncodedEntitiesForUnencodables. 286 287 if (resultLength <= 0) 288 return "?"; 289 290 char* characterBuffer; 291 CString result = CString::newUninitialized(resultLength, characterBuffer); 292 293 WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, characterBuffer, resultLength, 0, 0); 294 295 return result; 296 } 297 298 void TextCodecWinCE::enumerateSupportedEncodings(EncodingReceiver& receiver) 299 { 300 languageManager(); 301 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCharsets().end(); ++i) { 302 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); 303 if (j != knownCharsets().end() && !receiver.receive(j->second.m_name.data(), j->second.m_friendlyName.charactersWithNullTermination(), j->second.m_codePage)) 304 break; 305 } 306 } 307 308 } // namespace WebCore 309