1 /* 2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include "config.h" 32 #include "TextEncodingDetector.h" 33 34 #include "TextEncoding.h" 35 #include <wtf/UnusedParam.h> 36 37 #ifndef BUILDING_ON_TIGER 38 #include "unicode/ucnv.h" 39 #include "unicode/ucsdet.h" 40 #endif 41 42 namespace WebCore { 43 44 bool detectTextEncoding(const char* data, size_t len, 45 const char* hintEncodingName, 46 TextEncoding* detectedEncoding) 47 { 48 *detectedEncoding = TextEncoding(); 49 #ifdef BUILDING_ON_TIGER 50 // Tiger came with ICU 3.2 and does not have the encoding detector. 51 UNUSED_PARAM(data); 52 UNUSED_PARAM(len); 53 UNUSED_PARAM(hintEncodingName); 54 return false; 55 #else 56 int matchesCount = 0; 57 UErrorCode status = U_ZERO_ERROR; 58 UCharsetDetector* detector = ucsdet_open(&status); 59 if (U_FAILURE(status)) 60 return false; 61 ucsdet_enableInputFilter(detector, true); 62 ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); 63 if (U_FAILURE(status)) 64 return false; 65 66 // FIXME: A few things we can do other than improving 67 // the ICU detector itself. 68 // 1. Use ucsdet_detectAll and pick the most likely one given 69 // "the context" (parent-encoding, referrer encoding, etc). 70 // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. 71 // Chinese, Japanese, Russian, Korean and Hebrew) by picking the 72 // encoding with a highest confidence among the detector-specific 73 // limited set of candidate encodings. 74 // Below is a partial implementation of the first part of what's outlined 75 // above. 76 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); 77 if (U_FAILURE(status)) { 78 ucsdet_close(detector); 79 return false; 80 } 81 82 const char* encoding = 0; 83 if (hintEncodingName) { 84 TextEncoding hintEncoding(hintEncodingName); 85 // 10 is the minimum confidence value consistent with the codepoint 86 // allocation in a given encoding. The size of a chunk passed to 87 // us varies even for the same html file (apparently depending on 88 // the network load). When we're given a rather short chunk, we 89 // don't have a sufficiently reliable signal other than the fact that 90 // the chunk is consistent with a set of encodings. So, instead of 91 // setting an arbitrary threshold, we have to scan all the encodings 92 // consistent with the data. 93 const int32_t kThresold = 10; 94 for (int i = 0; i < matchesCount; ++i) { 95 int32_t confidence = ucsdet_getConfidence(matches[i], &status); 96 if (U_FAILURE(status)) { 97 status = U_ZERO_ERROR; 98 continue; 99 } 100 if (confidence < kThresold) 101 break; 102 const char* matchEncoding = ucsdet_getName(matches[i], &status); 103 if (U_FAILURE(status)) { 104 status = U_ZERO_ERROR; 105 continue; 106 } 107 if (TextEncoding(matchEncoding) == hintEncoding) { 108 encoding = hintEncodingName; 109 break; 110 } 111 } 112 } 113 // If no match is found so far, just pick the top match. 114 // This can happen, say, when a parent frame in EUC-JP refers to 115 // a child frame in Shift_JIS and both frames do NOT specify the encoding 116 // making us resort to auto-detection (when it IS turned on). 117 if (!encoding && matchesCount > 0) 118 encoding = ucsdet_getName(matches[0], &status); 119 if (U_SUCCESS(status)) { 120 *detectedEncoding = TextEncoding(encoding); 121 ucsdet_close(detector); 122 return true; 123 } 124 ucsdet_close(detector); 125 return false; 126 #endif 127 } 128 129 } 130