Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions are
      6  * met:
      7  *
      8  *     * Redistributions of source code must retain the above copyright
      9  * notice, this list of conditions and the following disclaimer.
     10  *     * Redistributions in binary form must reproduce the above
     11  * copyright notice, this list of conditions and the following disclaimer
     12  * in the documentation and/or other materials provided with the
     13  * distribution.
     14  *     * Neither the name of Google Inc. nor the names of its
     15  * contributors may be used to endorse or promote products derived from
     16  * this software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 #include "config.h"
     32 #include "core/platform/text/TextEncodingDetector.h"
     33 
     34 #include "wtf/text/TextEncoding.h"
     35 #include <unicode/ucnv.h>
     36 #include <unicode/ucsdet.h>
     37 
     38 namespace WebCore {
     39 
     40 bool detectTextEncoding(const char* data, size_t len,
     41                         const char* hintEncodingName,
     42                         WTF::TextEncoding* detectedEncoding)
     43 {
     44     *detectedEncoding = WTF::TextEncoding();
     45     int matchesCount = 0;
     46     UErrorCode status = U_ZERO_ERROR;
     47     UCharsetDetector* detector = ucsdet_open(&status);
     48     if (U_FAILURE(status))
     49         return false;
     50     ucsdet_enableInputFilter(detector, true);
     51     ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
     52     if (U_FAILURE(status))
     53         return false;
     54 
     55     // FIXME: A few things we can do other than improving
     56     // the ICU detector itself.
     57     // 1. Use ucsdet_detectAll and pick the most likely one given
     58     // "the context" (parent-encoding, referrer encoding, etc).
     59     // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
     60     // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
     61     // encoding with a highest confidence among the detector-specific
     62     // limited set of candidate encodings.
     63     // Below is a partial implementation of the first part of what's outlined
     64     // above.
     65     const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
     66     if (U_FAILURE(status)) {
     67         ucsdet_close(detector);
     68         return false;
     69     }
     70 
     71     const char* encoding = 0;
     72     if (hintEncodingName) {
     73         WTF::TextEncoding hintEncoding(hintEncodingName);
     74         // 10 is the minimum confidence value consistent with the codepoint
     75         // allocation in a given encoding. The size of a chunk passed to
     76         // us varies even for the same html file (apparently depending on
     77         // the network load). When we're given a rather short chunk, we
     78         // don't have a sufficiently reliable signal other than the fact that
     79         // the chunk is consistent with a set of encodings. So, instead of
     80         // setting an arbitrary threshold, we have to scan all the encodings
     81         // consistent with the data.
     82         const int32_t kThresold = 10;
     83         for (int i = 0; i < matchesCount; ++i) {
     84             int32_t confidence = ucsdet_getConfidence(matches[i], &status);
     85             if (U_FAILURE(status)) {
     86                 status = U_ZERO_ERROR;
     87                 continue;
     88             }
     89             if (confidence < kThresold)
     90                 break;
     91             const char* matchEncoding = ucsdet_getName(matches[i], &status);
     92             if (U_FAILURE(status)) {
     93                 status = U_ZERO_ERROR;
     94                 continue;
     95             }
     96             if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
     97                 encoding = hintEncodingName;
     98                 break;
     99             }
    100         }
    101     }
    102     // If no match is found so far, just pick the top match.
    103     // This can happen, say, when a parent frame in EUC-JP refers to
    104     // a child frame in Shift_JIS and both frames do NOT specify the encoding
    105     // making us resort to auto-detection (when it IS turned on).
    106     if (!encoding && matchesCount > 0)
    107         encoding = ucsdet_getName(matches[0], &status);
    108     if (U_SUCCESS(status)) {
    109         *detectedEncoding = WTF::TextEncoding(encoding);
    110         ucsdet_close(detector);
    111         return true;
    112     }
    113     ucsdet_close(detector);
    114     return false;
    115 }
    116 
    117 }
    118