Home | History | Annotate | Download | only in translate
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/common/translate/language_detection_util.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_split.h"
      9 #include "base/strings/string_util.h"
     10 #include "base/time/time.h"
     11 #include "chrome/common/chrome_constants.h"
     12 #include "chrome/common/translate/translate_common_metrics.h"
     13 #include "chrome/common/translate/translate_util.h"
     14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
     15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
     16 
     17 namespace {
     18 
     19 // Similar language code list. Some languages are very similar and difficult
     20 // for CLD to distinguish.
     21 struct SimilarLanguageCode {
     22   const char* const code;
     23   int group;
     24 };
     25 
     26 const SimilarLanguageCode kSimilarLanguageCodes[] = {
     27   {"bs", 1},
     28   {"hr", 1},
     29   {"hi", 2},
     30   {"ne", 2},
     31 };
     32 
     33 // Checks |kSimilarLanguageCodes| and returns group code.
     34 int GetSimilarLanguageGroupCode(const std::string& language) {
     35   for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
     36     if (language.find(kSimilarLanguageCodes[i].code) != 0)
     37       continue;
     38     return kSimilarLanguageCodes[i].group;
     39   }
     40   return 0;
     41 }
     42 
     43 // Well-known languages which often have wrong server configuration of
     44 // Content-Language: en.
     45 // TODO(toyoshim): Remove these static tables and caller functions to
     46 // chrome/common/translate, and implement them as std::set<>.
     47 const char* kWellKnownCodesOnWrongConfiguration[] = {
     48   "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
     49 };
     50 
     51 // Applies a series of language code modification in proper order.
     52 void ApplyLanguageCodeCorrection(std::string* code) {
     53   // Correct well-known format errors.
     54   LanguageDetectionUtil::CorrectLanguageCodeTypo(code);
     55 
     56   if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {
     57     *code = std::string();
     58     return;
     59   }
     60 
     61   TranslateUtil::ToTranslateLanguageSynonym(code);
     62 }
     63 
     64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
     65 // failed.
     66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
     67 std::string DetermineTextLanguage(const base::string16& text,
     68                                   bool* is_cld_reliable) {
     69   std::string language = chrome::kUnknownLanguageCode;
     70   int num_languages = 0;
     71   int text_bytes = 0;
     72   bool is_reliable = false;
     73   Language cld_language =
     74       DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
     75                                   &num_languages, NULL, &text_bytes);
     76   if (is_cld_reliable != NULL)
     77     *is_cld_reliable = is_reliable;
     78 
     79   // We don't trust the result if the CLD reports that the detection is not
     80   // reliable, or if the actual text used to detect the language was less than
     81   // 100 bytes (short texts can often lead to wrong results).
     82   // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
     83   // the determined language code is correct with 50% confidence. Chrome should
     84   // handle the real confidence value to judge.
     85   if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
     86       cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
     87     // We should not use LanguageCode_ISO_639_1 because it does not cover all
     88     // the languages CLD can detect. As a result, it'll return the invalid
     89     // language code for tradtional Chinese among others.
     90     // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
     91     // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
     92     // for Simplified Chinese.
     93     language = LanguageCodeWithDialects(cld_language);
     94   }
     95   VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
     96           << "\n*************************************\n";
     97   return language;
     98 }
     99 
    100 // Checks if CLD can complement a sub code when the page language doesn't know
    101 // the sub code.
    102 bool CanCLDComplementSubCode(
    103     const std::string& page_language, const std::string& cld_language) {
    104   // Translate server cannot treat general Chinese. If Content-Language and
    105   // CLD agree that the language is Chinese and Content-Language doesn't know
    106   // which dialect is used, CLD language has priority.
    107   // TODO(hajimehoshi): How about the other dialects like zh-MO?
    108   return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
    109 }
    110 
    111 }  // namespace
    112 
    113 namespace LanguageDetectionUtil {
    114 
    115 std::string DeterminePageLanguage(const std::string& code,
    116                                   const std::string& html_lang,
    117                                   const base::string16& contents,
    118                                   std::string* cld_language_p,
    119                                   bool* is_cld_reliable_p) {
    120   base::TimeTicks begin_time = base::TimeTicks::Now();
    121   bool is_cld_reliable;
    122   std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
    123   TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time,
    124                                                       base::TimeTicks::Now());
    125 
    126   if (cld_language_p != NULL)
    127     *cld_language_p = cld_language;
    128   if (is_cld_reliable_p != NULL)
    129     *is_cld_reliable_p = is_cld_reliable;
    130   TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
    131 
    132   // Check if html lang attribute is valid.
    133   std::string modified_html_lang;
    134   if (!html_lang.empty()) {
    135     modified_html_lang = html_lang;
    136     ApplyLanguageCodeCorrection(&modified_html_lang);
    137     TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang);
    138     VLOG(9) << "html lang based language code: " << modified_html_lang;
    139   }
    140 
    141   // Check if Content-Language is valid.
    142   std::string modified_code;
    143   if (!code.empty()) {
    144     modified_code = code;
    145     ApplyLanguageCodeCorrection(&modified_code);
    146     TranslateCommonMetrics::ReportContentLanguage(code, modified_code);
    147   }
    148 
    149   // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
    150   // |modified_code|.
    151   std::string language = modified_html_lang.empty() ? modified_code :
    152                                                       modified_html_lang;
    153 
    154   // If |language| is empty, just use CLD result even though it might be
    155   // chrome::kUnknownLanguageCode.
    156   if (language.empty()) {
    157     TranslateCommonMetrics::ReportLanguageVerification(
    158         TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
    159     return cld_language;
    160   }
    161 
    162   if (cld_language == chrome::kUnknownLanguageCode) {
    163     TranslateCommonMetrics::ReportLanguageVerification(
    164         TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN);
    165     return language;
    166   } else if (CanCLDComplementSubCode(language, cld_language)) {
    167     TranslateCommonMetrics::ReportLanguageVerification(
    168         TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
    169     return cld_language;
    170   } else if (IsSameOrSimilarLanguages(language, cld_language)) {
    171     TranslateCommonMetrics::ReportLanguageVerification(
    172         TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);
    173     return language;
    174   } else if (MaybeServerWrongConfiguration(language, cld_language)) {
    175     TranslateCommonMetrics::ReportLanguageVerification(
    176         TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);
    177     return cld_language;
    178   } else {
    179     TranslateCommonMetrics::ReportLanguageVerification(
    180         TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
    181     // Content-Language value might be wrong because CLD says that this page
    182     // is written in another language with confidence.
    183     // In this case, Chrome doesn't rely on any of the language codes, and
    184     // gives up suggesting a translation.
    185     return std::string(chrome::kUnknownLanguageCode);
    186   }
    187 
    188   return language;
    189 }
    190 
    191 void CorrectLanguageCodeTypo(std::string* code) {
    192   DCHECK(code);
    193 
    194   size_t coma_index = code->find(',');
    195   if (coma_index != std::string::npos) {
    196     // There are more than 1 language specified, just keep the first one.
    197     *code = code->substr(0, coma_index);
    198   }
    199   TrimWhitespaceASCII(*code, TRIM_ALL, code);
    200 
    201   // An underscore instead of a dash is a frequent mistake.
    202   size_t underscore_index = code->find('_');
    203   if (underscore_index != std::string::npos)
    204     (*code)[underscore_index] = '-';
    205 
    206   // Change everything up to a dash to lower-case and everything after to upper.
    207   size_t dash_index = code->find('-');
    208   if (dash_index != std::string::npos) {
    209     *code = StringToLowerASCII(code->substr(0, dash_index)) +
    210         StringToUpperASCII(code->substr(dash_index));
    211   } else {
    212     *code = StringToLowerASCII(*code);
    213   }
    214 }
    215 
    216 bool IsValidLanguageCode(const std::string& code) {
    217   // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
    218   // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
    219   std::vector<std::string> chunks;
    220   base::SplitString(code, '-', &chunks);
    221 
    222   if (chunks.size() < 1 || 2 < chunks.size())
    223     return false;
    224 
    225   const std::string& main_code = chunks[0];
    226 
    227   if (main_code.size() < 1 || 3 < main_code.size())
    228     return false;
    229 
    230   for (std::string::const_iterator it = main_code.begin();
    231        it != main_code.end(); ++it) {
    232     if (!IsAsciiAlpha(*it))
    233       return false;
    234   }
    235 
    236   if (chunks.size() == 1)
    237     return true;
    238 
    239   const std::string& sub_code = chunks[1];
    240 
    241   if (sub_code.size() != 2)
    242     return false;
    243 
    244   for (std::string::const_iterator it = sub_code.begin();
    245        it != sub_code.end(); ++it) {
    246     if (!IsAsciiAlpha(*it))
    247       return false;
    248   }
    249 
    250   return true;
    251 }
    252 
    253 bool IsSameOrSimilarLanguages(const std::string& page_language,
    254                               const std::string& cld_language) {
    255   // Language code part of |page_language| is matched to one of |cld_language|.
    256   // Country code is ignored here.
    257   if (page_language.size() >= 2 &&
    258       cld_language.find(page_language.c_str(), 0, 2) == 0) {
    259     // Languages are matched strictly. Reports false to metrics, but returns
    260     // true.
    261     TranslateCommonMetrics::ReportSimilarLanguageMatch(false);
    262     return true;
    263   }
    264 
    265   // Check if |page_language| and |cld_language| are in the similar language
    266   // list and belong to the same language group.
    267   int page_code = GetSimilarLanguageGroupCode(page_language);
    268   bool match = page_code != 0 &&
    269                page_code == GetSimilarLanguageGroupCode(cld_language);
    270 
    271   TranslateCommonMetrics::ReportSimilarLanguageMatch(match);
    272   return match;
    273 }
    274 
    275 bool MaybeServerWrongConfiguration(const std::string& page_language,
    276                                    const std::string& cld_language) {
    277   // If |page_language| is not "en-*", respect it and just return false here.
    278   if (!StartsWithASCII(page_language, "en", false))
    279     return false;
    280 
    281   // A server provides a language meta information representing "en-*". But it
    282   // might be just a default value due to missing user configuration.
    283   // Let's trust |cld_language| if the determined language is not difficult to
    284   // distinguish from English, and the language is one of well-known languages
    285   // which often provide "en-*" meta information mistakenly.
    286   for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
    287     if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
    288       return true;
    289   }
    290   return false;
    291 }
    292 
    293 std::string GetCLDVersion() {
    294   return CompactLangDet::DetectLanguageVersion();
    295 }
    296 
    297 }  // namespace LanguageDetectionUtil
    298