1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/common/translate/language_detection_util.h" 6 7 #include "base/logging.h" 8 #include "base/strings/string_split.h" 9 #include "base/strings/string_util.h" 10 #include "base/time/time.h" 11 #include "chrome/common/chrome_constants.h" 12 #include "chrome/common/translate/translate_common_metrics.h" 13 #include "chrome/common/translate/translate_util.h" 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 16 17 namespace { 18 19 // Similar language code list. Some languages are very similar and difficult 20 // for CLD to distinguish. 21 struct SimilarLanguageCode { 22 const char* const code; 23 int group; 24 }; 25 26 const SimilarLanguageCode kSimilarLanguageCodes[] = { 27 {"bs", 1}, 28 {"hr", 1}, 29 {"hi", 2}, 30 {"ne", 2}, 31 }; 32 33 // Checks |kSimilarLanguageCodes| and returns group code. 34 int GetSimilarLanguageGroupCode(const std::string& language) { 35 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) { 36 if (language.find(kSimilarLanguageCodes[i].code) != 0) 37 continue; 38 return kSimilarLanguageCodes[i].group; 39 } 40 return 0; 41 } 42 43 // Well-known languages which often have wrong server configuration of 44 // Content-Language: en. 45 // TODO(toyoshim): Remove these static tables and caller functions to 46 // chrome/common/translate, and implement them as std::set<>. 47 const char* kWellKnownCodesOnWrongConfiguration[] = { 48 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" 49 }; 50 51 // Applies a series of language code modification in proper order. 52 void ApplyLanguageCodeCorrection(std::string* code) { 53 // Correct well-known format errors. 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); 55 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { 57 *code = std::string(); 58 return; 59 } 60 61 TranslateUtil::ToTranslateLanguageSynonym(code); 62 } 63 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 65 // failed. 66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 67 std::string DetermineTextLanguage(const base::string16& text, 68 bool* is_cld_reliable) { 69 std::string language = chrome::kUnknownLanguageCode; 70 int num_languages = 0; 71 int text_bytes = 0; 72 bool is_reliable = false; 73 Language cld_language = 74 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, 75 &num_languages, NULL, &text_bytes); 76 if (is_cld_reliable != NULL) 77 *is_cld_reliable = is_reliable; 78 79 // We don't trust the result if the CLD reports that the detection is not 80 // reliable, or if the actual text used to detect the language was less than 81 // 100 bytes (short texts can often lead to wrong results). 82 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 83 // the determined language code is correct with 50% confidence. Chrome should 84 // handle the real confidence value to judge. 85 if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && 86 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { 87 // We should not use LanguageCode_ISO_639_1 because it does not cover all 88 // the languages CLD can detect. As a result, it'll return the invalid 89 // language code for tradtional Chinese among others. 90 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 92 // for Simplified Chinese. 93 language = LanguageCodeWithDialects(cld_language); 94 } 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text 96 << "\n*************************************\n"; 97 return language; 98 } 99 100 // Checks if CLD can complement a sub code when the page language doesn't know 101 // the sub code. 102 bool CanCLDComplementSubCode( 103 const std::string& page_language, const std::string& cld_language) { 104 // Translate server cannot treat general Chinese. If Content-Language and 105 // CLD agree that the language is Chinese and Content-Language doesn't know 106 // which dialect is used, CLD language has priority. 107 // TODO(hajimehoshi): How about the other dialects like zh-MO? 108 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); 109 } 110 111 } // namespace 112 113 namespace LanguageDetectionUtil { 114 115 std::string DeterminePageLanguage(const std::string& code, 116 const std::string& html_lang, 117 const base::string16& contents, 118 std::string* cld_language_p, 119 bool* is_cld_reliable_p) { 120 base::TimeTicks begin_time = base::TimeTicks::Now(); 121 bool is_cld_reliable; 122 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); 123 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, 124 base::TimeTicks::Now()); 125 126 if (cld_language_p != NULL) 127 *cld_language_p = cld_language; 128 if (is_cld_reliable_p != NULL) 129 *is_cld_reliable_p = is_cld_reliable; 130 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); 131 132 // Check if html lang attribute is valid. 133 std::string modified_html_lang; 134 if (!html_lang.empty()) { 135 modified_html_lang = html_lang; 136 ApplyLanguageCodeCorrection(&modified_html_lang); 137 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); 138 VLOG(9) << "html lang based language code: " << modified_html_lang; 139 } 140 141 // Check if Content-Language is valid. 142 std::string modified_code; 143 if (!code.empty()) { 144 modified_code = code; 145 ApplyLanguageCodeCorrection(&modified_code); 146 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); 147 } 148 149 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt 150 // |modified_code|. 151 std::string language = modified_html_lang.empty() ? modified_code : 152 modified_html_lang; 153 154 // If |language| is empty, just use CLD result even though it might be 155 // chrome::kUnknownLanguageCode. 156 if (language.empty()) { 157 TranslateCommonMetrics::ReportLanguageVerification( 158 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); 159 return cld_language; 160 } 161 162 if (cld_language == chrome::kUnknownLanguageCode) { 163 TranslateCommonMetrics::ReportLanguageVerification( 164 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); 165 return language; 166 } else if (CanCLDComplementSubCode(language, cld_language)) { 167 TranslateCommonMetrics::ReportLanguageVerification( 168 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); 169 return cld_language; 170 } else if (IsSameOrSimilarLanguages(language, cld_language)) { 171 TranslateCommonMetrics::ReportLanguageVerification( 172 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); 173 return language; 174 } else if (MaybeServerWrongConfiguration(language, cld_language)) { 175 TranslateCommonMetrics::ReportLanguageVerification( 176 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); 177 return cld_language; 178 } else { 179 TranslateCommonMetrics::ReportLanguageVerification( 180 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); 181 // Content-Language value might be wrong because CLD says that this page 182 // is written in another language with confidence. 183 // In this case, Chrome doesn't rely on any of the language codes, and 184 // gives up suggesting a translation. 185 return std::string(chrome::kUnknownLanguageCode); 186 } 187 188 return language; 189 } 190 191 void CorrectLanguageCodeTypo(std::string* code) { 192 DCHECK(code); 193 194 size_t coma_index = code->find(','); 195 if (coma_index != std::string::npos) { 196 // There are more than 1 language specified, just keep the first one. 197 *code = code->substr(0, coma_index); 198 } 199 TrimWhitespaceASCII(*code, TRIM_ALL, code); 200 201 // An underscore instead of a dash is a frequent mistake. 202 size_t underscore_index = code->find('_'); 203 if (underscore_index != std::string::npos) 204 (*code)[underscore_index] = '-'; 205 206 // Change everything up to a dash to lower-case and everything after to upper. 207 size_t dash_index = code->find('-'); 208 if (dash_index != std::string::npos) { 209 *code = StringToLowerASCII(code->substr(0, dash_index)) + 210 StringToUpperASCII(code->substr(dash_index)); 211 } else { 212 *code = StringToLowerASCII(*code); 213 } 214 } 215 216 bool IsValidLanguageCode(const std::string& code) { 217 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/. 218 // TODO(hajimehoshi): How about es-419, which is used as an Accept language? 219 std::vector<std::string> chunks; 220 base::SplitString(code, '-', &chunks); 221 222 if (chunks.size() < 1 || 2 < chunks.size()) 223 return false; 224 225 const std::string& main_code = chunks[0]; 226 227 if (main_code.size() < 1 || 3 < main_code.size()) 228 return false; 229 230 for (std::string::const_iterator it = main_code.begin(); 231 it != main_code.end(); ++it) { 232 if (!IsAsciiAlpha(*it)) 233 return false; 234 } 235 236 if (chunks.size() == 1) 237 return true; 238 239 const std::string& sub_code = chunks[1]; 240 241 if (sub_code.size() != 2) 242 return false; 243 244 for (std::string::const_iterator it = sub_code.begin(); 245 it != sub_code.end(); ++it) { 246 if (!IsAsciiAlpha(*it)) 247 return false; 248 } 249 250 return true; 251 } 252 253 bool IsSameOrSimilarLanguages(const std::string& page_language, 254 const std::string& cld_language) { 255 // Language code part of |page_language| is matched to one of |cld_language|. 256 // Country code is ignored here. 257 if (page_language.size() >= 2 && 258 cld_language.find(page_language.c_str(), 0, 2) == 0) { 259 // Languages are matched strictly. Reports false to metrics, but returns 260 // true. 261 TranslateCommonMetrics::ReportSimilarLanguageMatch(false); 262 return true; 263 } 264 265 // Check if |page_language| and |cld_language| are in the similar language 266 // list and belong to the same language group. 267 int page_code = GetSimilarLanguageGroupCode(page_language); 268 bool match = page_code != 0 && 269 page_code == GetSimilarLanguageGroupCode(cld_language); 270 271 TranslateCommonMetrics::ReportSimilarLanguageMatch(match); 272 return match; 273 } 274 275 bool MaybeServerWrongConfiguration(const std::string& page_language, 276 const std::string& cld_language) { 277 // If |page_language| is not "en-*", respect it and just return false here. 278 if (!StartsWithASCII(page_language, "en", false)) 279 return false; 280 281 // A server provides a language meta information representing "en-*". But it 282 // might be just a default value due to missing user configuration. 283 // Let's trust |cld_language| if the determined language is not difficult to 284 // distinguish from English, and the language is one of well-known languages 285 // which often provide "en-*" meta information mistakenly. 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 288 return true; 289 } 290 return false; 291 } 292 293 std::string GetCLDVersion() { 294 return CompactLangDet::DetectLanguageVersion(); 295 } 296 297 } // namespace LanguageDetectionUtil 298