1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "components/translate/language_detection/language_detection_util.h" 6 7 #include "base/logging.h" 8 #include "base/metrics/field_trial.h" 9 #include "base/strings/string_split.h" 10 #include "base/strings/string_util.h" 11 #include "base/strings/utf_string_conversions.h" 12 #include "base/time/time.h" 13 #include "components/translate/common/translate_constants.h" 14 #include "components/translate/common/translate_metrics.h" 15 #include "components/translate/common/translate_util.h" 16 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 20 #endif 21 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 23 #include "third_party/cld_2/src/public/compact_lang_det.h" 24 #endif 25 26 namespace { 27 28 // Similar language code list. Some languages are very similar and difficult 29 // for CLD to distinguish. 30 struct SimilarLanguageCode { 31 const char* const code; 32 int group; 33 }; 34 35 const SimilarLanguageCode kSimilarLanguageCodes[] = { 36 {"bs", 1}, 37 {"hr", 1}, 38 {"hi", 2}, 39 {"ne", 2}, 40 }; 41 42 // Checks |kSimilarLanguageCodes| and returns group code. 43 int GetSimilarLanguageGroupCode(const std::string& language) { 44 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) { 45 if (language.find(kSimilarLanguageCodes[i].code) != 0) 46 continue; 47 return kSimilarLanguageCodes[i].group; 48 } 49 return 0; 50 } 51 52 // Well-known languages which often have wrong server configuration of 53 // Content-Language: en. 54 // TODO(toyoshim): Remove these static tables and caller functions to 55 // translate/common, and implement them as std::set<>. 56 const char* kWellKnownCodesOnWrongConfiguration[] = { 57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" 58 }; 59 60 // Applies a series of language code modification in proper order. 61 void ApplyLanguageCodeCorrection(std::string* code) { 62 // Correct well-known format errors. 63 translate::CorrectLanguageCodeTypo(code); 64 65 if (!translate::IsValidLanguageCode(*code)) { 66 *code = std::string(); 67 return; 68 } 69 70 translate::ToTranslateLanguageSynonym(code); 71 } 72 73 int GetCLDMajorVersion() { 74 #if !defined(CLD_VERSION) 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); 76 if (group_name == "CLD2") 77 return 2; 78 else 79 return 1; 80 #else 81 return CLD_VERSION; 82 #endif 83 } 84 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 86 // failed. 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 88 std::string DetermineTextLanguage(const base::string16& text, 89 bool* is_cld_reliable) { 90 std::string language = translate::kUnknownLanguageCode; 91 int text_bytes = 0; 92 bool is_reliable = false; 93 94 // Language or CLD2::Language 95 int cld_language = 0; 96 bool is_valid_language = false; 97 98 switch (GetCLDMajorVersion()) { 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 100 case 1: { 101 int num_languages = 0; 102 cld_language = 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, 104 &num_languages, NULL, &text_bytes); 105 is_valid_language = cld_language != NUM_LANGUAGES && 106 cld_language != UNKNOWN_LANGUAGE && 107 cld_language != TG_UNKNOWN_LANGUAGE; 108 break; 109 } 110 #endif 111 #if !defined(CLD_VERSION) || CLD_VERSION==2 112 case 2: { 113 std::string utf8_text(UTF16ToUTF8(text)); 114 CLD2::Language language3[3]; 115 int percent3[3]; 116 CLD2::DetectLanguageSummary( 117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, 118 &text_bytes, &is_reliable); 119 cld_language = language3[0]; 120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 121 cld_language != CLD2::UNKNOWN_LANGUAGE && 122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 123 break; 124 } 125 #endif 126 default: 127 NOTREACHED(); 128 } 129 130 if (is_cld_reliable != NULL) 131 *is_cld_reliable = is_reliable; 132 133 // We don't trust the result if the CLD reports that the detection is not 134 // reliable, or if the actual text used to detect the language was less than 135 // 100 bytes (short texts can often lead to wrong results). 136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 137 // the determined language code is correct with 50% confidence. Chrome should 138 // handle the real confidence value to judge. 139 if (is_reliable && text_bytes >= 100 && is_valid_language) { 140 // We should not use LanguageCode_ISO_639_1 because it does not cover all 141 // the languages CLD can detect. As a result, it'll return the invalid 142 // language code for tradtional Chinese among others. 143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and 144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 145 // for Simplified Chinese. 146 switch (GetCLDMajorVersion()) { 147 #if !defined(CLD_VERSION) || CLD_VERSION==1 148 case 1: 149 language = 150 LanguageCodeWithDialects(static_cast<Language>(cld_language)); 151 break; 152 #endif 153 #if !defined(CLD_VERSION) || CLD_VERSION==2 154 case 2: 155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for 156 // CLD2::CHINESE, but Translate server doesn't accept it. This is 157 // converted to 'zh-CN' in the same way as CLD1's 158 // LanguageCodeWithDialects. 159 // 160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for 161 // CLD2::CHINESE_T. This is technically more precise for the language 162 // code of traditional Chinese, while Translate server hasn't accepted 163 // zh-Hant yet. 164 if (cld_language == CLD2::CHINESE) { 165 language = "zh-CN"; 166 } else if (cld_language == CLD2::CHINESE_T) { 167 language = "zh-TW"; 168 } else { 169 language = 170 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); 171 } 172 break; 173 #endif 174 default: 175 NOTREACHED(); 176 } 177 } 178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text 179 << "\n*************************************\n"; 180 return language; 181 } 182 183 // Checks if CLD can complement a sub code when the page language doesn't know 184 // the sub code. 185 bool CanCLDComplementSubCode( 186 const std::string& page_language, const std::string& cld_language) { 187 // Translate server cannot treat general Chinese. If Content-Language and 188 // CLD agree that the language is Chinese and Content-Language doesn't know 189 // which dialect is used, CLD language has priority. 190 // TODO(hajimehoshi): How about the other dialects like zh-MO? 191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); 192 } 193 194 } // namespace 195 196 namespace translate { 197 198 std::string DeterminePageLanguage(const std::string& code, 199 const std::string& html_lang, 200 const base::string16& contents, 201 std::string* cld_language_p, 202 bool* is_cld_reliable_p) { 203 base::TimeTicks begin_time = base::TimeTicks::Now(); 204 bool is_cld_reliable; 205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); 206 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); 207 208 if (cld_language_p != NULL) 209 *cld_language_p = cld_language; 210 if (is_cld_reliable_p != NULL) 211 *is_cld_reliable_p = is_cld_reliable; 212 translate::ToTranslateLanguageSynonym(&cld_language); 213 214 // Check if html lang attribute is valid. 215 std::string modified_html_lang; 216 if (!html_lang.empty()) { 217 modified_html_lang = html_lang; 218 ApplyLanguageCodeCorrection(&modified_html_lang); 219 translate::ReportHtmlLang(html_lang, modified_html_lang); 220 VLOG(9) << "html lang based language code: " << modified_html_lang; 221 } 222 223 // Check if Content-Language is valid. 224 std::string modified_code; 225 if (!code.empty()) { 226 modified_code = code; 227 ApplyLanguageCodeCorrection(&modified_code); 228 translate::ReportContentLanguage(code, modified_code); 229 } 230 231 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt 232 // |modified_code|. 233 std::string language = modified_html_lang.empty() ? modified_code : 234 modified_html_lang; 235 236 // If |language| is empty, just use CLD result even though it might be 237 // translate::kUnknownLanguageCode. 238 if (language.empty()) { 239 translate::ReportLanguageVerification( 240 translate::LANGUAGE_VERIFICATION_CLD_ONLY); 241 return cld_language; 242 } 243 244 if (cld_language == kUnknownLanguageCode) { 245 translate::ReportLanguageVerification( 246 translate::LANGUAGE_VERIFICATION_UNKNOWN); 247 return language; 248 } else if (CanCLDComplementSubCode(language, cld_language)) { 249 translate::ReportLanguageVerification( 250 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); 251 return cld_language; 252 } else if (IsSameOrSimilarLanguages(language, cld_language)) { 253 translate::ReportLanguageVerification( 254 translate::LANGUAGE_VERIFICATION_CLD_AGREE); 255 return language; 256 } else if (MaybeServerWrongConfiguration(language, cld_language)) { 257 translate::ReportLanguageVerification( 258 translate::LANGUAGE_VERIFICATION_TRUST_CLD); 259 return cld_language; 260 } else { 261 translate::ReportLanguageVerification( 262 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE); 263 // Content-Language value might be wrong because CLD says that this page 264 // is written in another language with confidence. 265 // In this case, Chrome doesn't rely on any of the language codes, and 266 // gives up suggesting a translation. 267 return std::string(kUnknownLanguageCode); 268 } 269 270 return language; 271 } 272 273 void CorrectLanguageCodeTypo(std::string* code) { 274 DCHECK(code); 275 276 size_t coma_index = code->find(','); 277 if (coma_index != std::string::npos) { 278 // There are more than 1 language specified, just keep the first one. 279 *code = code->substr(0, coma_index); 280 } 281 TrimWhitespaceASCII(*code, TRIM_ALL, code); 282 283 // An underscore instead of a dash is a frequent mistake. 284 size_t underscore_index = code->find('_'); 285 if (underscore_index != std::string::npos) 286 (*code)[underscore_index] = '-'; 287 288 // Change everything up to a dash to lower-case and everything after to upper. 289 size_t dash_index = code->find('-'); 290 if (dash_index != std::string::npos) { 291 *code = StringToLowerASCII(code->substr(0, dash_index)) + 292 StringToUpperASCII(code->substr(dash_index)); 293 } else { 294 *code = StringToLowerASCII(*code); 295 } 296 } 297 298 bool IsValidLanguageCode(const std::string& code) { 299 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/. 300 // TODO(hajimehoshi): How about es-419, which is used as an Accept language? 301 std::vector<std::string> chunks; 302 base::SplitString(code, '-', &chunks); 303 304 if (chunks.size() < 1 || 2 < chunks.size()) 305 return false; 306 307 const std::string& main_code = chunks[0]; 308 309 if (main_code.size() < 1 || 3 < main_code.size()) 310 return false; 311 312 for (std::string::const_iterator it = main_code.begin(); 313 it != main_code.end(); ++it) { 314 if (!IsAsciiAlpha(*it)) 315 return false; 316 } 317 318 if (chunks.size() == 1) 319 return true; 320 321 const std::string& sub_code = chunks[1]; 322 323 if (sub_code.size() != 2) 324 return false; 325 326 for (std::string::const_iterator it = sub_code.begin(); 327 it != sub_code.end(); ++it) { 328 if (!IsAsciiAlpha(*it)) 329 return false; 330 } 331 332 return true; 333 } 334 335 bool IsSameOrSimilarLanguages(const std::string& page_language, 336 const std::string& cld_language) { 337 std::vector<std::string> chunks; 338 339 base::SplitString(page_language, '-', &chunks); 340 if (chunks.size() == 0) 341 return false; 342 std::string page_language_main_part = chunks[0]; 343 344 base::SplitString(cld_language, '-', &chunks); 345 if (chunks.size() == 0) 346 return false; 347 std::string cld_language_main_part = chunks[0]; 348 349 // Language code part of |page_language| is matched to one of |cld_language|. 350 // Country code is ignored here. 351 if (page_language_main_part == cld_language_main_part) { 352 // Languages are matched strictly. Reports false to metrics, but returns 353 // true. 354 translate::ReportSimilarLanguageMatch(false); 355 return true; 356 } 357 358 // Check if |page_language| and |cld_language| are in the similar language 359 // list and belong to the same language group. 360 int page_code = GetSimilarLanguageGroupCode(page_language); 361 bool match = page_code != 0 && 362 page_code == GetSimilarLanguageGroupCode(cld_language); 363 364 translate::ReportSimilarLanguageMatch(match); 365 return match; 366 } 367 368 bool MaybeServerWrongConfiguration(const std::string& page_language, 369 const std::string& cld_language) { 370 // If |page_language| is not "en-*", respect it and just return false here. 371 if (!StartsWithASCII(page_language, "en", false)) 372 return false; 373 374 // A server provides a language meta information representing "en-*". But it 375 // might be just a default value due to missing user configuration. 376 // Let's trust |cld_language| if the determined language is not difficult to 377 // distinguish from English, and the language is one of well-known languages 378 // which often provide "en-*" meta information mistakenly. 379 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 380 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 381 return true; 382 } 383 return false; 384 } 385 386 } // namespace translate 387