Home | History | Annotate | Download | only in language_detection
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "components/translate/language_detection/language_detection_util.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/metrics/field_trial.h"
      9 #include "base/strings/string_split.h"
     10 #include "base/strings/string_util.h"
     11 #include "base/strings/utf_string_conversions.h"
     12 #include "base/time/time.h"
     13 #include "components/translate/common/translate_constants.h"
     14 #include "components/translate/common/translate_metrics.h"
     15 #include "components/translate/common/translate_util.h"
     16 
     17 #if !defined(CLD_VERSION) || CLD_VERSION==1
     18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
     19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
     20 #endif
     21 
     22 #if !defined(CLD_VERSION) || CLD_VERSION==2
     23 #include "third_party/cld_2/src/public/compact_lang_det.h"
     24 #endif
     25 
     26 namespace {
     27 
     28 // Similar language code list. Some languages are very similar and difficult
     29 // for CLD to distinguish.
     30 struct SimilarLanguageCode {
     31   const char* const code;
     32   int group;
     33 };
     34 
     35 const SimilarLanguageCode kSimilarLanguageCodes[] = {
     36   {"bs", 1},
     37   {"hr", 1},
     38   {"hi", 2},
     39   {"ne", 2},
     40 };
     41 
     42 // Checks |kSimilarLanguageCodes| and returns group code.
     43 int GetSimilarLanguageGroupCode(const std::string& language) {
     44   for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
     45     if (language.find(kSimilarLanguageCodes[i].code) != 0)
     46       continue;
     47     return kSimilarLanguageCodes[i].group;
     48   }
     49   return 0;
     50 }
     51 
     52 // Well-known languages which often have wrong server configuration of
     53 // Content-Language: en.
     54 // TODO(toyoshim): Remove these static tables and caller functions to
     55 // translate/common, and implement them as std::set<>.
     56 const char* kWellKnownCodesOnWrongConfiguration[] = {
     57   "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
     58 };
     59 
     60 // Applies a series of language code modification in proper order.
     61 void ApplyLanguageCodeCorrection(std::string* code) {
     62   // Correct well-known format errors.
     63   translate::CorrectLanguageCodeTypo(code);
     64 
     65   if (!translate::IsValidLanguageCode(*code)) {
     66     *code = std::string();
     67     return;
     68   }
     69 
     70   translate::ToTranslateLanguageSynonym(code);
     71 }
     72 
     73 int GetCLDMajorVersion() {
     74 #if !defined(CLD_VERSION)
     75   std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
     76   if (group_name == "CLD2")
     77     return 2;
     78   else
     79     return 1;
     80 #else
     81   return CLD_VERSION;
     82 #endif
     83 }
     84 
     85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
     86 // failed.
     87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
     88 std::string DetermineTextLanguage(const base::string16& text,
     89                                   bool* is_cld_reliable) {
     90   std::string language = translate::kUnknownLanguageCode;
     91   int text_bytes = 0;
     92   bool is_reliable = false;
     93 
     94   // Language or CLD2::Language
     95   int cld_language = 0;
     96   bool is_valid_language = false;
     97 
     98   switch (GetCLDMajorVersion()) {
     99 #if !defined(CLD_VERSION) || CLD_VERSION==1
    100     case 1: {
    101       int num_languages = 0;
    102       cld_language =
    103           DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
    104                                       &num_languages, NULL, &text_bytes);
    105       is_valid_language = cld_language != NUM_LANGUAGES &&
    106           cld_language != UNKNOWN_LANGUAGE &&
    107           cld_language != TG_UNKNOWN_LANGUAGE;
    108       break;
    109     }
    110 #endif
    111 #if !defined(CLD_VERSION) || CLD_VERSION==2
    112     case 2: {
    113       std::string utf8_text(UTF16ToUTF8(text));
    114       CLD2::Language language3[3];
    115       int percent3[3];
    116       CLD2::DetectLanguageSummary(
    117           utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,
    118           &text_bytes, &is_reliable);
    119       cld_language = language3[0];
    120       is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
    121           cld_language != CLD2::UNKNOWN_LANGUAGE &&
    122           cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
    123       break;
    124     }
    125 #endif
    126     default:
    127       NOTREACHED();
    128   }
    129 
    130   if (is_cld_reliable != NULL)
    131     *is_cld_reliable = is_reliable;
    132 
    133   // We don't trust the result if the CLD reports that the detection is not
    134   // reliable, or if the actual text used to detect the language was less than
    135   // 100 bytes (short texts can often lead to wrong results).
    136   // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
    137   // the determined language code is correct with 50% confidence. Chrome should
    138   // handle the real confidence value to judge.
    139   if (is_reliable && text_bytes >= 100 && is_valid_language) {
    140     // We should not use LanguageCode_ISO_639_1 because it does not cover all
    141     // the languages CLD can detect. As a result, it'll return the invalid
    142     // language code for tradtional Chinese among others.
    143     // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
    144     // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
    145     // for Simplified Chinese.
    146     switch (GetCLDMajorVersion()) {
    147 #if !defined(CLD_VERSION) || CLD_VERSION==1
    148       case 1:
    149         language =
    150             LanguageCodeWithDialects(static_cast<Language>(cld_language));
    151         break;
    152 #endif
    153 #if !defined(CLD_VERSION) || CLD_VERSION==2
    154       case 2:
    155         // (1) CLD2's LanguageCode returns general Chinese 'zh' for
    156         // CLD2::CHINESE, but Translate server doesn't accept it. This is
    157         // converted to 'zh-CN' in the same way as CLD1's
    158         // LanguageCodeWithDialects.
    159         //
    160         // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
    161         // CLD2::CHINESE_T. This is technically more precise for the language
    162         // code of traditional Chinese, while Translate server hasn't accepted
    163         // zh-Hant yet.
    164         if (cld_language == CLD2::CHINESE) {
    165           language = "zh-CN";
    166         } else if (cld_language == CLD2::CHINESE_T) {
    167           language = "zh-TW";
    168         } else {
    169           language =
    170               CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
    171         }
    172         break;
    173 #endif
    174       default:
    175         NOTREACHED();
    176     }
    177   }
    178   VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
    179           << "\n*************************************\n";
    180   return language;
    181 }
    182 
    183 // Checks if CLD can complement a sub code when the page language doesn't know
    184 // the sub code.
    185 bool CanCLDComplementSubCode(
    186     const std::string& page_language, const std::string& cld_language) {
    187   // Translate server cannot treat general Chinese. If Content-Language and
    188   // CLD agree that the language is Chinese and Content-Language doesn't know
    189   // which dialect is used, CLD language has priority.
    190   // TODO(hajimehoshi): How about the other dialects like zh-MO?
    191   return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
    192 }
    193 
    194 }  // namespace
    195 
    196 namespace translate {
    197 
    198 std::string DeterminePageLanguage(const std::string& code,
    199                                   const std::string& html_lang,
    200                                   const base::string16& contents,
    201                                   std::string* cld_language_p,
    202                                   bool* is_cld_reliable_p) {
    203   base::TimeTicks begin_time = base::TimeTicks::Now();
    204   bool is_cld_reliable;
    205   std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
    206   translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
    207 
    208   if (cld_language_p != NULL)
    209     *cld_language_p = cld_language;
    210   if (is_cld_reliable_p != NULL)
    211     *is_cld_reliable_p = is_cld_reliable;
    212   translate::ToTranslateLanguageSynonym(&cld_language);
    213 
    214   // Check if html lang attribute is valid.
    215   std::string modified_html_lang;
    216   if (!html_lang.empty()) {
    217     modified_html_lang = html_lang;
    218     ApplyLanguageCodeCorrection(&modified_html_lang);
    219     translate::ReportHtmlLang(html_lang, modified_html_lang);
    220     VLOG(9) << "html lang based language code: " << modified_html_lang;
    221   }
    222 
    223   // Check if Content-Language is valid.
    224   std::string modified_code;
    225   if (!code.empty()) {
    226     modified_code = code;
    227     ApplyLanguageCodeCorrection(&modified_code);
    228     translate::ReportContentLanguage(code, modified_code);
    229   }
    230 
    231   // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
    232   // |modified_code|.
    233   std::string language = modified_html_lang.empty() ? modified_code :
    234                                                       modified_html_lang;
    235 
    236   // If |language| is empty, just use CLD result even though it might be
    237   // translate::kUnknownLanguageCode.
    238   if (language.empty()) {
    239     translate::ReportLanguageVerification(
    240         translate::LANGUAGE_VERIFICATION_CLD_ONLY);
    241     return cld_language;
    242   }
    243 
    244   if (cld_language == kUnknownLanguageCode) {
    245     translate::ReportLanguageVerification(
    246         translate::LANGUAGE_VERIFICATION_UNKNOWN);
    247     return language;
    248   } else if (CanCLDComplementSubCode(language, cld_language)) {
    249     translate::ReportLanguageVerification(
    250         translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
    251     return cld_language;
    252   } else if (IsSameOrSimilarLanguages(language, cld_language)) {
    253     translate::ReportLanguageVerification(
    254         translate::LANGUAGE_VERIFICATION_CLD_AGREE);
    255     return language;
    256   } else if (MaybeServerWrongConfiguration(language, cld_language)) {
    257     translate::ReportLanguageVerification(
    258         translate::LANGUAGE_VERIFICATION_TRUST_CLD);
    259     return cld_language;
    260   } else {
    261     translate::ReportLanguageVerification(
    262         translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
    263     // Content-Language value might be wrong because CLD says that this page
    264     // is written in another language with confidence.
    265     // In this case, Chrome doesn't rely on any of the language codes, and
    266     // gives up suggesting a translation.
    267     return std::string(kUnknownLanguageCode);
    268   }
    269 
    270   return language;
    271 }
    272 
    273 void CorrectLanguageCodeTypo(std::string* code) {
    274   DCHECK(code);
    275 
    276   size_t coma_index = code->find(',');
    277   if (coma_index != std::string::npos) {
    278     // There are more than 1 language specified, just keep the first one.
    279     *code = code->substr(0, coma_index);
    280   }
    281   TrimWhitespaceASCII(*code, TRIM_ALL, code);
    282 
    283   // An underscore instead of a dash is a frequent mistake.
    284   size_t underscore_index = code->find('_');
    285   if (underscore_index != std::string::npos)
    286     (*code)[underscore_index] = '-';
    287 
    288   // Change everything up to a dash to lower-case and everything after to upper.
    289   size_t dash_index = code->find('-');
    290   if (dash_index != std::string::npos) {
    291     *code = StringToLowerASCII(code->substr(0, dash_index)) +
    292         StringToUpperASCII(code->substr(dash_index));
    293   } else {
    294     *code = StringToLowerASCII(*code);
    295   }
    296 }
    297 
    298 bool IsValidLanguageCode(const std::string& code) {
    299   // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
    300   // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
    301   std::vector<std::string> chunks;
    302   base::SplitString(code, '-', &chunks);
    303 
    304   if (chunks.size() < 1 || 2 < chunks.size())
    305     return false;
    306 
    307   const std::string& main_code = chunks[0];
    308 
    309   if (main_code.size() < 1 || 3 < main_code.size())
    310     return false;
    311 
    312   for (std::string::const_iterator it = main_code.begin();
    313        it != main_code.end(); ++it) {
    314     if (!IsAsciiAlpha(*it))
    315       return false;
    316   }
    317 
    318   if (chunks.size() == 1)
    319     return true;
    320 
    321   const std::string& sub_code = chunks[1];
    322 
    323   if (sub_code.size() != 2)
    324     return false;
    325 
    326   for (std::string::const_iterator it = sub_code.begin();
    327        it != sub_code.end(); ++it) {
    328     if (!IsAsciiAlpha(*it))
    329       return false;
    330   }
    331 
    332   return true;
    333 }
    334 
    335 bool IsSameOrSimilarLanguages(const std::string& page_language,
    336                               const std::string& cld_language) {
    337   std::vector<std::string> chunks;
    338 
    339   base::SplitString(page_language, '-', &chunks);
    340   if (chunks.size() == 0)
    341     return false;
    342   std::string page_language_main_part = chunks[0];
    343 
    344   base::SplitString(cld_language, '-', &chunks);
    345   if (chunks.size() == 0)
    346     return false;
    347   std::string cld_language_main_part = chunks[0];
    348 
    349   // Language code part of |page_language| is matched to one of |cld_language|.
    350   // Country code is ignored here.
    351   if (page_language_main_part == cld_language_main_part) {
    352     // Languages are matched strictly. Reports false to metrics, but returns
    353     // true.
    354     translate::ReportSimilarLanguageMatch(false);
    355     return true;
    356   }
    357 
    358   // Check if |page_language| and |cld_language| are in the similar language
    359   // list and belong to the same language group.
    360   int page_code = GetSimilarLanguageGroupCode(page_language);
    361   bool match = page_code != 0 &&
    362                page_code == GetSimilarLanguageGroupCode(cld_language);
    363 
    364   translate::ReportSimilarLanguageMatch(match);
    365   return match;
    366 }
    367 
    368 bool MaybeServerWrongConfiguration(const std::string& page_language,
    369                                    const std::string& cld_language) {
    370   // If |page_language| is not "en-*", respect it and just return false here.
    371   if (!StartsWithASCII(page_language, "en", false))
    372     return false;
    373 
    374   // A server provides a language meta information representing "en-*". But it
    375   // might be just a default value due to missing user configuration.
    376   // Let's trust |cld_language| if the determined language is not difficult to
    377   // distinguish from English, and the language is one of well-known languages
    378   // which often provide "en-*" meta information mistakenly.
    379   for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
    380     if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
    381       return true;
    382   }
    383   return false;
    384 }
    385 
    386 }  // namespace translate
    387