1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/common/translate/language_detection_util.h" 6 7 #include "base/strings/string16.h" 8 #include "base/strings/utf_string_conversions.h" 9 #include "chrome/common/chrome_constants.h" 10 #include "testing/gtest/include/gtest/gtest.h" 11 12 typedef testing::Test LanguageDetectionUtilTest; 13 14 // Tests that well-known language code typos are fixed. 15 TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) { 16 std::string language; 17 18 // Strip the second and later codes. 19 language = std::string("ja,en"); 20 LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); 21 EXPECT_EQ("ja", language); 22 23 // Replace dash with hyphen. 24 language = std::string("ja_JP"); 25 LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); 26 EXPECT_EQ("ja-JP", language); 27 28 // Correct wrong cases. 29 language = std::string("JA-jp"); 30 LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); 31 EXPECT_EQ("ja-JP", language); 32 } 33 34 // Tests if the language codes' format is invalid. 35 TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) { 36 std::string language; 37 38 language = std::string("ja"); 39 EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 40 41 language = std::string("ja-JP"); 42 EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 43 44 language = std::string("ceb"); 45 EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 46 47 language = std::string("ceb-XX"); 48 EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); 49 50 // Invalid because the sub code consists of a number. 51 language = std::string("utf-8"); 52 EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); 53 54 // Invalid because of six characters after hyphen. 55 language = std::string("ja-YUKARI"); 56 EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); 57 58 // Invalid because of four characters. 59 language = std::string("DHMO"); 60 EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); 61 } 62 63 // Tests that similar language table works. 64 TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) { 65 EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "en")); 66 EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "ja")); 67 EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hr")); 68 EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("sr-ME", "sr")); 69 EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("ne", "hi")); 70 EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hi")); 71 } 72 73 // Tests that well-known languages which often have wrong server configuration 74 // are handles. 75 TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) { 76 EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", "ja")); 77 EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en-US", 78 "ja")); 79 EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", 80 "zh-CN")); 81 EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("ja", 82 "en")); 83 EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", 84 "he")); 85 } 86 87 // Tests that the language meta tag providing wrong information is ignored by 88 // LanguageDetectionUtil due to disagreement between meta tag and CLD. 89 TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) { 90 base::string16 contents = ASCIIToUTF16( 91 "<html><head><meta http-equiv='Content-Language' content='ja'></head>" 92 "<body>This is a page apparently written in English. Even though " 93 "content-language is provided, the value will be ignored if the value " 94 "is suspicious.</body></html>"); 95 std::string cld_language; 96 bool is_cld_reliable; 97 std::string language = LanguageDetectionUtil::DeterminePageLanguage( 98 std::string("ja"), std::string(), contents, &cld_language, 99 &is_cld_reliable); 100 EXPECT_EQ(chrome::kUnknownLanguageCode, language); 101 EXPECT_EQ("en", cld_language); 102 EXPECT_TRUE(is_cld_reliable); 103 } 104 105 // Tests that the language meta tag providing "en-US" style information is 106 // agreed by CLD. 107 TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) { 108 base::string16 contents = ASCIIToUTF16( 109 "<html><head><meta http-equiv='Content-Language' content='en-US'></head>" 110 "<body>This is a page apparently written in English. Even though " 111 "content-language is provided, the value will be ignored if the value " 112 "is suspicious.</body></html>"); 113 std::string cld_language; 114 bool is_cld_reliable; 115 std::string language = LanguageDetectionUtil::DeterminePageLanguage( 116 std::string("en-US"), std::string(), contents, &cld_language, 117 &is_cld_reliable); 118 EXPECT_EQ("en-US", language); 119 EXPECT_EQ("en", cld_language); 120 EXPECT_TRUE(is_cld_reliable); 121 } 122 123 // Tests that the language meta tag providing wrong information is ignored and 124 // CLD's language will be adopted by LanguageDetectionUtil due to an invalid 125 // meta tag. 126 TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) { 127 base::string16 contents = ASCIIToUTF16( 128 "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>" 129 "<body>This is a page apparently written in English. Even though " 130 "content-language is provided, the value will be ignored and CLD's" 131 " language will be adopted if the value is invalid.</body></html>"); 132 std::string cld_language; 133 bool is_cld_reliable; 134 std::string language = LanguageDetectionUtil::DeterminePageLanguage( 135 std::string("utf-8"), std::string(), contents, &cld_language, 136 &is_cld_reliable); 137 EXPECT_EQ("en", language); 138 EXPECT_EQ("en", cld_language); 139 EXPECT_TRUE(is_cld_reliable); 140 } 141 142 // Tests that the language meta tag providing wrong information is ignored 143 // because of valid html lang attribute. 144 TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) { 145 base::string16 contents = ASCIIToUTF16( 146 "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>" 147 "</head><body>This is a page apparently written in English. Even though " 148 "content-language is provided, the value will be ignored if the value " 149 "is suspicious.</body></html>"); 150 std::string cld_language; 151 bool is_cld_reliable; 152 std::string language = LanguageDetectionUtil::DeterminePageLanguage( 153 std::string("ja"), std::string("en"), contents, &cld_language, 154 &is_cld_reliable); 155 EXPECT_EQ("en", language); 156 EXPECT_EQ("en", cld_language); 157 EXPECT_TRUE(is_cld_reliable); 158 } 159