1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "lang_id/lang-id.h" 18 19 #include <memory> 20 #include <string> 21 #include <utility> 22 #include <vector> 23 24 #include "base.h" 25 #include "util/base/logging.h" 26 #include "gtest/gtest.h" 27 28 namespace libtextclassifier { 29 namespace nlp_core { 30 namespace lang_id { 31 32 namespace { 33 34 std::string GetModelPath() { 35 return TEST_DATA_DIR "langid.model"; 36 } 37 38 // Creates a LangId with default model. Passes ownership to 39 // the caller. 40 LangId *CreateLanguageDetector() { return new LangId(GetModelPath()); } 41 42 } // namespace 43 44 TEST(LangIdTest, Normal) { 45 std::unique_ptr<LangId> lang_id(CreateLanguageDetector()); 46 47 EXPECT_EQ("en", lang_id->FindLanguage("This text is written in English.")); 48 EXPECT_EQ("en", 49 lang_id->FindLanguage("This text is written in English. ")); 50 EXPECT_EQ("en", 51 lang_id->FindLanguage(" This text is written in English. ")); 52 EXPECT_EQ("fr", lang_id->FindLanguage("Vive la France! Vive la France!")); 53 EXPECT_EQ("ro", lang_id->FindLanguage("Sunt foarte foarte foarte fericit!")); 54 } 55 56 // Test that for very small queries, we return the default language and a low 57 // confidence score. 58 TEST(LangIdTest, SuperSmallQueries) { 59 std::unique_ptr<LangId> lang_id(CreateLanguageDetector()); 60 61 // Use a default language different from any real language: to be sure the 62 // result is the default language, not a language that happens to be the 63 // default language. 64 const std::string kDefaultLanguage = "dflt-lng"; 65 lang_id->SetDefaultLanguage(kDefaultLanguage); 66 67 // Test the simple FindLanguage() method: that method returns a single 68 // language. 69 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("y")); 70 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("j")); 71 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("l")); 72 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("w")); 73 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("z")); 74 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("zulu")); 75 76 // Test the more complex FindLanguages() method: that method returns a vector 77 // of (language, confidence_score) pairs. 78 std::vector<std::pair<std::string, float>> languages; 79 languages = lang_id->FindLanguages("y"); 80 EXPECT_EQ(1, languages.size()); 81 EXPECT_EQ(kDefaultLanguage, languages[0].first); 82 EXPECT_GT(0.01f, languages[0].second); 83 84 languages = lang_id->FindLanguages("Todoist"); 85 EXPECT_EQ(1, languages.size()); 86 EXPECT_EQ(kDefaultLanguage, languages[0].first); 87 EXPECT_GT(0.01f, languages[0].second); 88 89 // A few tests with a default language that is a real language code. 90 const std::string kJapanese = "ja"; 91 lang_id->SetDefaultLanguage(kJapanese); 92 EXPECT_EQ(kJapanese, lang_id->FindLanguage("y")); 93 EXPECT_EQ(kJapanese, lang_id->FindLanguage("j")); 94 EXPECT_EQ(kJapanese, lang_id->FindLanguage("l")); 95 languages = lang_id->FindLanguages("y"); 96 EXPECT_EQ(1, languages.size()); 97 EXPECT_EQ(kJapanese, languages[0].first); 98 EXPECT_GT(0.01f, languages[0].second); 99 100 // Make sure the min text size limit is applied to the number of real 101 // characters (e.g., without spaces and punctuation chars, which don't 102 // influence language identification). 103 const std::string kWhitespaces = " \t \n \t\t\t\n \t"; 104 const std::string kPunctuation = "... ?!!--- -%%^...-"; 105 std::string still_small_string = kWhitespaces + "y" + kWhitespaces + 106 kPunctuation + kWhitespaces + kPunctuation + 107 kPunctuation; 108 EXPECT_LE(100, still_small_string.size()); 109 lang_id->SetDefaultLanguage(kDefaultLanguage); 110 EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage(still_small_string)); 111 languages = lang_id->FindLanguages(still_small_string); 112 EXPECT_EQ(1, languages.size()); 113 EXPECT_EQ(kDefaultLanguage, languages[0].first); 114 EXPECT_GT(0.01f, languages[0].second); 115 } 116 117 namespace { 118 void CheckPredictionForGibberishStrings(const std::string &default_language) { 119 static const char *const kGibberish[] = { 120 "", 121 " ", 122 " ", 123 " ___ ", 124 "123 456 789", 125 "><> (-_-) <><", 126 nullptr, 127 }; 128 129 std::unique_ptr<LangId> lang_id(CreateLanguageDetector()); 130 TC_LOG(INFO) << "Default language: " << default_language; 131 lang_id->SetDefaultLanguage(default_language); 132 for (int i = 0; true; ++i) { 133 const char *gibberish = kGibberish[i]; 134 if (gibberish == nullptr) { 135 break; 136 } 137 const std::string predicted_language = lang_id->FindLanguage(gibberish); 138 TC_LOG(INFO) << "Predicted " << predicted_language << " for \"" << gibberish 139 << "\""; 140 EXPECT_EQ(default_language, predicted_language); 141 } 142 } 143 } // namespace 144 145 TEST(LangIdTest, CornerCases) { 146 CheckPredictionForGibberishStrings("en"); 147 CheckPredictionForGibberishStrings("ro"); 148 CheckPredictionForGibberishStrings("fr"); 149 } 150 151 } // namespace lang_id 152 } // namespace nlp_core 153 } // namespace libtextclassifier 154