Home | History | Annotate | Download | only in tests
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "lang_id/lang-id.h"
     18 
     19 #include <memory>
     20 #include <string>
     21 #include <utility>
     22 #include <vector>
     23 
     24 #include "base.h"
     25 #include "util/base/logging.h"
     26 #include "gtest/gtest.h"
     27 
     28 namespace libtextclassifier {
     29 namespace nlp_core {
     30 namespace lang_id {
     31 
     32 namespace {
     33 
     34 std::string GetModelPath() {
     35   return TEST_DATA_DIR "langid.model";
     36 }
     37 
     38 // Creates a LangId with default model.  Passes ownership to
     39 // the caller.
     40 LangId *CreateLanguageDetector() { return new LangId(GetModelPath()); }
     41 
     42 }  // namespace
     43 
     44 TEST(LangIdTest, Normal) {
     45   std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
     46 
     47   EXPECT_EQ("en", lang_id->FindLanguage("This text is written in English."));
     48   EXPECT_EQ("en",
     49             lang_id->FindLanguage("This text   is written in   English.  "));
     50   EXPECT_EQ("en",
     51             lang_id->FindLanguage("  This text is written in English.  "));
     52   EXPECT_EQ("fr", lang_id->FindLanguage("Vive la France!  Vive la France!"));
     53   EXPECT_EQ("ro", lang_id->FindLanguage("Sunt foarte foarte foarte fericit!"));
     54 }
     55 
     56 // Test that for very small queries, we return the default language and a low
     57 // confidence score.
     58 TEST(LangIdTest, SuperSmallQueries) {
     59   std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
     60 
     61   // Use a default language different from any real language: to be sure the
     62   // result is the default language, not a language that happens to be the
     63   // default language.
     64   const std::string kDefaultLanguage = "dflt-lng";
     65   lang_id->SetDefaultLanguage(kDefaultLanguage);
     66 
     67   // Test the simple FindLanguage() method: that method returns a single
     68   // language.
     69   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("y"));
     70   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("j"));
     71   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("l"));
     72   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("w"));
     73   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("z"));
     74   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("zulu"));
     75 
     76   // Test the more complex FindLanguages() method: that method returns a vector
     77   // of (language, confidence_score) pairs.
     78   std::vector<std::pair<std::string, float>> languages;
     79   languages = lang_id->FindLanguages("y");
     80   EXPECT_EQ(1, languages.size());
     81   EXPECT_EQ(kDefaultLanguage, languages[0].first);
     82   EXPECT_GT(0.01f, languages[0].second);
     83 
     84   languages = lang_id->FindLanguages("Todoist");
     85   EXPECT_EQ(1, languages.size());
     86   EXPECT_EQ(kDefaultLanguage, languages[0].first);
     87   EXPECT_GT(0.01f, languages[0].second);
     88 
     89   // A few tests with a default language that is a real language code.
     90   const std::string kJapanese = "ja";
     91   lang_id->SetDefaultLanguage(kJapanese);
     92   EXPECT_EQ(kJapanese, lang_id->FindLanguage("y"));
     93   EXPECT_EQ(kJapanese, lang_id->FindLanguage("j"));
     94   EXPECT_EQ(kJapanese, lang_id->FindLanguage("l"));
     95   languages = lang_id->FindLanguages("y");
     96   EXPECT_EQ(1, languages.size());
     97   EXPECT_EQ(kJapanese, languages[0].first);
     98   EXPECT_GT(0.01f, languages[0].second);
     99 
    100   // Make sure the min text size limit is applied to the number of real
    101   // characters (e.g., without spaces and punctuation chars, which don't
    102   // influence language identification).
    103   const std::string kWhitespaces = "   \t   \n   \t\t\t\n    \t";
    104   const std::string kPunctuation = "... ?!!--- -%%^...-";
    105   std::string still_small_string = kWhitespaces + "y" + kWhitespaces +
    106                                    kPunctuation + kWhitespaces + kPunctuation +
    107                                    kPunctuation;
    108   EXPECT_LE(100, still_small_string.size());
    109   lang_id->SetDefaultLanguage(kDefaultLanguage);
    110   EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage(still_small_string));
    111   languages = lang_id->FindLanguages(still_small_string);
    112   EXPECT_EQ(1, languages.size());
    113   EXPECT_EQ(kDefaultLanguage, languages[0].first);
    114   EXPECT_GT(0.01f, languages[0].second);
    115 }
    116 
    117 namespace {
    118 void CheckPredictionForGibberishStrings(const std::string &default_language) {
    119   static const char *const kGibberish[] = {
    120     "",
    121     " ",
    122     "       ",
    123     "  ___  ",
    124     "123 456 789",
    125     "><> (-_-) <><",
    126     nullptr,
    127   };
    128 
    129   std::unique_ptr<LangId> lang_id(CreateLanguageDetector());
    130   TC_LOG(INFO) << "Default language: " << default_language;
    131   lang_id->SetDefaultLanguage(default_language);
    132   for (int i = 0; true; ++i) {
    133     const char *gibberish = kGibberish[i];
    134     if (gibberish == nullptr) {
    135       break;
    136     }
    137     const std::string predicted_language = lang_id->FindLanguage(gibberish);
    138     TC_LOG(INFO) << "Predicted " << predicted_language << " for \"" << gibberish
    139                  << "\"";
    140     EXPECT_EQ(default_language, predicted_language);
    141   }
    142 }
    143 }  // namespace
    144 
    145 TEST(LangIdTest, CornerCases) {
    146   CheckPredictionForGibberishStrings("en");
    147   CheckPredictionForGibberishStrings("ro");
    148   CheckPredictionForGibberishStrings("fr");
    149 }
    150 
    151 }  // namespace lang_id
    152 }  // namespace nlp_core
    153 }  // namespace libtextclassifier
    154