Home | History | Annotate | Download | only in sentencepiece
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <fstream>
     18 #include <string>
     19 
     20 #include "gmock/gmock.h"
     21 #include "gtest/gtest.h"
     22 
     23 #include "utils/sentencepiece/double_array_trie.h"
     24 #include "utils/sentencepiece/normalizer.h"
     25 #include "utils/sentencepiece/test_utils.h"
     26 #include "utils/strings/stringpiece.h"
     27 
     28 namespace libtextclassifier3 {
     29 namespace {
     30 
     31 std::string GetTestConfigPath() {
     32   return "";
     33 }
     34 
     35 TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
     36   std::ifstream test_config_stream(GetTestConfigPath());
     37   std::string config((std::istreambuf_iterator<char>(test_config_stream)),
     38                      (std::istreambuf_iterator<char>()));
     39   SentencePieceNormalizer normalizer =
     40       NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
     41                          /*remove_extra_whitespaces=*/true,
     42                          /*escape_whitespaces=*/true);
     43   {
     44     std::string normalized;
     45     EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
     46     EXPECT_EQ(normalized, "hellothere");
     47   }
     48 
     49   // Redundant whitespace.
     50   {
     51     std::string normalized;
     52     EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
     53     EXPECT_EQ(normalized, "whenistheworldcup?");
     54   }
     55 
     56   // Different whitespace.
     57   {
     58     std::string normalized;
     59     EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
     60     EXPECT_EQ(normalized, "generalkenobi");
     61   }
     62 
     63   // NFKC char to multi-char normalization.
     64   {
     65     std::string normalized;
     66     EXPECT_TRUE(normalizer.Normalize("", &normalized));
     67     EXPECT_EQ(normalized, "");
     68   }
     69 
     70   // Half width katakana, character composition happens.
     71   {
     72     std::string normalized;
     73     EXPECT_TRUE(normalizer.Normalize("  ", &normalized));
     74     EXPECT_EQ(normalized, "");
     75   }
     76 
     77   // NFKC char to char normalization.
     78   {
     79     std::string normalized;
     80     EXPECT_TRUE(normalizer.Normalize("", &normalized));
     81     EXPECT_EQ(normalized, "123");
     82   }
     83 }
     84 
     85 TEST(NormalizerTest, NoDummyPrefix) {
     86   std::ifstream test_config_stream(GetTestConfigPath());
     87   std::string config((std::istreambuf_iterator<char>(test_config_stream)),
     88                      (std::istreambuf_iterator<char>()));
     89   SentencePieceNormalizer normalizer =
     90       NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
     91                          /*remove_extra_whitespaces=*/true,
     92                          /*escape_whitespaces=*/true);
     93 
     94   // NFKC char to char normalization.
     95   {
     96     std::string normalized;
     97     EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
     98     EXPECT_EQ(normalized, "hellothere");
     99   }
    100 
    101   // Redundant whitespace.
    102   {
    103     std::string normalized;
    104     EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    105     EXPECT_EQ(normalized, "whenistheworldcup?");
    106   }
    107 
    108   // Different whitespace.
    109   {
    110     std::string normalized;
    111     EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    112     EXPECT_EQ(normalized, "generalkenobi");
    113   }
    114 
    115   // NFKC char to multi-char normalization.
    116   {
    117     std::string normalized;
    118     EXPECT_TRUE(normalizer.Normalize("", &normalized));
    119     EXPECT_EQ(normalized, "");
    120   }
    121 
    122   // Half width katakana, character composition happens.
    123   {
    124     std::string normalized;
    125     EXPECT_TRUE(normalizer.Normalize("  ", &normalized));
    126     EXPECT_EQ(normalized, "");
    127   }
    128 
    129   // NFKC char to char normalization.
    130   {
    131     std::string normalized;
    132     EXPECT_TRUE(normalizer.Normalize("", &normalized));
    133     EXPECT_EQ(normalized, "123");
    134   }
    135 }
    136 
    137 TEST(NormalizerTest, NoRemoveExtraWhitespace) {
    138   std::ifstream test_config_stream(GetTestConfigPath());
    139   std::string config((std::istreambuf_iterator<char>(test_config_stream)),
    140                      (std::istreambuf_iterator<char>()));
    141   SentencePieceNormalizer normalizer =
    142       NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
    143                          /*remove_extra_whitespaces=*/false,
    144                          /*escape_whitespaces=*/true);
    145 
    146   {
    147     std::string normalized;
    148     EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    149     EXPECT_EQ(normalized, "hellothere");
    150   }
    151 
    152   // Redundant whitespace.
    153   {
    154     std::string normalized;
    155     EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    156     EXPECT_EQ(normalized, "whenistheworldcup?");
    157   }
    158 
    159   // Different whitespace.
    160   {
    161     std::string normalized;
    162     EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    163     EXPECT_EQ(normalized, "generalkenobi");
    164   }
    165 }
    166 
    167 TEST(NormalizerTest, NoEscapeWhitespaces) {
    168   std::ifstream test_config_stream(GetTestConfigPath());
    169   std::string config((std::istreambuf_iterator<char>(test_config_stream)),
    170                      (std::istreambuf_iterator<char>()));
    171   SentencePieceNormalizer normalizer =
    172       NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
    173                          /*remove_extra_whitespaces=*/false,
    174                          /*escape_whitespaces=*/false);
    175 
    176   {
    177     std::string normalized;
    178     EXPECT_TRUE(normalizer.Normalize("hello there", &normalized));
    179     EXPECT_EQ(normalized, "hello there");
    180   }
    181 
    182   // Redundant whitespace.
    183   {
    184     std::string normalized;
    185     EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &normalized));
    186     EXPECT_EQ(normalized, "when is  the  world cup?");
    187   }
    188 
    189   // Different whitespace.
    190   {
    191     std::string normalized;
    192     EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized));
    193     EXPECT_EQ(normalized, "general kenobi");
    194   }
    195 }
    196 
    197 }  // namespace
    198 }  // namespace libtextclassifier3
    199