1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <fstream> 18 #include <string> 19 20 #include "gmock/gmock.h" 21 #include "gtest/gtest.h" 22 23 #include "utils/sentencepiece/double_array_trie.h" 24 #include "utils/sentencepiece/normalizer.h" 25 #include "utils/sentencepiece/test_utils.h" 26 #include "utils/strings/stringpiece.h" 27 28 namespace libtextclassifier3 { 29 namespace { 30 31 std::string GetTestConfigPath() { 32 return ""; 33 } 34 35 TEST(NormalizerTest, NormalizesAsReferenceNormalizer) { 36 std::ifstream test_config_stream(GetTestConfigPath()); 37 std::string config((std::istreambuf_iterator<char>(test_config_stream)), 38 (std::istreambuf_iterator<char>())); 39 SentencePieceNormalizer normalizer = 40 NormalizerFromSpec(config, /*add_dummy_prefix=*/true, 41 /*remove_extra_whitespaces=*/true, 42 /*escape_whitespaces=*/true); 43 { 44 std::string normalized; 45 EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); 46 EXPECT_EQ(normalized, "hellothere"); 47 } 48 49 // Redundant whitespace. 50 { 51 std::string normalized; 52 EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); 53 EXPECT_EQ(normalized, "whenistheworldcup?"); 54 } 55 56 // Different whitespace. 57 { 58 std::string normalized; 59 EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); 60 EXPECT_EQ(normalized, "generalkenobi"); 61 } 62 63 // NFKC char to multi-char normalization. 64 { 65 std::string normalized; 66 EXPECT_TRUE(normalizer.Normalize("", &normalized)); 67 EXPECT_EQ(normalized, ""); 68 } 69 70 // Half width katakana, character composition happens. 71 { 72 std::string normalized; 73 EXPECT_TRUE(normalizer.Normalize(" ", &normalized)); 74 EXPECT_EQ(normalized, ""); 75 } 76 77 // NFKC char to char normalization. 78 { 79 std::string normalized; 80 EXPECT_TRUE(normalizer.Normalize("", &normalized)); 81 EXPECT_EQ(normalized, "123"); 82 } 83 } 84 85 TEST(NormalizerTest, NoDummyPrefix) { 86 std::ifstream test_config_stream(GetTestConfigPath()); 87 std::string config((std::istreambuf_iterator<char>(test_config_stream)), 88 (std::istreambuf_iterator<char>())); 89 SentencePieceNormalizer normalizer = 90 NormalizerFromSpec(config, /*add_dummy_prefix=*/false, 91 /*remove_extra_whitespaces=*/true, 92 /*escape_whitespaces=*/true); 93 94 // NFKC char to char normalization. 95 { 96 std::string normalized; 97 EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); 98 EXPECT_EQ(normalized, "hellothere"); 99 } 100 101 // Redundant whitespace. 102 { 103 std::string normalized; 104 EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); 105 EXPECT_EQ(normalized, "whenistheworldcup?"); 106 } 107 108 // Different whitespace. 109 { 110 std::string normalized; 111 EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); 112 EXPECT_EQ(normalized, "generalkenobi"); 113 } 114 115 // NFKC char to multi-char normalization. 116 { 117 std::string normalized; 118 EXPECT_TRUE(normalizer.Normalize("", &normalized)); 119 EXPECT_EQ(normalized, ""); 120 } 121 122 // Half width katakana, character composition happens. 123 { 124 std::string normalized; 125 EXPECT_TRUE(normalizer.Normalize(" ", &normalized)); 126 EXPECT_EQ(normalized, ""); 127 } 128 129 // NFKC char to char normalization. 130 { 131 std::string normalized; 132 EXPECT_TRUE(normalizer.Normalize("", &normalized)); 133 EXPECT_EQ(normalized, "123"); 134 } 135 } 136 137 TEST(NormalizerTest, NoRemoveExtraWhitespace) { 138 std::ifstream test_config_stream(GetTestConfigPath()); 139 std::string config((std::istreambuf_iterator<char>(test_config_stream)), 140 (std::istreambuf_iterator<char>())); 141 SentencePieceNormalizer normalizer = 142 NormalizerFromSpec(config, /*add_dummy_prefix=*/false, 143 /*remove_extra_whitespaces=*/false, 144 /*escape_whitespaces=*/true); 145 146 { 147 std::string normalized; 148 EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); 149 EXPECT_EQ(normalized, "hellothere"); 150 } 151 152 // Redundant whitespace. 153 { 154 std::string normalized; 155 EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); 156 EXPECT_EQ(normalized, "whenistheworldcup?"); 157 } 158 159 // Different whitespace. 160 { 161 std::string normalized; 162 EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); 163 EXPECT_EQ(normalized, "generalkenobi"); 164 } 165 } 166 167 TEST(NormalizerTest, NoEscapeWhitespaces) { 168 std::ifstream test_config_stream(GetTestConfigPath()); 169 std::string config((std::istreambuf_iterator<char>(test_config_stream)), 170 (std::istreambuf_iterator<char>())); 171 SentencePieceNormalizer normalizer = 172 NormalizerFromSpec(config, /*add_dummy_prefix=*/false, 173 /*remove_extra_whitespaces=*/false, 174 /*escape_whitespaces=*/false); 175 176 { 177 std::string normalized; 178 EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); 179 EXPECT_EQ(normalized, "hello there"); 180 } 181 182 // Redundant whitespace. 183 { 184 std::string normalized; 185 EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); 186 EXPECT_EQ(normalized, "when is the world cup?"); 187 } 188 189 // Different whitespace. 190 { 191 std::string normalized; 192 EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); 193 EXPECT_EQ(normalized, "general kenobi"); 194 } 195 } 196 197 } // namespace 198 } // namespace libtextclassifier3 199