1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "util/utf8/unilib.h" 18 19 #include "util/base/logging.h" 20 #include "util/utf8/unicodetext.h" 21 #include "gmock/gmock.h" 22 #include "gtest/gtest.h" 23 24 namespace libtextclassifier2 { 25 namespace { 26 27 using ::testing::ElementsAre; 28 29 TEST(UniLibTest, CharacterClassesAscii) { 30 CREATE_UNILIB_FOR_TESTING; 31 EXPECT_TRUE(unilib.IsOpeningBracket('(')); 32 EXPECT_TRUE(unilib.IsClosingBracket(')')); 33 EXPECT_FALSE(unilib.IsWhitespace(')')); 34 EXPECT_TRUE(unilib.IsWhitespace(' ')); 35 EXPECT_FALSE(unilib.IsDigit(')')); 36 EXPECT_TRUE(unilib.IsDigit('0')); 37 EXPECT_TRUE(unilib.IsDigit('9')); 38 EXPECT_FALSE(unilib.IsUpper(')')); 39 EXPECT_TRUE(unilib.IsUpper('A')); 40 EXPECT_TRUE(unilib.IsUpper('Z')); 41 EXPECT_EQ(unilib.ToLower('A'), 'a'); 42 EXPECT_EQ(unilib.ToLower('Z'), 'z'); 43 EXPECT_EQ(unilib.ToLower(')'), ')'); 44 EXPECT_EQ(unilib.GetPairedBracket(')'), '('); 45 EXPECT_EQ(unilib.GetPairedBracket('}'), '{'); 46 } 47 48 #ifndef LIBTEXTCLASSIFIER_UNILIB_DUMMY 49 TEST(UniLibTest, CharacterClassesUnicode) { 50 CREATE_UNILIB_FOR_TESTING; 51 EXPECT_TRUE(unilib.IsOpeningBracket(0x0F3C)); // TIBET ANG KHANG GYON 52 EXPECT_TRUE(unilib.IsClosingBracket(0x0F3D)); // TIBET ANG KHANG GYAS 53 EXPECT_FALSE(unilib.IsWhitespace(0x23F0)); // ALARM CLOCK 54 EXPECT_TRUE(unilib.IsWhitespace(0x2003)); // EM SPACE 55 EXPECT_FALSE(unilib.IsDigit(0xA619)); // VAI SYMBOL JONG 56 EXPECT_TRUE(unilib.IsDigit(0xA620)); // VAI DIGIT ZERO 57 EXPECT_TRUE(unilib.IsDigit(0xA629)); // VAI DIGIT NINE 58 EXPECT_FALSE(unilib.IsDigit(0xA62A)); // VAI SYLLABLE NDOLE MA 59 EXPECT_FALSE(unilib.IsUpper(0x0211)); // SMALL R WITH DOUBLE GRAVE 60 EXPECT_TRUE(unilib.IsUpper(0x0212)); // CAPITAL R WITH DOUBLE GRAVE 61 EXPECT_TRUE(unilib.IsUpper(0x0391)); // GREEK CAPITAL ALPHA 62 EXPECT_TRUE(unilib.IsUpper(0x03AB)); // GREEK CAPITAL UPSILON W DIAL 63 EXPECT_FALSE(unilib.IsUpper(0x03AC)); // GREEK SMALL ALPHA WITH TONOS 64 EXPECT_EQ(unilib.ToLower(0x0391), 0x03B1); // GREEK ALPHA 65 EXPECT_EQ(unilib.ToLower(0x03AB), 0x03CB); // GREEK UPSILON WITH DIALYTIKA 66 EXPECT_EQ(unilib.ToLower(0x03C0), 0x03C0); // GREEK SMALL PI 67 68 EXPECT_EQ(unilib.GetPairedBracket(0x0F3C), 0x0F3D); 69 EXPECT_EQ(unilib.GetPairedBracket(0x0F3D), 0x0F3C); 70 } 71 #endif // ndef LIBTEXTCLASSIFIER_UNILIB_DUMMY 72 73 TEST(UniLibTest, RegexInterface) { 74 CREATE_UNILIB_FOR_TESTING; 75 const UnicodeText regex_pattern = 76 UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true); 77 std::unique_ptr<UniLib::RegexPattern> pattern = 78 unilib.CreateRegexPattern(regex_pattern); 79 const UnicodeText input = UTF8ToUnicodeText("hello 0123", /*do_copy=*/false); 80 int status; 81 std::unique_ptr<UniLib::RegexMatcher> matcher = pattern->Matcher(input); 82 TC_LOG(INFO) << matcher->Matches(&status); 83 TC_LOG(INFO) << matcher->Find(&status); 84 TC_LOG(INFO) << matcher->Start(0, &status); 85 TC_LOG(INFO) << matcher->End(0, &status); 86 TC_LOG(INFO) << matcher->Group(0, &status).size_codepoints(); 87 } 88 89 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU 90 TEST(UniLibTest, Regex) { 91 CREATE_UNILIB_FOR_TESTING; 92 93 // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to 94 // test the regex functionality with it to verify we are handling the indices 95 // correctly. 96 const UnicodeText regex_pattern = 97 UTF8ToUnicodeText("[0-9]+", /*do_copy=*/false); 98 std::unique_ptr<UniLib::RegexPattern> pattern = 99 unilib.CreateRegexPattern(regex_pattern); 100 int status; 101 std::unique_ptr<UniLib::RegexMatcher> matcher; 102 103 matcher = pattern->Matcher(UTF8ToUnicodeText("0123", /*do_copy=*/false)); 104 EXPECT_TRUE(matcher->Matches(&status)); 105 EXPECT_TRUE(matcher->ApproximatelyMatches(&status)); 106 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 107 EXPECT_TRUE(matcher->Matches(&status)); // Check that the state is reset. 108 EXPECT_TRUE(matcher->ApproximatelyMatches(&status)); 109 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 110 111 matcher = pattern->Matcher( 112 UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false)); 113 EXPECT_FALSE(matcher->Matches(&status)); 114 EXPECT_FALSE(matcher->ApproximatelyMatches(&status)); 115 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 116 117 matcher = pattern->Matcher( 118 UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false)); 119 EXPECT_TRUE(matcher->Find(&status)); 120 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 121 EXPECT_EQ(matcher->Start(0, &status), 8); 122 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 123 EXPECT_EQ(matcher->End(0, &status), 13); 124 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 125 EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123"); 126 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 127 } 128 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU 129 130 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU 131 TEST(UniLibTest, RegexGroups) { 132 CREATE_UNILIB_FOR_TESTING; 133 134 // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to 135 // test the regex functionality with it to verify we are handling the indices 136 // correctly. 137 const UnicodeText regex_pattern = UTF8ToUnicodeText( 138 "(?<group1>[0-9])(?<group2>[0-9]+)", /*do_copy=*/false); 139 std::unique_ptr<UniLib::RegexPattern> pattern = 140 unilib.CreateRegexPattern(regex_pattern); 141 int status; 142 std::unique_ptr<UniLib::RegexMatcher> matcher; 143 144 matcher = pattern->Matcher( 145 UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false)); 146 EXPECT_TRUE(matcher->Find(&status)); 147 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 148 EXPECT_EQ(matcher->Start(0, &status), 8); 149 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 150 EXPECT_EQ(matcher->Start(1, &status), 8); 151 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 152 EXPECT_EQ(matcher->Start(2, &status), 9); 153 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 154 EXPECT_EQ(matcher->End(0, &status), 13); 155 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 156 EXPECT_EQ(matcher->End(1, &status), 9); 157 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 158 EXPECT_EQ(matcher->End(2, &status), 12); 159 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 160 EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123"); 161 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 162 EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0"); 163 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 164 EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123"); 165 EXPECT_EQ(status, UniLib::RegexMatcher::kNoError); 166 } 167 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU 168 169 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU 170 171 TEST(UniLibTest, BreakIterator) { 172 CREATE_UNILIB_FOR_TESTING; 173 const UnicodeText text = UTF8ToUnicodeText("some text", /*do_copy=*/false); 174 std::unique_ptr<UniLib::BreakIterator> iterator = 175 unilib.CreateBreakIterator(text); 176 std::vector<int> break_indices; 177 int break_index = 0; 178 while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) { 179 break_indices.push_back(break_index); 180 } 181 EXPECT_THAT(break_indices, ElementsAre(4, 5, 9)); 182 } 183 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU 184 185 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU 186 TEST(UniLibTest, BreakIterator4ByteUTF8) { 187 CREATE_UNILIB_FOR_TESTING; 188 const UnicodeText text = UTF8ToUnicodeText("", /*do_copy=*/false); 189 std::unique_ptr<UniLib::BreakIterator> iterator = 190 unilib.CreateBreakIterator(text); 191 std::vector<int> break_indices; 192 int break_index = 0; 193 while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) { 194 break_indices.push_back(break_index); 195 } 196 EXPECT_THAT(break_indices, ElementsAre(1, 2, 3)); 197 } 198 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU 199 200 #ifndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU 201 TEST(UniLibTest, IntegerParse) { 202 CREATE_UNILIB_FOR_TESTING; 203 int result; 204 EXPECT_TRUE( 205 unilib.ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false), &result)); 206 EXPECT_EQ(result, 123); 207 } 208 #endif // ndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU 209 210 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU 211 TEST(UniLibTest, IntegerParseFullWidth) { 212 CREATE_UNILIB_FOR_TESTING; 213 int result; 214 // The input string here is full width 215 EXPECT_TRUE(unilib.ParseInt32(UTF8ToUnicodeText("", /*do_copy=*/false), 216 &result)); 217 EXPECT_EQ(result, 123); 218 } 219 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU 220 221 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU 222 TEST(UniLibTest, IntegerParseFullWidthWithAlpha) { 223 CREATE_UNILIB_FOR_TESTING; 224 int result; 225 // The input string here is full width 226 EXPECT_FALSE(unilib.ParseInt32(UTF8ToUnicodeText("a", /*do_copy=*/false), 227 &result)); 228 } 229 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU 230 231 } // namespace 232 } // namespace libtextclassifier2 233