Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "utils/utf8/unilib_test-include.h"
     18 
     19 #include "gmock/gmock.h"
     20 
     21 namespace libtextclassifier3 {
     22 namespace test_internal {
     23 
     24 using ::testing::ElementsAre;
     25 
     26 TEST_F(UniLibTest, CharacterClassesAscii) {
     27   EXPECT_TRUE(unilib_.IsOpeningBracket('('));
     28   EXPECT_TRUE(unilib_.IsClosingBracket(')'));
     29   EXPECT_FALSE(unilib_.IsWhitespace(')'));
     30   EXPECT_TRUE(unilib_.IsWhitespace(' '));
     31   EXPECT_FALSE(unilib_.IsDigit(')'));
     32   EXPECT_TRUE(unilib_.IsDigit('0'));
     33   EXPECT_TRUE(unilib_.IsDigit('9'));
     34   EXPECT_FALSE(unilib_.IsUpper(')'));
     35   EXPECT_TRUE(unilib_.IsUpper('A'));
     36   EXPECT_TRUE(unilib_.IsUpper('Z'));
     37   EXPECT_EQ(unilib_.ToLower('A'), 'a');
     38   EXPECT_EQ(unilib_.ToLower('Z'), 'z');
     39   EXPECT_EQ(unilib_.ToLower(')'), ')');
     40   EXPECT_EQ(unilib_.GetPairedBracket(')'), '(');
     41   EXPECT_EQ(unilib_.GetPairedBracket('}'), '{');
     42 }
     43 
     44 TEST_F(UniLibTest, CharacterClassesUnicode) {
     45   EXPECT_TRUE(unilib_.IsOpeningBracket(0x0F3C));  // TIBET ANG KHANG GYON
     46   EXPECT_TRUE(unilib_.IsClosingBracket(0x0F3D));  // TIBET ANG KHANG GYAS
     47   EXPECT_FALSE(unilib_.IsWhitespace(0x23F0));     // ALARM CLOCK
     48   EXPECT_TRUE(unilib_.IsWhitespace(0x2003));      // EM SPACE
     49   EXPECT_FALSE(unilib_.IsDigit(0xA619));          // VAI SYMBOL JONG
     50   EXPECT_TRUE(unilib_.IsDigit(0xA620));           // VAI DIGIT ZERO
     51   EXPECT_TRUE(unilib_.IsDigit(0xA629));           // VAI DIGIT NINE
     52   EXPECT_FALSE(unilib_.IsDigit(0xA62A));          // VAI SYLLABLE NDOLE MA
     53   EXPECT_FALSE(unilib_.IsUpper(0x0211));          // SMALL R WITH DOUBLE GRAVE
     54   EXPECT_TRUE(unilib_.IsUpper(0x0212));           // CAPITAL R WITH DOUBLE GRAVE
     55   EXPECT_TRUE(unilib_.IsUpper(0x0391));           // GREEK CAPITAL ALPHA
     56   EXPECT_TRUE(unilib_.IsUpper(0x03AB));        // GREEK CAPITAL UPSILON W DIAL
     57   EXPECT_FALSE(unilib_.IsUpper(0x03AC));       // GREEK SMALL ALPHA WITH TONOS
     58   EXPECT_EQ(unilib_.ToLower(0x0391), 0x03B1);  // GREEK ALPHA
     59   EXPECT_EQ(unilib_.ToLower(0x03AB), 0x03CB);  // GREEK UPSILON WITH DIALYTIKA
     60   EXPECT_EQ(unilib_.ToLower(0x03C0), 0x03C0);  // GREEK SMALL PI
     61 
     62   EXPECT_EQ(unilib_.GetPairedBracket(0x0F3C), 0x0F3D);
     63   EXPECT_EQ(unilib_.GetPairedBracket(0x0F3D), 0x0F3C);
     64 }
     65 
     66 TEST_F(UniLibTest, RegexInterface) {
     67   const UnicodeText regex_pattern =
     68       UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true);
     69   std::unique_ptr<UniLib::RegexPattern> pattern =
     70       unilib_.CreateRegexPattern(regex_pattern);
     71   const UnicodeText input = UTF8ToUnicodeText("hello 0123", /*do_copy=*/false);
     72   int status;
     73   std::unique_ptr<UniLib::RegexMatcher> matcher = pattern->Matcher(input);
     74   TC3_LOG(INFO) << matcher->Matches(&status);
     75   TC3_LOG(INFO) << matcher->Find(&status);
     76   TC3_LOG(INFO) << matcher->Start(0, &status);
     77   TC3_LOG(INFO) << matcher->End(0, &status);
     78   TC3_LOG(INFO) << matcher->Group(0, &status).size_codepoints();
     79 }
     80 
     81 TEST_F(UniLibTest, Regex) {
     82   // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
     83   // test the regex functionality with it to verify we are handling the indices
     84   // correctly.
     85   const UnicodeText regex_pattern =
     86       UTF8ToUnicodeText("[0-9]+", /*do_copy=*/false);
     87   std::unique_ptr<UniLib::RegexPattern> pattern =
     88       unilib_.CreateRegexPattern(regex_pattern);
     89   int status;
     90   std::unique_ptr<UniLib::RegexMatcher> matcher;
     91 
     92   matcher = pattern->Matcher(UTF8ToUnicodeText("0123", /*do_copy=*/false));
     93   EXPECT_TRUE(matcher->Matches(&status));
     94   EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
     95   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
     96   EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
     97   EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
     98   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
     99 
    100   matcher = pattern->Matcher(
    101       UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false));
    102   EXPECT_FALSE(matcher->Matches(&status));
    103   EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
    104   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    105 
    106   matcher = pattern->Matcher(
    107       UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false));
    108   EXPECT_TRUE(matcher->Find(&status));
    109   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    110   EXPECT_EQ(matcher->Start(0, &status), 8);
    111   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    112   EXPECT_EQ(matcher->End(0, &status), 13);
    113   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    114   EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123");
    115   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    116 }
    117 
    118 TEST_F(UniLibTest, RegexLazy) {
    119   std::unique_ptr<UniLib::RegexPattern> pattern =
    120       unilib_.CreateLazyRegexPattern(
    121           UTF8ToUnicodeText("[a-z][0-9]", /*do_copy=*/false));
    122   int status;
    123   std::unique_ptr<UniLib::RegexMatcher> matcher;
    124 
    125   matcher = pattern->Matcher(UTF8ToUnicodeText("a3", /*do_copy=*/false));
    126   EXPECT_TRUE(matcher->Matches(&status));
    127   EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
    128   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    129   EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
    130   EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
    131   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    132 
    133   matcher = pattern->Matcher(UTF8ToUnicodeText("3a", /*do_copy=*/false));
    134   EXPECT_FALSE(matcher->Matches(&status));
    135   EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
    136   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    137 }
    138 
    139 TEST_F(UniLibTest, RegexGroups) {
    140   // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
    141   // test the regex functionality with it to verify we are handling the indices
    142   // correctly.
    143   const UnicodeText regex_pattern =
    144       UTF8ToUnicodeText("([0-9])([0-9]+)", /*do_copy=*/false);
    145   std::unique_ptr<UniLib::RegexPattern> pattern =
    146       unilib_.CreateRegexPattern(regex_pattern);
    147   int status;
    148   std::unique_ptr<UniLib::RegexMatcher> matcher;
    149 
    150   matcher = pattern->Matcher(
    151       UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false));
    152   EXPECT_TRUE(matcher->Find(&status));
    153   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    154   EXPECT_EQ(matcher->Start(0, &status), 8);
    155   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    156   EXPECT_EQ(matcher->Start(1, &status), 8);
    157   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    158   EXPECT_EQ(matcher->Start(2, &status), 9);
    159   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    160   EXPECT_EQ(matcher->End(0, &status), 13);
    161   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    162   EXPECT_EQ(matcher->End(1, &status), 9);
    163   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    164   EXPECT_EQ(matcher->End(2, &status), 12);
    165   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    166   EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123");
    167   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    168   EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0");
    169   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    170   EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
    171   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    172 }
    173 
    174 TEST_F(UniLibTest, BreakIterator) {
    175   const UnicodeText text = UTF8ToUnicodeText("some text", /*do_copy=*/false);
    176   std::unique_ptr<UniLib::BreakIterator> iterator =
    177       unilib_.CreateBreakIterator(text);
    178   std::vector<int> break_indices;
    179   int break_index = 0;
    180   while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    181     break_indices.push_back(break_index);
    182   }
    183   EXPECT_THAT(break_indices, ElementsAre(4, 5, 9));
    184 }
    185 
    186 TEST_F(UniLibTest, BreakIterator4ByteUTF8) {
    187   const UnicodeText text = UTF8ToUnicodeText("", /*do_copy=*/false);
    188   std::unique_ptr<UniLib::BreakIterator> iterator =
    189       unilib_.CreateBreakIterator(text);
    190   std::vector<int> break_indices;
    191   int break_index = 0;
    192   while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    193     break_indices.push_back(break_index);
    194   }
    195   EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
    196 }
    197 
    198 TEST_F(UniLibTest, IntegerParse) {
    199   int result;
    200   EXPECT_TRUE(
    201       unilib_.ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false), &result));
    202   EXPECT_EQ(result, 123);
    203 }
    204 
    205 TEST_F(UniLibTest, IntegerParseFullWidth) {
    206   int result;
    207   // The input string here is full width
    208   EXPECT_TRUE(unilib_.ParseInt32(UTF8ToUnicodeText("", /*do_copy=*/false),
    209                                  &result));
    210   EXPECT_EQ(result, 123);
    211 }
    212 
    213 TEST_F(UniLibTest, IntegerParseFullWidthWithAlpha) {
    214   int result;
    215   // The input string here is full width
    216   EXPECT_FALSE(unilib_.ParseInt32(UTF8ToUnicodeText("a", /*do_copy=*/false),
    217                                   &result));
    218 }
    219 
    220 }  // namespace test_internal
    221 }  // namespace libtextclassifier3
    222