Home | History | Annotate | Download | only in utf8
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "util/utf8/unilib.h"
     18 
     19 #include "util/base/logging.h"
     20 #include "util/utf8/unicodetext.h"
     21 #include "gmock/gmock.h"
     22 #include "gtest/gtest.h"
     23 
     24 namespace libtextclassifier2 {
     25 namespace {
     26 
     27 using ::testing::ElementsAre;
     28 
     29 TEST(UniLibTest, CharacterClassesAscii) {
     30   CREATE_UNILIB_FOR_TESTING;
     31   EXPECT_TRUE(unilib.IsOpeningBracket('('));
     32   EXPECT_TRUE(unilib.IsClosingBracket(')'));
     33   EXPECT_FALSE(unilib.IsWhitespace(')'));
     34   EXPECT_TRUE(unilib.IsWhitespace(' '));
     35   EXPECT_FALSE(unilib.IsDigit(')'));
     36   EXPECT_TRUE(unilib.IsDigit('0'));
     37   EXPECT_TRUE(unilib.IsDigit('9'));
     38   EXPECT_FALSE(unilib.IsUpper(')'));
     39   EXPECT_TRUE(unilib.IsUpper('A'));
     40   EXPECT_TRUE(unilib.IsUpper('Z'));
     41   EXPECT_EQ(unilib.ToLower('A'), 'a');
     42   EXPECT_EQ(unilib.ToLower('Z'), 'z');
     43   EXPECT_EQ(unilib.ToLower(')'), ')');
     44   EXPECT_EQ(unilib.GetPairedBracket(')'), '(');
     45   EXPECT_EQ(unilib.GetPairedBracket('}'), '{');
     46 }
     47 
     48 #ifndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
     49 TEST(UniLibTest, CharacterClassesUnicode) {
     50   CREATE_UNILIB_FOR_TESTING;
     51   EXPECT_TRUE(unilib.IsOpeningBracket(0x0F3C));  // TIBET ANG KHANG GYON
     52   EXPECT_TRUE(unilib.IsClosingBracket(0x0F3D));  // TIBET ANG KHANG GYAS
     53   EXPECT_FALSE(unilib.IsWhitespace(0x23F0));     // ALARM CLOCK
     54   EXPECT_TRUE(unilib.IsWhitespace(0x2003));      // EM SPACE
     55   EXPECT_FALSE(unilib.IsDigit(0xA619));          // VAI SYMBOL JONG
     56   EXPECT_TRUE(unilib.IsDigit(0xA620));           // VAI DIGIT ZERO
     57   EXPECT_TRUE(unilib.IsDigit(0xA629));           // VAI DIGIT NINE
     58   EXPECT_FALSE(unilib.IsDigit(0xA62A));          // VAI SYLLABLE NDOLE MA
     59   EXPECT_FALSE(unilib.IsUpper(0x0211));          // SMALL R WITH DOUBLE GRAVE
     60   EXPECT_TRUE(unilib.IsUpper(0x0212));           // CAPITAL R WITH DOUBLE GRAVE
     61   EXPECT_TRUE(unilib.IsUpper(0x0391));           // GREEK CAPITAL ALPHA
     62   EXPECT_TRUE(unilib.IsUpper(0x03AB));           // GREEK CAPITAL UPSILON W DIAL
     63   EXPECT_FALSE(unilib.IsUpper(0x03AC));          // GREEK SMALL ALPHA WITH TONOS
     64   EXPECT_EQ(unilib.ToLower(0x0391), 0x03B1);     // GREEK ALPHA
     65   EXPECT_EQ(unilib.ToLower(0x03AB), 0x03CB);     // GREEK UPSILON WITH DIALYTIKA
     66   EXPECT_EQ(unilib.ToLower(0x03C0), 0x03C0);     // GREEK SMALL PI
     67 
     68   EXPECT_EQ(unilib.GetPairedBracket(0x0F3C), 0x0F3D);
     69   EXPECT_EQ(unilib.GetPairedBracket(0x0F3D), 0x0F3C);
     70 }
     71 #endif  // ndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
     72 
     73 TEST(UniLibTest, RegexInterface) {
     74   CREATE_UNILIB_FOR_TESTING;
     75   const UnicodeText regex_pattern =
     76       UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true);
     77   std::unique_ptr<UniLib::RegexPattern> pattern =
     78       unilib.CreateRegexPattern(regex_pattern);
     79   const UnicodeText input = UTF8ToUnicodeText("hello 0123", /*do_copy=*/false);
     80   int status;
     81   std::unique_ptr<UniLib::RegexMatcher> matcher = pattern->Matcher(input);
     82   TC_LOG(INFO) << matcher->Matches(&status);
     83   TC_LOG(INFO) << matcher->Find(&status);
     84   TC_LOG(INFO) << matcher->Start(0, &status);
     85   TC_LOG(INFO) << matcher->End(0, &status);
     86   TC_LOG(INFO) << matcher->Group(0, &status).size_codepoints();
     87 }
     88 
     89 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
     90 TEST(UniLibTest, Regex) {
     91   CREATE_UNILIB_FOR_TESTING;
     92 
     93   // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
     94   // test the regex functionality with it to verify we are handling the indices
     95   // correctly.
     96   const UnicodeText regex_pattern =
     97       UTF8ToUnicodeText("[0-9]+", /*do_copy=*/false);
     98   std::unique_ptr<UniLib::RegexPattern> pattern =
     99       unilib.CreateRegexPattern(regex_pattern);
    100   int status;
    101   std::unique_ptr<UniLib::RegexMatcher> matcher;
    102 
    103   matcher = pattern->Matcher(UTF8ToUnicodeText("0123", /*do_copy=*/false));
    104   EXPECT_TRUE(matcher->Matches(&status));
    105   EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
    106   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    107   EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
    108   EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
    109   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    110 
    111   matcher = pattern->Matcher(
    112       UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false));
    113   EXPECT_FALSE(matcher->Matches(&status));
    114   EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
    115   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    116 
    117   matcher = pattern->Matcher(
    118       UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false));
    119   EXPECT_TRUE(matcher->Find(&status));
    120   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    121   EXPECT_EQ(matcher->Start(0, &status), 8);
    122   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    123   EXPECT_EQ(matcher->End(0, &status), 13);
    124   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    125   EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123");
    126   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    127 }
    128 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
    129 
    130 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
    131 TEST(UniLibTest, RegexGroups) {
    132   CREATE_UNILIB_FOR_TESTING;
    133 
    134   // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
    135   // test the regex functionality with it to verify we are handling the indices
    136   // correctly.
    137   const UnicodeText regex_pattern = UTF8ToUnicodeText(
    138       "(?<group1>[0-9])(?<group2>[0-9]+)", /*do_copy=*/false);
    139   std::unique_ptr<UniLib::RegexPattern> pattern =
    140       unilib.CreateRegexPattern(regex_pattern);
    141   int status;
    142   std::unique_ptr<UniLib::RegexMatcher> matcher;
    143 
    144   matcher = pattern->Matcher(
    145       UTF8ToUnicodeText("hello 0123 world", /*do_copy=*/false));
    146   EXPECT_TRUE(matcher->Find(&status));
    147   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    148   EXPECT_EQ(matcher->Start(0, &status), 8);
    149   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    150   EXPECT_EQ(matcher->Start(1, &status), 8);
    151   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    152   EXPECT_EQ(matcher->Start(2, &status), 9);
    153   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    154   EXPECT_EQ(matcher->End(0, &status), 13);
    155   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    156   EXPECT_EQ(matcher->End(1, &status), 9);
    157   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    158   EXPECT_EQ(matcher->End(2, &status), 12);
    159   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    160   EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123");
    161   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    162   EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0");
    163   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    164   EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
    165   EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
    166 }
    167 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
    168 
    169 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
    170 
    171 TEST(UniLibTest, BreakIterator) {
    172   CREATE_UNILIB_FOR_TESTING;
    173   const UnicodeText text = UTF8ToUnicodeText("some text", /*do_copy=*/false);
    174   std::unique_ptr<UniLib::BreakIterator> iterator =
    175       unilib.CreateBreakIterator(text);
    176   std::vector<int> break_indices;
    177   int break_index = 0;
    178   while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    179     break_indices.push_back(break_index);
    180   }
    181   EXPECT_THAT(break_indices, ElementsAre(4, 5, 9));
    182 }
    183 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
    184 
    185 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
    186 TEST(UniLibTest, BreakIterator4ByteUTF8) {
    187   CREATE_UNILIB_FOR_TESTING;
    188   const UnicodeText text = UTF8ToUnicodeText("", /*do_copy=*/false);
    189   std::unique_ptr<UniLib::BreakIterator> iterator =
    190       unilib.CreateBreakIterator(text);
    191   std::vector<int> break_indices;
    192   int break_index = 0;
    193   while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    194     break_indices.push_back(break_index);
    195   }
    196   EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
    197 }
    198 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
    199 
    200 #ifndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
    201 TEST(UniLibTest, IntegerParse) {
    202   CREATE_UNILIB_FOR_TESTING;
    203   int result;
    204   EXPECT_TRUE(
    205       unilib.ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false), &result));
    206   EXPECT_EQ(result, 123);
    207 }
    208 #endif  // ndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
    209 
    210 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
    211 TEST(UniLibTest, IntegerParseFullWidth) {
    212   CREATE_UNILIB_FOR_TESTING;
    213   int result;
    214   // The input string here is full width
    215   EXPECT_TRUE(unilib.ParseInt32(UTF8ToUnicodeText("", /*do_copy=*/false),
    216                                 &result));
    217   EXPECT_EQ(result, 123);
    218 }
    219 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
    220 
    221 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
    222 TEST(UniLibTest, IntegerParseFullWidthWithAlpha) {
    223   CREATE_UNILIB_FOR_TESTING;
    224   int result;
    225   // The input string here is full width
    226   EXPECT_FALSE(unilib.ParseInt32(UTF8ToUnicodeText("a", /*do_copy=*/false),
    227                                  &result));
    228 }
    229 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
    230 
    231 }  // namespace
    232 }  // namespace libtextclassifier2
    233