Home | History | Annotate | Download | only in tests
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "smartselect/feature-processor.h"
     18 
     19 #include "gmock/gmock.h"
     20 #include "gtest/gtest.h"
     21 
     22 namespace libtextclassifier {
     23 namespace {
     24 
     25 using testing::ElementsAreArray;
     26 using testing::FloatEq;
     27 
     28 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
     29   std::vector<Token> tokens{Token("Hll", 0, 5),
     30                             Token("fba@google.com", 6, 23),
     31                             Token("hee!", 24, 29)};
     32 
     33   internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
     34 
     35   // clang-format off
     36   EXPECT_THAT(tokens, ElementsAreArray(
     37                           {Token("Hll", 0, 5),
     38                            Token("f", 6, 9),
     39                            Token("ba", 9, 12),
     40                            Token("@google.com", 12, 23),
     41                            Token("hee!", 24, 29)}));
     42   // clang-format on
     43 }
     44 
     45 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
     46   std::vector<Token> tokens{Token("Hll", 0, 5),
     47                             Token("fba@google.com", 6, 23),
     48                             Token("hee!", 24, 29)};
     49 
     50   internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
     51 
     52   // clang-format off
     53   EXPECT_THAT(tokens, ElementsAreArray(
     54                           {Token("Hll", 0, 5),
     55                            Token("fba", 6, 12),
     56                            Token("@google.com", 12, 23),
     57                            Token("hee!", 24, 29)}));
     58   // clang-format on
     59 }
     60 
     61 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
     62   std::vector<Token> tokens{Token("Hll", 0, 5),
     63                             Token("fba@google.com", 6, 23),
     64                             Token("hee!", 24, 29)};
     65 
     66   internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
     67 
     68   // clang-format off
     69   EXPECT_THAT(tokens, ElementsAreArray(
     70                           {Token("Hll", 0, 5),
     71                            Token("f", 6, 9),
     72                            Token("ba@google.com", 9, 23),
     73                            Token("hee!", 24, 29)}));
     74   // clang-format on
     75 }
     76 
     77 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
     78   std::vector<Token> tokens{Token("Hll", 0, 5),
     79                             Token("fba@google.com", 6, 23),
     80                             Token("hee!", 24, 29)};
     81 
     82   internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
     83 
     84   // clang-format off
     85   EXPECT_THAT(tokens, ElementsAreArray(
     86                           {Token("Hll", 0, 5),
     87                            Token("fba@google.com", 6, 23),
     88                            Token("hee!", 24, 29)}));
     89   // clang-format on
     90 }
     91 
     92 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
     93   std::vector<Token> tokens{Token("Hll", 0, 5),
     94                             Token("fba@google.com", 6, 23),
     95                             Token("hee!", 24, 29)};
     96 
     97   internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
     98 
     99   // clang-format off
    100   EXPECT_THAT(tokens, ElementsAreArray(
    101                           {Token("H", 0, 2),
    102                            Token("ll", 2, 5),
    103                            Token("f", 6, 9),
    104                            Token("ba@google.com", 9, 23),
    105                            Token("hee!", 24, 29)}));
    106   // clang-format on
    107 }
    108 
    109 TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
    110   const std::string context = "Fist Lin\nScond Lin\nThid Lin";
    111   const CodepointSpan span = {0, 5};
    112   // clang-format off
    113   std::vector<Token> tokens = {Token("Fist", 0, 5),
    114                                Token("Lin", 6, 10),
    115                                Token("Scond", 11, 17),
    116                                Token("Lin", 18, 22),
    117                                Token("Thid", 23, 28),
    118                                Token("Lin", 29, 33)};
    119   // clang-format on
    120 
    121   // Keeps the first line.
    122   internal::StripTokensFromOtherLines(context, span, &tokens);
    123   EXPECT_THAT(tokens,
    124               ElementsAreArray({Token("Fist", 0, 5), Token("Lin", 6, 10)}));
    125 }
    126 
    127 TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
    128   const std::string context = "Fist Lin\nScond Lin\nThid Lin";
    129   const CodepointSpan span = {18, 22};
    130   // clang-format off
    131   std::vector<Token> tokens = {Token("Fist", 0, 5),
    132                                Token("Lin", 6, 10),
    133                                Token("Scond", 11, 17),
    134                                Token("Lin", 18, 22),
    135                                Token("Thid", 23, 28),
    136                                Token("Lin", 29, 33)};
    137   // clang-format on
    138 
    139   // Keeps the first line.
    140   internal::StripTokensFromOtherLines(context, span, &tokens);
    141   EXPECT_THAT(tokens, ElementsAreArray(
    142                           {Token("Scond", 11, 17), Token("Lin", 18, 22)}));
    143 }
    144 
    145 TEST(FeatureProcessorTest, KeepLineWithClickThird) {
    146   const std::string context = "Fist Lin\nScond Lin\nThid Lin";
    147   const CodepointSpan span = {24, 33};
    148   // clang-format off
    149   std::vector<Token> tokens = {Token("Fist", 0, 5),
    150                                Token("Lin", 6, 10),
    151                                Token("Scond", 11, 17),
    152                                Token("Lin", 18, 22),
    153                                Token("Thid", 23, 28),
    154                                Token("Lin", 29, 33)};
    155   // clang-format on
    156 
    157   // Keeps the first line.
    158   internal::StripTokensFromOtherLines(context, span, &tokens);
    159   EXPECT_THAT(tokens, ElementsAreArray(
    160                           {Token("Thid", 23, 28), Token("Lin", 29, 33)}));
    161 }
    162 
    163 TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
    164   const std::string context = "Fist Lin|Scond Lin\nThid Lin";
    165   const CodepointSpan span = {18, 22};
    166   // clang-format off
    167   std::vector<Token> tokens = {Token("Fist", 0, 5),
    168                                Token("Lin", 6, 10),
    169                                Token("Scond", 11, 17),
    170                                Token("Lin", 18, 22),
    171                                Token("Thid", 23, 28),
    172                                Token("Lin", 29, 33)};
    173   // clang-format on
    174 
    175   // Keeps the first line.
    176   internal::StripTokensFromOtherLines(context, span, &tokens);
    177   EXPECT_THAT(tokens, ElementsAreArray(
    178                           {Token("Scond", 11, 17), Token("Lin", 18, 22)}));
    179 }
    180 
    181 TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
    182   const std::string context = "Fist Lin\nScond Lin\nThid Lin";
    183   const CodepointSpan span = {5, 23};
    184   // clang-format off
    185   std::vector<Token> tokens = {Token("Fist", 0, 5),
    186                                Token("Lin", 6, 10),
    187                                Token("Scond", 18, 23),
    188                                Token("Lin", 19, 23),
    189                                Token("Thid", 23, 28),
    190                                Token("Lin", 29, 33)};
    191   // clang-format on
    192 
    193   // Keeps the first line.
    194   internal::StripTokensFromOtherLines(context, span, &tokens);
    195   EXPECT_THAT(tokens, ElementsAreArray(
    196                           {Token("Fist", 0, 5), Token("Lin", 6, 10),
    197                            Token("Scond", 18, 23), Token("Lin", 19, 23),
    198                            Token("Thid", 23, 28), Token("Lin", 29, 33)}));
    199 }
    200 
    201 class TestingFeatureProcessor : public FeatureProcessor {
    202  public:
    203   using FeatureProcessor::FeatureProcessor;
    204   using FeatureProcessor::SpanToLabel;
    205   using FeatureProcessor::SupportedCodepointsRatio;
    206   using FeatureProcessor::IsCodepointInRanges;
    207   using FeatureProcessor::ICUTokenize;
    208   using FeatureProcessor::supported_codepoint_ranges_;
    209 };
    210 
    211 TEST(FeatureProcessorTest, SpanToLabel) {
    212   FeatureProcessorOptions options;
    213   options.set_context_size(1);
    214   options.set_max_selection_span(1);
    215   options.set_snap_label_span_boundaries_to_containing_tokens(false);
    216 
    217   TokenizationCodepointRange* config =
    218       options.add_tokenization_codepoint_config();
    219   config->set_start(32);
    220   config->set_end(33);
    221   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
    222 
    223   TestingFeatureProcessor feature_processor(options);
    224   std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
    225   ASSERT_EQ(3, tokens.size());
    226   int label;
    227   ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
    228   EXPECT_EQ(kInvalidLabel, label);
    229   ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
    230   EXPECT_NE(kInvalidLabel, label);
    231   TokenSpan token_span;
    232   feature_processor.LabelToTokenSpan(label, &token_span);
    233   EXPECT_EQ(0, token_span.first);
    234   EXPECT_EQ(0, token_span.second);
    235 
    236   // Reconfigure with snapping enabled.
    237   options.set_snap_label_span_boundaries_to_containing_tokens(true);
    238   TestingFeatureProcessor feature_processor2(options);
    239   int label2;
    240   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
    241   EXPECT_EQ(label, label2);
    242   ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
    243   EXPECT_EQ(label, label2);
    244   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
    245   EXPECT_EQ(label, label2);
    246 
    247   // Cross a token boundary.
    248   ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
    249   EXPECT_EQ(kInvalidLabel, label2);
    250   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
    251   EXPECT_EQ(kInvalidLabel, label2);
    252 
    253   // Multiple tokens.
    254   options.set_context_size(2);
    255   options.set_max_selection_span(2);
    256   TestingFeatureProcessor feature_processor3(options);
    257   tokens = feature_processor3.Tokenize("zero, one, two, three, four");
    258   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
    259   EXPECT_NE(kInvalidLabel, label2);
    260   feature_processor3.LabelToTokenSpan(label2, &token_span);
    261   EXPECT_EQ(1, token_span.first);
    262   EXPECT_EQ(0, token_span.second);
    263 
    264   int label3;
    265   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
    266   EXPECT_EQ(label2, label3);
    267   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
    268   EXPECT_EQ(label2, label3);
    269   ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
    270   EXPECT_EQ(label2, label3);
    271 }
    272 
    273 TEST(FeatureProcessorTest, CenterTokenFromClick) {
    274   int token_index;
    275 
    276   // Exactly aligned indices.
    277   token_index = internal::CenterTokenFromClick(
    278       {6, 11},
    279       {Token("Hll", 0, 5), Token("world", 6, 11), Token("hee!", 12, 17)});
    280   EXPECT_EQ(token_index, 1);
    281 
    282   // Click is contained in a token.
    283   token_index = internal::CenterTokenFromClick(
    284       {13, 17},
    285       {Token("Hll", 0, 5), Token("world", 6, 11), Token("hee!", 12, 17)});
    286   EXPECT_EQ(token_index, 2);
    287 
    288   // Click spans two tokens.
    289   token_index = internal::CenterTokenFromClick(
    290       {6, 17},
    291       {Token("Hll", 0, 5), Token("world", 6, 11), Token("hee!", 12, 17)});
    292   EXPECT_EQ(token_index, kInvalidIndex);
    293 }
    294 
    295 TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
    296   int token_index;
    297 
    298   // Selection of length 3. Exactly aligned indices.
    299   token_index = internal::CenterTokenFromMiddleOfSelection(
    300       {7, 27},
    301       {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
    302        Token("Token4", 21, 27), Token("Token5", 28, 34)});
    303   EXPECT_EQ(token_index, 2);
    304 
    305   // Selection of length 1 token. Exactly aligned indices.
    306   token_index = internal::CenterTokenFromMiddleOfSelection(
    307       {21, 27},
    308       {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
    309        Token("Token4", 21, 27), Token("Token5", 28, 34)});
    310   EXPECT_EQ(token_index, 3);
    311 
    312   // Selection marks sub-token range, with no tokens in it.
    313   token_index = internal::CenterTokenFromMiddleOfSelection(
    314       {29, 33},
    315       {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
    316        Token("Token4", 21, 27), Token("Token5", 28, 34)});
    317   EXPECT_EQ(token_index, kInvalidIndex);
    318 
    319   // Selection of length 2. Sub-token indices.
    320   token_index = internal::CenterTokenFromMiddleOfSelection(
    321       {3, 25},
    322       {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
    323        Token("Token4", 21, 27), Token("Token5", 28, 34)});
    324   EXPECT_EQ(token_index, 1);
    325 
    326   // Selection of length 1. Sub-token indices.
    327   token_index = internal::CenterTokenFromMiddleOfSelection(
    328       {22, 34},
    329       {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
    330        Token("Token4", 21, 27), Token("Token5", 28, 34)});
    331   EXPECT_EQ(token_index, 4);
    332 
    333   // Some invalid ones.
    334   token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
    335   EXPECT_EQ(token_index, -1);
    336 }
    337 
    338 TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
    339   FeatureProcessorOptions options;
    340   options.set_context_size(2);
    341   options.set_max_selection_span(2);
    342   options.set_snap_label_span_boundaries_to_containing_tokens(false);
    343 
    344   TokenizationCodepointRange* config =
    345       options.add_tokenization_codepoint_config();
    346   config->set_start(32);
    347   config->set_end(33);
    348   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
    349 
    350   FeatureProcessorOptions::CodepointRange* range;
    351   range = options.add_supported_codepoint_ranges();
    352   range->set_start(0);
    353   range->set_end(128);
    354 
    355   range = options.add_supported_codepoint_ranges();
    356   range->set_start(10000);
    357   range->set_end(10001);
    358 
    359   range = options.add_supported_codepoint_ranges();
    360   range->set_start(20000);
    361   range->set_end(30000);
    362 
    363   TestingFeatureProcessor feature_processor(options);
    364   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
    365                   1, feature_processor.Tokenize("aaa bbb ccc")),
    366               FloatEq(1.0));
    367   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
    368                   1, feature_processor.Tokenize("aaa bbb ")),
    369               FloatEq(2.0 / 3));
    370   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
    371                   1, feature_processor.Tokenize("  ")),
    372               FloatEq(0.0));
    373   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
    374       -1, feature_processor.supported_codepoint_ranges_));
    375   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
    376       0, feature_processor.supported_codepoint_ranges_));
    377   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
    378       10, feature_processor.supported_codepoint_ranges_));
    379   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
    380       127, feature_processor.supported_codepoint_ranges_));
    381   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
    382       128, feature_processor.supported_codepoint_ranges_));
    383   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
    384       9999, feature_processor.supported_codepoint_ranges_));
    385   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
    386       10000, feature_processor.supported_codepoint_ranges_));
    387   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
    388       10001, feature_processor.supported_codepoint_ranges_));
    389   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
    390       25000, feature_processor.supported_codepoint_ranges_));
    391 
    392   std::vector<Token> tokens;
    393   int click_pos;
    394   std::vector<float> extra_features;
    395   std::unique_ptr<CachedFeatures> cached_features;
    396 
    397   auto feature_fn = [](const std::vector<int>& sparse_features,
    398                        const std::vector<float>& dense_features,
    399                        float* embedding) { return true; };
    400 
    401   options.set_min_supported_codepoint_ratio(0.0);
    402   TestingFeatureProcessor feature_processor2(options);
    403   EXPECT_TRUE(feature_processor2.ExtractFeatures("  eee", {4, 7}, {0, 0},
    404                                                  feature_fn, 2, &tokens,
    405                                                  &click_pos, &cached_features));
    406 
    407   options.set_min_supported_codepoint_ratio(0.2);
    408   TestingFeatureProcessor feature_processor3(options);
    409   EXPECT_TRUE(feature_processor3.ExtractFeatures("  eee", {4, 7}, {0, 0},
    410                                                  feature_fn, 2, &tokens,
    411                                                  &click_pos, &cached_features));
    412 
    413   options.set_min_supported_codepoint_ratio(0.5);
    414   TestingFeatureProcessor feature_processor4(options);
    415   EXPECT_FALSE(feature_processor4.ExtractFeatures(
    416       "  eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
    417       &cached_features));
    418 }
    419 
    420 TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
    421   std::vector<Token> tokens_orig{
    422       Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
    423       Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
    424       Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
    425       Token("12", 0, 0)};
    426 
    427   std::vector<Token> tokens;
    428   int click_index;
    429 
    430   // Try to click first token and see if it gets padded from left.
    431   tokens = tokens_orig;
    432   click_index = 0;
    433   internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    434   // clang-format off
    435   EXPECT_EQ(tokens, std::vector<Token>({Token(),
    436                                         Token(),
    437                                         Token("0", 0, 0),
    438                                         Token("1", 0, 0),
    439                                         Token("2", 0, 0)}));
    440   // clang-format on
    441   EXPECT_EQ(click_index, 2);
    442 
    443   // When we click the second token nothing should get padded.
    444   tokens = tokens_orig;
    445   click_index = 2;
    446   internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    447   // clang-format off
    448   EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
    449                                         Token("1", 0, 0),
    450                                         Token("2", 0, 0),
    451                                         Token("3", 0, 0),
    452                                         Token("4", 0, 0)}));
    453   // clang-format on
    454   EXPECT_EQ(click_index, 2);
    455 
    456   // When we click the last token tokens should get padded from the right.
    457   tokens = tokens_orig;
    458   click_index = 12;
    459   internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    460   // clang-format off
    461   EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
    462                                         Token("11", 0, 0),
    463                                         Token("12", 0, 0),
    464                                         Token(),
    465                                         Token()}));
    466   // clang-format on
    467   EXPECT_EQ(click_index, 2);
    468 }
    469 
    470 TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
    471   std::vector<Token> tokens_orig{
    472       Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
    473       Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
    474       Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
    475       Token("12", 0, 0)};
    476 
    477   std::vector<Token> tokens;
    478   int click_index;
    479 
    480   // Try to click first token and see if it gets padded from left to maximum
    481   // context_size.
    482   tokens = tokens_orig;
    483   click_index = 0;
    484   internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
    485   // clang-format off
    486   EXPECT_EQ(tokens, std::vector<Token>({Token(),
    487                                         Token(),
    488                                         Token("0", 0, 0),
    489                                         Token("1", 0, 0),
    490                                         Token("2", 0, 0),
    491                                         Token("3", 0, 0),
    492                                         Token("4", 0, 0),
    493                                         Token("5", 0, 0)}));
    494   // clang-format on
    495   EXPECT_EQ(click_index, 2);
    496 
    497   // Clicking to the middle with enough context should not produce any padding.
    498   tokens = tokens_orig;
    499   click_index = 6;
    500   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
    501   // clang-format off
    502   EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
    503                                         Token("2", 0, 0),
    504                                         Token("3", 0, 0),
    505                                         Token("4", 0, 0),
    506                                         Token("5", 0, 0),
    507                                         Token("6", 0, 0),
    508                                         Token("7", 0, 0),
    509                                         Token("8", 0, 0),
    510                                         Token("9", 0, 0)}));
    511   // clang-format on
    512   EXPECT_EQ(click_index, 5);
    513 
    514   // Clicking at the end should pad right to maximum context_size.
    515   tokens = tokens_orig;
    516   click_index = 11;
    517   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
    518   // clang-format off
    519   EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
    520                                         Token("7", 0, 0),
    521                                         Token("8", 0, 0),
    522                                         Token("9", 0, 0),
    523                                         Token("10", 0, 0),
    524                                         Token("11", 0, 0),
    525                                         Token("12", 0, 0),
    526                                         Token(),
    527                                         Token()}));
    528   // clang-format on
    529   EXPECT_EQ(click_index, 5);
    530 }
    531 
    532 TEST(FeatureProcessorTest, ICUTokenize) {
    533   FeatureProcessorOptions options;
    534   options.set_tokenization_type(
    535       libtextclassifier::FeatureProcessorOptions::ICU);
    536 
    537   TestingFeatureProcessor feature_processor(options);
    538   std::vector<Token> tokens = feature_processor.Tokenize("");
    539   ASSERT_EQ(tokens,
    540             // clang-format off
    541             std::vector<Token>({Token("", 0, 6),
    542                                 Token("", 6, 12),
    543                                 Token("", 12, 15),
    544                                 Token("", 15, 17),
    545                                 Token("", 17, 19)}));
    546   // clang-format on
    547 }
    548 
    549 TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
    550   FeatureProcessorOptions options;
    551   options.set_tokenization_type(
    552       libtextclassifier::FeatureProcessorOptions::ICU);
    553   options.set_icu_preserve_whitespace_tokens(true);
    554 
    555   TestingFeatureProcessor feature_processor(options);
    556   std::vector<Token> tokens =
    557       feature_processor.Tokenize("    ");
    558   ASSERT_EQ(tokens,
    559             // clang-format off
    560             std::vector<Token>({Token("", 0, 6),
    561                                 Token(" ", 6, 7),
    562                                 Token("", 7, 13),
    563                                 Token(" ", 13, 14),
    564                                 Token("", 14, 17),
    565                                 Token(" ", 17, 18),
    566                                 Token("", 18, 20),
    567                                 Token(" ", 20, 21),
    568                                 Token("", 21, 23)}));
    569   // clang-format on
    570 }
    571 
    572 TEST(FeatureProcessorTest, MixedTokenize) {
    573   FeatureProcessorOptions options;
    574   options.set_tokenization_type(
    575       libtextclassifier::FeatureProcessorOptions::MIXED);
    576 
    577   TokenizationCodepointRange* config =
    578       options.add_tokenization_codepoint_config();
    579   config->set_start(32);
    580   config->set_end(33);
    581   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
    582 
    583   FeatureProcessorOptions::CodepointRange* range;
    584   range = options.add_internal_tokenizer_codepoint_ranges();
    585   range->set_start(0);
    586   range->set_end(128);
    587 
    588   range = options.add_internal_tokenizer_codepoint_ranges();
    589   range->set_start(128);
    590   range->set_end(256);
    591 
    592   range = options.add_internal_tokenizer_codepoint_ranges();
    593   range->set_start(256);
    594   range->set_end(384);
    595 
    596   range = options.add_internal_tokenizer_codepoint_ranges();
    597   range->set_start(384);
    598   range->set_end(592);
    599 
    600   TestingFeatureProcessor feature_processor(options);
    601   std::vector<Token> tokens = feature_processor.Tokenize(
    602       "Japanese-lnguag text  http://www.google.com/");
    603   ASSERT_EQ(tokens,
    604             // clang-format off
    605             std::vector<Token>({Token("", 0, 5),
    606                                 Token("Japanese-lnguag", 5, 22),
    607                                 Token("text", 23, 27),
    608                                 Token("", 28, 30),
    609                                 Token("http://www.google.com/", 31, 53)}));
    610   // clang-format on
    611 }
    612 
    613 }  // namespace
    614 }  // namespace libtextclassifier
    615