1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "smartselect/feature-processor.h" 18 19 #include "gmock/gmock.h" 20 #include "gtest/gtest.h" 21 22 namespace libtextclassifier { 23 namespace { 24 25 using testing::ElementsAreArray; 26 using testing::FloatEq; 27 28 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) { 29 std::vector<Token> tokens{Token("Hll", 0, 5), 30 Token("fba@google.com", 6, 23), 31 Token("hee!", 24, 29)}; 32 33 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens); 34 35 // clang-format off 36 EXPECT_THAT(tokens, ElementsAreArray( 37 {Token("Hll", 0, 5), 38 Token("f", 6, 9), 39 Token("ba", 9, 12), 40 Token("@google.com", 12, 23), 41 Token("hee!", 24, 29)})); 42 // clang-format on 43 } 44 45 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) { 46 std::vector<Token> tokens{Token("Hll", 0, 5), 47 Token("fba@google.com", 6, 23), 48 Token("hee!", 24, 29)}; 49 50 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens); 51 52 // clang-format off 53 EXPECT_THAT(tokens, ElementsAreArray( 54 {Token("Hll", 0, 5), 55 Token("fba", 6, 12), 56 Token("@google.com", 12, 23), 57 Token("hee!", 24, 29)})); 58 // clang-format on 59 } 60 61 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) { 62 std::vector<Token> tokens{Token("Hll", 0, 5), 63 Token("fba@google.com", 6, 23), 64 Token("hee!", 24, 29)}; 65 66 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens); 67 68 // clang-format off 69 EXPECT_THAT(tokens, ElementsAreArray( 70 {Token("Hll", 0, 5), 71 Token("f", 6, 9), 72 Token("ba@google.com", 9, 23), 73 Token("hee!", 24, 29)})); 74 // clang-format on 75 } 76 77 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) { 78 std::vector<Token> tokens{Token("Hll", 0, 5), 79 Token("fba@google.com", 6, 23), 80 Token("hee!", 24, 29)}; 81 82 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens); 83 84 // clang-format off 85 EXPECT_THAT(tokens, ElementsAreArray( 86 {Token("Hll", 0, 5), 87 Token("fba@google.com", 6, 23), 88 Token("hee!", 24, 29)})); 89 // clang-format on 90 } 91 92 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) { 93 std::vector<Token> tokens{Token("Hll", 0, 5), 94 Token("fba@google.com", 6, 23), 95 Token("hee!", 24, 29)}; 96 97 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens); 98 99 // clang-format off 100 EXPECT_THAT(tokens, ElementsAreArray( 101 {Token("H", 0, 2), 102 Token("ll", 2, 5), 103 Token("f", 6, 9), 104 Token("ba@google.com", 9, 23), 105 Token("hee!", 24, 29)})); 106 // clang-format on 107 } 108 109 TEST(FeatureProcessorTest, KeepLineWithClickFirst) { 110 const std::string context = "Fist Lin\nScond Lin\nThid Lin"; 111 const CodepointSpan span = {0, 5}; 112 // clang-format off 113 std::vector<Token> tokens = {Token("Fist", 0, 5), 114 Token("Lin", 6, 10), 115 Token("Scond", 11, 17), 116 Token("Lin", 18, 22), 117 Token("Thid", 23, 28), 118 Token("Lin", 29, 33)}; 119 // clang-format on 120 121 // Keeps the first line. 122 internal::StripTokensFromOtherLines(context, span, &tokens); 123 EXPECT_THAT(tokens, 124 ElementsAreArray({Token("Fist", 0, 5), Token("Lin", 6, 10)})); 125 } 126 127 TEST(FeatureProcessorTest, KeepLineWithClickSecond) { 128 const std::string context = "Fist Lin\nScond Lin\nThid Lin"; 129 const CodepointSpan span = {18, 22}; 130 // clang-format off 131 std::vector<Token> tokens = {Token("Fist", 0, 5), 132 Token("Lin", 6, 10), 133 Token("Scond", 11, 17), 134 Token("Lin", 18, 22), 135 Token("Thid", 23, 28), 136 Token("Lin", 29, 33)}; 137 // clang-format on 138 139 // Keeps the first line. 140 internal::StripTokensFromOtherLines(context, span, &tokens); 141 EXPECT_THAT(tokens, ElementsAreArray( 142 {Token("Scond", 11, 17), Token("Lin", 18, 22)})); 143 } 144 145 TEST(FeatureProcessorTest, KeepLineWithClickThird) { 146 const std::string context = "Fist Lin\nScond Lin\nThid Lin"; 147 const CodepointSpan span = {24, 33}; 148 // clang-format off 149 std::vector<Token> tokens = {Token("Fist", 0, 5), 150 Token("Lin", 6, 10), 151 Token("Scond", 11, 17), 152 Token("Lin", 18, 22), 153 Token("Thid", 23, 28), 154 Token("Lin", 29, 33)}; 155 // clang-format on 156 157 // Keeps the first line. 158 internal::StripTokensFromOtherLines(context, span, &tokens); 159 EXPECT_THAT(tokens, ElementsAreArray( 160 {Token("Thid", 23, 28), Token("Lin", 29, 33)})); 161 } 162 163 TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) { 164 const std::string context = "Fist Lin|Scond Lin\nThid Lin"; 165 const CodepointSpan span = {18, 22}; 166 // clang-format off 167 std::vector<Token> tokens = {Token("Fist", 0, 5), 168 Token("Lin", 6, 10), 169 Token("Scond", 11, 17), 170 Token("Lin", 18, 22), 171 Token("Thid", 23, 28), 172 Token("Lin", 29, 33)}; 173 // clang-format on 174 175 // Keeps the first line. 176 internal::StripTokensFromOtherLines(context, span, &tokens); 177 EXPECT_THAT(tokens, ElementsAreArray( 178 {Token("Scond", 11, 17), Token("Lin", 18, 22)})); 179 } 180 181 TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) { 182 const std::string context = "Fist Lin\nScond Lin\nThid Lin"; 183 const CodepointSpan span = {5, 23}; 184 // clang-format off 185 std::vector<Token> tokens = {Token("Fist", 0, 5), 186 Token("Lin", 6, 10), 187 Token("Scond", 18, 23), 188 Token("Lin", 19, 23), 189 Token("Thid", 23, 28), 190 Token("Lin", 29, 33)}; 191 // clang-format on 192 193 // Keeps the first line. 194 internal::StripTokensFromOtherLines(context, span, &tokens); 195 EXPECT_THAT(tokens, ElementsAreArray( 196 {Token("Fist", 0, 5), Token("Lin", 6, 10), 197 Token("Scond", 18, 23), Token("Lin", 19, 23), 198 Token("Thid", 23, 28), Token("Lin", 29, 33)})); 199 } 200 201 class TestingFeatureProcessor : public FeatureProcessor { 202 public: 203 using FeatureProcessor::FeatureProcessor; 204 using FeatureProcessor::SpanToLabel; 205 using FeatureProcessor::SupportedCodepointsRatio; 206 using FeatureProcessor::IsCodepointInRanges; 207 using FeatureProcessor::ICUTokenize; 208 using FeatureProcessor::supported_codepoint_ranges_; 209 }; 210 211 TEST(FeatureProcessorTest, SpanToLabel) { 212 FeatureProcessorOptions options; 213 options.set_context_size(1); 214 options.set_max_selection_span(1); 215 options.set_snap_label_span_boundaries_to_containing_tokens(false); 216 217 TokenizationCodepointRange* config = 218 options.add_tokenization_codepoint_config(); 219 config->set_start(32); 220 config->set_end(33); 221 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR); 222 223 TestingFeatureProcessor feature_processor(options); 224 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three"); 225 ASSERT_EQ(3, tokens.size()); 226 int label; 227 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label)); 228 EXPECT_EQ(kInvalidLabel, label); 229 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label)); 230 EXPECT_NE(kInvalidLabel, label); 231 TokenSpan token_span; 232 feature_processor.LabelToTokenSpan(label, &token_span); 233 EXPECT_EQ(0, token_span.first); 234 EXPECT_EQ(0, token_span.second); 235 236 // Reconfigure with snapping enabled. 237 options.set_snap_label_span_boundaries_to_containing_tokens(true); 238 TestingFeatureProcessor feature_processor2(options); 239 int label2; 240 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2)); 241 EXPECT_EQ(label, label2); 242 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2)); 243 EXPECT_EQ(label, label2); 244 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2)); 245 EXPECT_EQ(label, label2); 246 247 // Cross a token boundary. 248 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2)); 249 EXPECT_EQ(kInvalidLabel, label2); 250 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2)); 251 EXPECT_EQ(kInvalidLabel, label2); 252 253 // Multiple tokens. 254 options.set_context_size(2); 255 options.set_max_selection_span(2); 256 TestingFeatureProcessor feature_processor3(options); 257 tokens = feature_processor3.Tokenize("zero, one, two, three, four"); 258 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2)); 259 EXPECT_NE(kInvalidLabel, label2); 260 feature_processor3.LabelToTokenSpan(label2, &token_span); 261 EXPECT_EQ(1, token_span.first); 262 EXPECT_EQ(0, token_span.second); 263 264 int label3; 265 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3)); 266 EXPECT_EQ(label2, label3); 267 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3)); 268 EXPECT_EQ(label2, label3); 269 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3)); 270 EXPECT_EQ(label2, label3); 271 } 272 273 TEST(FeatureProcessorTest, CenterTokenFromClick) { 274 int token_index; 275 276 // Exactly aligned indices. 277 token_index = internal::CenterTokenFromClick( 278 {6, 11}, 279 {Token("Hll", 0, 5), Token("world", 6, 11), Token("hee!", 12, 17)}); 280 EXPECT_EQ(token_index, 1); 281 282 // Click is contained in a token. 283 token_index = internal::CenterTokenFromClick( 284 {13, 17}, 285 {Token("Hll", 0, 5), Token("world", 6, 11), Token("hee!", 12, 17)}); 286 EXPECT_EQ(token_index, 2); 287 288 // Click spans two tokens. 289 token_index = internal::CenterTokenFromClick( 290 {6, 17}, 291 {Token("Hll", 0, 5), Token("world", 6, 11), Token("hee!", 12, 17)}); 292 EXPECT_EQ(token_index, kInvalidIndex); 293 } 294 295 TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) { 296 int token_index; 297 298 // Selection of length 3. Exactly aligned indices. 299 token_index = internal::CenterTokenFromMiddleOfSelection( 300 {7, 27}, 301 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20), 302 Token("Token4", 21, 27), Token("Token5", 28, 34)}); 303 EXPECT_EQ(token_index, 2); 304 305 // Selection of length 1 token. Exactly aligned indices. 306 token_index = internal::CenterTokenFromMiddleOfSelection( 307 {21, 27}, 308 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20), 309 Token("Token4", 21, 27), Token("Token5", 28, 34)}); 310 EXPECT_EQ(token_index, 3); 311 312 // Selection marks sub-token range, with no tokens in it. 313 token_index = internal::CenterTokenFromMiddleOfSelection( 314 {29, 33}, 315 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20), 316 Token("Token4", 21, 27), Token("Token5", 28, 34)}); 317 EXPECT_EQ(token_index, kInvalidIndex); 318 319 // Selection of length 2. Sub-token indices. 320 token_index = internal::CenterTokenFromMiddleOfSelection( 321 {3, 25}, 322 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20), 323 Token("Token4", 21, 27), Token("Token5", 28, 34)}); 324 EXPECT_EQ(token_index, 1); 325 326 // Selection of length 1. Sub-token indices. 327 token_index = internal::CenterTokenFromMiddleOfSelection( 328 {22, 34}, 329 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20), 330 Token("Token4", 21, 27), Token("Token5", 28, 34)}); 331 EXPECT_EQ(token_index, 4); 332 333 // Some invalid ones. 334 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {}); 335 EXPECT_EQ(token_index, -1); 336 } 337 338 TEST(FeatureProcessorTest, SupportedCodepointsRatio) { 339 FeatureProcessorOptions options; 340 options.set_context_size(2); 341 options.set_max_selection_span(2); 342 options.set_snap_label_span_boundaries_to_containing_tokens(false); 343 344 TokenizationCodepointRange* config = 345 options.add_tokenization_codepoint_config(); 346 config->set_start(32); 347 config->set_end(33); 348 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR); 349 350 FeatureProcessorOptions::CodepointRange* range; 351 range = options.add_supported_codepoint_ranges(); 352 range->set_start(0); 353 range->set_end(128); 354 355 range = options.add_supported_codepoint_ranges(); 356 range->set_start(10000); 357 range->set_end(10001); 358 359 range = options.add_supported_codepoint_ranges(); 360 range->set_start(20000); 361 range->set_end(30000); 362 363 TestingFeatureProcessor feature_processor(options); 364 EXPECT_THAT(feature_processor.SupportedCodepointsRatio( 365 1, feature_processor.Tokenize("aaa bbb ccc")), 366 FloatEq(1.0)); 367 EXPECT_THAT(feature_processor.SupportedCodepointsRatio( 368 1, feature_processor.Tokenize("aaa bbb ")), 369 FloatEq(2.0 / 3)); 370 EXPECT_THAT(feature_processor.SupportedCodepointsRatio( 371 1, feature_processor.Tokenize(" ")), 372 FloatEq(0.0)); 373 EXPECT_FALSE(feature_processor.IsCodepointInRanges( 374 -1, feature_processor.supported_codepoint_ranges_)); 375 EXPECT_TRUE(feature_processor.IsCodepointInRanges( 376 0, feature_processor.supported_codepoint_ranges_)); 377 EXPECT_TRUE(feature_processor.IsCodepointInRanges( 378 10, feature_processor.supported_codepoint_ranges_)); 379 EXPECT_TRUE(feature_processor.IsCodepointInRanges( 380 127, feature_processor.supported_codepoint_ranges_)); 381 EXPECT_FALSE(feature_processor.IsCodepointInRanges( 382 128, feature_processor.supported_codepoint_ranges_)); 383 EXPECT_FALSE(feature_processor.IsCodepointInRanges( 384 9999, feature_processor.supported_codepoint_ranges_)); 385 EXPECT_TRUE(feature_processor.IsCodepointInRanges( 386 10000, feature_processor.supported_codepoint_ranges_)); 387 EXPECT_FALSE(feature_processor.IsCodepointInRanges( 388 10001, feature_processor.supported_codepoint_ranges_)); 389 EXPECT_TRUE(feature_processor.IsCodepointInRanges( 390 25000, feature_processor.supported_codepoint_ranges_)); 391 392 std::vector<Token> tokens; 393 int click_pos; 394 std::vector<float> extra_features; 395 std::unique_ptr<CachedFeatures> cached_features; 396 397 auto feature_fn = [](const std::vector<int>& sparse_features, 398 const std::vector<float>& dense_features, 399 float* embedding) { return true; }; 400 401 options.set_min_supported_codepoint_ratio(0.0); 402 TestingFeatureProcessor feature_processor2(options); 403 EXPECT_TRUE(feature_processor2.ExtractFeatures(" eee", {4, 7}, {0, 0}, 404 feature_fn, 2, &tokens, 405 &click_pos, &cached_features)); 406 407 options.set_min_supported_codepoint_ratio(0.2); 408 TestingFeatureProcessor feature_processor3(options); 409 EXPECT_TRUE(feature_processor3.ExtractFeatures(" eee", {4, 7}, {0, 0}, 410 feature_fn, 2, &tokens, 411 &click_pos, &cached_features)); 412 413 options.set_min_supported_codepoint_ratio(0.5); 414 TestingFeatureProcessor feature_processor4(options); 415 EXPECT_FALSE(feature_processor4.ExtractFeatures( 416 " eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos, 417 &cached_features)); 418 } 419 420 TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) { 421 std::vector<Token> tokens_orig{ 422 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0), 423 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0), 424 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0), 425 Token("12", 0, 0)}; 426 427 std::vector<Token> tokens; 428 int click_index; 429 430 // Try to click first token and see if it gets padded from left. 431 tokens = tokens_orig; 432 click_index = 0; 433 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index); 434 // clang-format off 435 EXPECT_EQ(tokens, std::vector<Token>({Token(), 436 Token(), 437 Token("0", 0, 0), 438 Token("1", 0, 0), 439 Token("2", 0, 0)})); 440 // clang-format on 441 EXPECT_EQ(click_index, 2); 442 443 // When we click the second token nothing should get padded. 444 tokens = tokens_orig; 445 click_index = 2; 446 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index); 447 // clang-format off 448 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0), 449 Token("1", 0, 0), 450 Token("2", 0, 0), 451 Token("3", 0, 0), 452 Token("4", 0, 0)})); 453 // clang-format on 454 EXPECT_EQ(click_index, 2); 455 456 // When we click the last token tokens should get padded from the right. 457 tokens = tokens_orig; 458 click_index = 12; 459 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index); 460 // clang-format off 461 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0), 462 Token("11", 0, 0), 463 Token("12", 0, 0), 464 Token(), 465 Token()})); 466 // clang-format on 467 EXPECT_EQ(click_index, 2); 468 } 469 470 TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) { 471 std::vector<Token> tokens_orig{ 472 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0), 473 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0), 474 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0), 475 Token("12", 0, 0)}; 476 477 std::vector<Token> tokens; 478 int click_index; 479 480 // Try to click first token and see if it gets padded from left to maximum 481 // context_size. 482 tokens = tokens_orig; 483 click_index = 0; 484 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index); 485 // clang-format off 486 EXPECT_EQ(tokens, std::vector<Token>({Token(), 487 Token(), 488 Token("0", 0, 0), 489 Token("1", 0, 0), 490 Token("2", 0, 0), 491 Token("3", 0, 0), 492 Token("4", 0, 0), 493 Token("5", 0, 0)})); 494 // clang-format on 495 EXPECT_EQ(click_index, 2); 496 497 // Clicking to the middle with enough context should not produce any padding. 498 tokens = tokens_orig; 499 click_index = 6; 500 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index); 501 // clang-format off 502 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0), 503 Token("2", 0, 0), 504 Token("3", 0, 0), 505 Token("4", 0, 0), 506 Token("5", 0, 0), 507 Token("6", 0, 0), 508 Token("7", 0, 0), 509 Token("8", 0, 0), 510 Token("9", 0, 0)})); 511 // clang-format on 512 EXPECT_EQ(click_index, 5); 513 514 // Clicking at the end should pad right to maximum context_size. 515 tokens = tokens_orig; 516 click_index = 11; 517 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index); 518 // clang-format off 519 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0), 520 Token("7", 0, 0), 521 Token("8", 0, 0), 522 Token("9", 0, 0), 523 Token("10", 0, 0), 524 Token("11", 0, 0), 525 Token("12", 0, 0), 526 Token(), 527 Token()})); 528 // clang-format on 529 EXPECT_EQ(click_index, 5); 530 } 531 532 TEST(FeatureProcessorTest, ICUTokenize) { 533 FeatureProcessorOptions options; 534 options.set_tokenization_type( 535 libtextclassifier::FeatureProcessorOptions::ICU); 536 537 TestingFeatureProcessor feature_processor(options); 538 std::vector<Token> tokens = feature_processor.Tokenize(""); 539 ASSERT_EQ(tokens, 540 // clang-format off 541 std::vector<Token>({Token("", 0, 6), 542 Token("", 6, 12), 543 Token("", 12, 15), 544 Token("", 15, 17), 545 Token("", 17, 19)})); 546 // clang-format on 547 } 548 549 TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) { 550 FeatureProcessorOptions options; 551 options.set_tokenization_type( 552 libtextclassifier::FeatureProcessorOptions::ICU); 553 options.set_icu_preserve_whitespace_tokens(true); 554 555 TestingFeatureProcessor feature_processor(options); 556 std::vector<Token> tokens = 557 feature_processor.Tokenize(" "); 558 ASSERT_EQ(tokens, 559 // clang-format off 560 std::vector<Token>({Token("", 0, 6), 561 Token(" ", 6, 7), 562 Token("", 7, 13), 563 Token(" ", 13, 14), 564 Token("", 14, 17), 565 Token(" ", 17, 18), 566 Token("", 18, 20), 567 Token(" ", 20, 21), 568 Token("", 21, 23)})); 569 // clang-format on 570 } 571 572 TEST(FeatureProcessorTest, MixedTokenize) { 573 FeatureProcessorOptions options; 574 options.set_tokenization_type( 575 libtextclassifier::FeatureProcessorOptions::MIXED); 576 577 TokenizationCodepointRange* config = 578 options.add_tokenization_codepoint_config(); 579 config->set_start(32); 580 config->set_end(33); 581 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR); 582 583 FeatureProcessorOptions::CodepointRange* range; 584 range = options.add_internal_tokenizer_codepoint_ranges(); 585 range->set_start(0); 586 range->set_end(128); 587 588 range = options.add_internal_tokenizer_codepoint_ranges(); 589 range->set_start(128); 590 range->set_end(256); 591 592 range = options.add_internal_tokenizer_codepoint_ranges(); 593 range->set_start(256); 594 range->set_end(384); 595 596 range = options.add_internal_tokenizer_codepoint_ranges(); 597 range->set_start(384); 598 range->set_end(592); 599 600 TestingFeatureProcessor feature_processor(options); 601 std::vector<Token> tokens = feature_processor.Tokenize( 602 "Japanese-lnguag text http://www.google.com/"); 603 ASSERT_EQ(tokens, 604 // clang-format off 605 std::vector<Token>({Token("", 0, 5), 606 Token("Japanese-lnguag", 5, 22), 607 Token("text", 23, 27), 608 Token("", 28, 30), 609 Token("http://www.google.com/", 31, 53)})); 610 // clang-format on 611 } 612 613 } // namespace 614 } // namespace libtextclassifier 615