1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_TOKENIZER_H_ 18 #define LIBTEXTCLASSIFIER_TOKENIZER_H_ 19 20 #include <string> 21 #include <vector> 22 23 #include "model_generated.h" 24 #include "types.h" 25 #include "util/base/integral_types.h" 26 #include "util/utf8/unicodetext.h" 27 28 namespace libtextclassifier2 { 29 30 const int kInvalidScript = -1; 31 const int kUnknownScript = -2; 32 33 // Tokenizer splits the input string into a sequence of tokens, according to the 34 // configuration. 35 class Tokenizer { 36 public: 37 explicit Tokenizer( 38 const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, 39 bool split_on_script_change); 40 41 // Tokenizes the input string using the selected tokenization method. 42 std::vector<Token> Tokenize(const std::string& text) const; 43 44 // Same as above but takes UnicodeText. 45 std::vector<Token> Tokenize(const UnicodeText& text_unicode) const; 46 47 protected: 48 // Finds the tokenization codepoint range config for given codepoint. 49 // Internally uses binary search so should be O(log(# of codepoint_ranges)). 50 const TokenizationCodepointRangeT* FindTokenizationRange(int codepoint) const; 51 52 // Finds the role and script for given codepoint. If not found, DEFAULT_ROLE 53 // and kUnknownScript are assigned. 54 void GetScriptAndRole(char32 codepoint, 55 TokenizationCodepointRange_::Role* role, 56 int* script) const; 57 58 private: 59 // Codepoint ranges that determine how different codepoints are tokenized. 60 // The ranges must not overlap. 61 std::vector<std::unique_ptr<const TokenizationCodepointRangeT>> 62 codepoint_ranges_; 63 64 // If true, tokens will be additionally split when the codepoint's script_id 65 // changes. 66 bool split_on_script_change_; 67 }; 68 69 } // namespace libtextclassifier2 70 71 #endif // LIBTEXTCLASSIFIER_TOKENIZER_H_ 72