Home | History | Annotate | Download | only in smartselect
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
     18 #define LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
     19 
     20 #include <string>
     21 #include <vector>
     22 
     23 #include "smartselect/tokenizer.pb.h"
     24 #include "smartselect/types.h"
     25 #include "util/base/integral_types.h"
     26 
     27 namespace libtextclassifier {
     28 
     29 // Tokenizer splits the input string into a sequence of tokens, according to the
     30 // configuration.
     31 class Tokenizer {
     32  public:
     33   explicit Tokenizer(
     34       const std::vector<TokenizationCodepointRange>& codepoint_range_configs) {
     35     PrepareTokenizationCodepointRanges(codepoint_range_configs);
     36   }
     37 
     38   // Tokenizes the input string using the selected tokenization method.
     39   std::vector<Token> Tokenize(const std::string& utf8_text) const;
     40 
     41  protected:
     42   // Represents a codepoint range [start, end) with its role for tokenization.
     43   struct CodepointRange {
     44     int32 start;
     45     int32 end;
     46     TokenizationCodepointRange::Role role;
     47 
     48     CodepointRange(int32 arg_start, int32 arg_end,
     49                    TokenizationCodepointRange::Role arg_role)
     50         : start(arg_start), end(arg_end), role(arg_role) {}
     51   };
     52 
     53   // Prepares tokenization codepoint ranges for use in tokenization.
     54   void PrepareTokenizationCodepointRanges(
     55       const std::vector<TokenizationCodepointRange>& codepoint_range_configs);
     56 
     57   // Finds the tokenization role for given codepoint.
     58   // If the character is not found returns DEFAULT_ROLE.
     59   // Internally uses binary search so should be O(log(# of codepoint_ranges)).
     60   TokenizationCodepointRange::Role FindTokenizationRole(int codepoint) const;
     61 
     62  private:
     63   // Codepoint ranges that determine how different codepoints are tokenized.
     64   // The ranges must not overlap.
     65   std::vector<CodepointRange> codepoint_ranges_;
     66 };
     67 
     68 }  // namespace libtextclassifier
     69 
     70 #endif  // LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
     71