Home | History | Annotate | Download | only in lang_id
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
     18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
     19 
     20 #include <string>
     21 
     22 #include "lang_id/common/fel/task-context.h"
     23 #include "lang_id/common/lite_strings/stringpiece.h"
     24 #include "lang_id/light-sentence.h"
     25 
     26 namespace libtextclassifier3 {
     27 namespace mobile {
     28 namespace lang_id {
     29 
     30 // Custom tokenizer for the LangId model.
// Custom tokenizer for the LangId model.
//
// Usage: call Setup() once to read configuration, then call Tokenize() as
// many times as needed.  Tokenize() is const and Setup() is the only mutator
// visible here, so after Setup() concurrent Tokenize() calls on the same
// instance are presumably safe — confirm against the .cc implementation.
class TokenizerForLangId {
 public:
  // Reads tokenization options from |context|; in particular, whether input
  // text should be lowercased before tokenizing (stored in lowercase_input_
  // below).  NOTE(review): the exact option name(s) read are defined in the
  // corresponding .cc file — confirm there.  Does not take ownership of
  // |context|.
  void Setup(TaskContext *context);

  // Tokenizes |text|, placing the tokens into |sentence|.  Customized for
  // LangId.  Currently (Sep 15, 2016) we tokenize on space, newline, tab, and
  // any other 1-byte UTF8 character which is not a letter, ignore all empty
  // tokens, and (for each of the remaining tokens) prepend "^" (special token
  // begin marker) and append "$" (special token end marker).
  //
  // Tokens are stored into the "repeated Token token;" field of *sentence.
  // Does not take ownership of |sentence|; |text| must remain valid for the
  // duration of the call (StringPiece does not own its bytes).
  void Tokenize(StringPiece text, LightSentence *sentence) const;

 private:
  // If true, during tokenization, we use the lowercase version of each Unicode
  // character from the text to tokenize.  E.g., if this is true, the text "Foo
  // bar" is tokenized as ["foo", "bar"]; otherwise, we get ["Foo", "bar"].
  bool lowercase_input_ = false;
};
     50 
     51 }  // namespace lang_id
     52 }  // namespace mobile
}  // namespace libtextclassifier3
     54 
     55 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
     56