Home | History | Annotate | Download | only in sentencepiece
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
     18 #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
     19 
#include <memory>
#include <string>
#include <utility>

#include "utils/sentencepiece/double_array_trie.h"
#include "utils/strings/stringpiece.h"
     25 
     26 namespace libtextclassifier3 {
     27 
     28 // Normalizer implements a simple text normalizer with user-defined
     29 // string-to-string rules and leftmost longest matching.
     30 class SentencePieceNormalizer {
     31  public:
     32   // charsmap_trie and charsmap_normalized specify the normalization/replacement
     33   // string-to-string rules in the following way:
     34   // A match in the trie for a string will return the offset in
     35   // charsmap_normalized that contains the replacement string.
     36   //
     37   // add_dummy_prefix: Whether to add dummy whitespace at the beginning of the
     38   //   text in order to treat "world" in "world" and "hello world" uniformly.
     39   //
     40   // remove_extra_whitespaces: Whether to remove leading, trailing and duplicate
     41   //   internal whitespace.
     42   //
     43   // escape_whitespaces: Whether to replace whitespace with a meta symbol.
     44   SentencePieceNormalizer(const DoubleArrayTrie& charsmap_trie,
     45                           StringPiece charsmap_normalized,
     46                           bool add_dummy_prefix = true,
     47                           bool remove_extra_whitespaces = true,
     48                           bool escape_whitespaces = true)
     49       : charsmap_trie_(charsmap_trie),
     50         charsmap_normalized_(charsmap_normalized),
     51         add_dummy_prefix_(add_dummy_prefix),
     52         remove_extra_whitespaces_(remove_extra_whitespaces),
     53         escape_whitespaces_(escape_whitespaces) {}
     54 
     55   // Normalizes a plain utf8 string into an internal representation for
     56   // Sentencepiece model.
     57   bool Normalize(StringPiece input, std::string* normalized_input) const;
     58 
     59  private:
     60   // Normalizes the prefix of `input` and returns the pair of
     61   // normalized prefix and the length of the prefix of `input` processed in the
     62   // normalization.
     63   bool NormalizePrefix(StringPiece input,
     64                        std::pair<StringPiece, int>* prefix) const;
     65 
     66   // Internal trie for efficient longest prefix string matching.
     67   DoubleArrayTrie charsmap_trie_;
     68 
     69   // "\0" delimitered concatenated normalized strings.
     70   // the value of `charsmap_trie_` stores offsets into this string.
     71   StringPiece charsmap_normalized_;
     72 
     73   const bool add_dummy_prefix_;
     74   const bool remove_extra_whitespaces_;
     75   const bool escape_whitespaces_;
     76 };
     77 
     78 }  // namespace libtextclassifier3
     79 
     80 #endif  // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
     81