// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Text classification model configuration.

syntax = "proto2";
option optimize_for = LITE_RUNTIME;

import "external/libtextclassifier/common/embedding-network.proto";
import "external/libtextclassifier/smartselect/tokenizer.proto";

package libtextclassifier;

// Generic options of a model, non-specific to selection or sharing.
message ModelOptions {
  // If true, will use embeddings from a different model. This is mainly
  // useful for letting the Sharing model reuse the embeddings from the
  // Selection model.
  optional bool use_shared_embeddings = 1;

  // Language of the model.
  optional string language = 2;

  // Version of the model.
  optional int32 version = 3;
}

message SelectionModelOptions {
  // A list of Unicode codepoints to strip from predicted selections.
  repeated int32 punctuation_to_strip = 1;

  // Whether to strip punctuation after the selection is made.
  optional bool strip_punctuation = 2;

  // Enforce symmetrical selections.
  optional bool enforce_symmetry = 3;

  // Number of inferences made around the click position (to one side), for
  // enforcing symmetry.
  optional int32 symmetry_context_size = 4;
}

message SharingModelOptions {
  // If true, will always return "url" when the url hint is passed in.
  optional bool always_accept_url_hint = 1;

  // If true, will always return "email" when the e-mail hint is passed in.
  optional bool always_accept_email_hint = 2;

  // Limits for phone numbers.
  optional int32 phone_min_num_digits = 3 [default = 7];
  optional int32 phone_max_num_digits = 4 [default = 15];
}

message FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  optional int32 num_buckets = 1 [default = -1];

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  optional int32 context_size = 2 [default = -1];
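  // Illustrative example (not prescribed by the model): with context_size = 2
  // and the tokens "quick brown [fox] jumps over", features are computed over
  // the five-token window "quick brown fox jumps over".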

  // Maximum number of words of the context to select in total.
  optional int32 max_selection_span = 3 [default = -1];

  // Orders of charactergrams to extract. E.g., 2 means character bigrams,
  // 3 character trigrams, etc.
  repeated int32 chargram_orders = 4;
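  // Illustrative example: chargram_orders: [1, 2, 3] extracts character
  // unigrams, bigrams, and trigrams for every token.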

  // Maximum length of a word, in codepoints.
  optional int32 max_word_length = 21 [default = 20];

  // If true, will use the unicode-aware functionality for extracting features.
  optional bool unicode_aware_features = 19 [default = false];

  // Whether to extract the token case feature.
  optional bool extract_case_feature = 5 [default = false];

  // Whether to extract the selection mask feature.
  optional bool extract_selection_mask_feature = 6 [default = false];

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  repeated string regexp_feature = 22;
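  // Illustrative example: adding the regexp "^[0-9]+$" here would emit 1.0
  // for tokens consisting only of digits and -1.0 for all other tokens.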

  // Whether to remap all digits to a single number.
  optional bool remap_digits = 20 [default = false];

  // Whether to lower-case each token before generating hashgrams.
  optional bool lowercase_tokens = 33;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span);
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including
  // the infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  optional bool selection_reduced_output_space = 8 [default = true];
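  // Illustrative example: with max_selection_span = 3, a candidate selection
  // covering 5 tokens is infeasible; it is omitted from the reduced output
  // space but present in the full cross-product output space.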

  // Collection names.
  repeated string collections = 9;

  // Index of the collection in the collections field to use when a collection
  // name can't be mapped to an id.
  optional int32 default_collection = 10 [default = -1];
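  // Illustrative example: with collections: ["phone", "url", "email", "other"]
  // and default_collection: 3, unmapped collection names fall back to "other".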

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  optional bool only_use_line_with_click = 13 [default = false];

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  optional bool split_tokens_on_selection_boundaries = 14 [default = false];

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  repeated TokenizationCodepointRange tokenization_codepoint_config = 15;

  // Method for selecting the center token.
  enum CenterTokenSelectionMethod {
    DEFAULT_CENTER_TOKEN_METHOD = 0;  // Invalid option.

    // Use click indices to determine the center token.
    CENTER_TOKEN_FROM_CLICK = 1;

    // Use selection indices to get a token range, and select the middle of it
    // as the center token.
    CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
  }
  optional CenterTokenSelectionMethod center_token_selection_method = 16;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  optional bool snap_label_span_boundaries_to_containing_tokens = 18;

  // Range of codepoints start - end, where end is exclusive.
  message CodepointRange {
    optional int32 start = 1;
    optional int32 end = 2;
  }

  // A set of codepoint ranges supported by the model.
  repeated CodepointRange supported_codepoint_ranges = 23;
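  // Illustrative example: the range { start: 0 end: 128 } covers the ASCII
  // codepoints 0..127, since end is exclusive.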

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  repeated CodepointRange internal_tokenizer_codepoint_ranges = 34;

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  optional float min_supported_codepoint_ratio = 24 [default = 0.0];
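  // Illustrative example: with min_supported_codepoint_ratio = 0.5, at least
  // half of the codepoints in the context must fall inside
  // supported_codepoint_ranges, otherwise feature computation fails.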

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      The chargram embeddings for all tokens are concatenated first, and
  //      the dense features for all tokens are appended after them, so the
  //      resulting feature vector has two regions.
  optional int32 feature_version = 25 [default = 0];
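  // Illustrative example for feature_version == 0 with two tokens t1 and t2:
  // the feature vector is [chargrams(t1), chargrams(t2), dense(t1), dense(t2)].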

  // Controls the type of tokenization the model will use for the input text.
  enum TokenizationType {
    INVALID_TOKENIZATION_TYPE = 0;

    // Use the internal tokenizer for tokenization.
    INTERNAL_TOKENIZER = 1;

    // Use ICU for tokenization.
    ICU = 2;

    // First apply ICU tokenization. Then identify stretches of tokens
    // consisting only of codepoints in internal_tokenizer_codepoint_ranges
    // and re-tokenize them using the internal tokenizer.
    MIXED = 3;
  }
  optional TokenizationType tokenization_type = 30
      [default = INTERNAL_TOKENIZER];
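  // Illustrative example: with tokenization_type = MIXED and
  // internal_tokenizer_codepoint_ranges covering, say, CJK codepoints, the
  // input is first tokenized by ICU and any stretches of tokens made up only
  // of those codepoints are then re-tokenized with the internal tokenizer.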
  optional bool icu_preserve_whitespace_tokens = 31 [default = false];

  reserved 7, 11, 12, 17, 26, 27, 28, 29, 32;
};

extend nlp_core.EmbeddingNetworkProto {
  optional ModelOptions model_options_in_embedding_network_proto = 150063045;
  optional FeatureProcessorOptions
      feature_processor_options_in_embedding_network_proto = 146230910;
  optional SelectionModelOptions
      selection_model_options_in_embedding_network_proto = 148190899;
  optional SharingModelOptions
      sharing_model_options_in_embedding_network_proto = 151445439;
}