// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Text classification model configuration.

syntax = "proto2";
option optimize_for = LITE_RUNTIME;

import "external/libtextclassifier/common/embedding-network.proto";
import "external/libtextclassifier/smartselect/tokenizer.proto";

package libtextclassifier;

// Generic options of a model, non-specific to selection or sharing.
message ModelOptions {
  // If true, will use embeddings from a different model. This is mainly useful
  // for the Sharing model using the embeddings from the Selection model.
  optional bool use_shared_embeddings = 1;

  // Language of the model.
  optional string language = 2;

  // Version of the model.
  optional int32 version = 3;
}

// Options specific to the Selection model.
message SelectionModelOptions {
  // A list of Unicode codepoints to strip from predicted selections.
  repeated int32 punctuation_to_strip = 1;

  // Whether to strip punctuation after the selection is made.
  optional bool strip_punctuation = 2;

  // Enforce symmetrical selections.
  optional bool enforce_symmetry = 3;

  // Number of inferences made around the click position (to one side), for
  // enforcing symmetry.
  optional int32 symmetry_context_size = 4;
}

// Options specific to the Sharing model.
message SharingModelOptions {
  // If true, will always return "url" when the url hint is passed in.
  optional bool always_accept_url_hint = 1;

  // If true, will always return "email" when the e-mail hint is passed in.
  optional bool always_accept_email_hint = 2;

  // Limits for phone numbers: minimum and maximum number of digits.
  // NOTE(review): whether the bounds are inclusive is not visible in this
  // file; confirm against the code that reads them.
  optional int32 phone_min_num_digits = 3 [default = 7];
  optional int32 phone_max_num_digits = 4 [default = 15];
}

// Options controlling tokenization and feature extraction for a model.
// Field numbers are not sequential; do not renumber them, since the numbers
// (not the names) identify fields on the wire.
message FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  optional int32 num_buckets = 1 [default = -1];

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  optional int32 context_size = 2 [default = -1];

  // Maximum number of words of the context to select in total.
  optional int32 max_selection_span = 3 [default = -1];

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  repeated int32 chargram_orders = 4;

  // Maximum length of a word, in codepoints.
  optional int32 max_word_length = 21 [default = 20];

  // If true, will use the unicode-aware functionality for extracting features.
  optional bool unicode_aware_features = 19 [default = false];

  // Whether to extract the token case feature.
  optional bool extract_case_feature = 5 [default = false];

  // Whether to extract the selection mask feature.
  optional bool extract_selection_mask_feature = 6 [default = false];

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  repeated string regexp_feature = 22;

  // Whether to remap all digits to a single number.
  optional bool remap_digits = 20 [default = false];

  // Whether to lower-case each token before generating hashgrams.
  // No explicit default is declared, so the proto2 default (false) applies.
  optional bool lowercase_tokens = 33;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  optional bool selection_reduced_output_space = 8 [default = true];

  // Collection names.
  repeated string collections = 9;

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  optional int32 default_collection = 10 [default = -1];

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  optional bool only_use_line_with_click = 13 [default = false];

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  optional bool split_tokens_on_selection_boundaries = 14 [default = false];

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  repeated TokenizationCodepointRange tokenization_codepoint_config = 15;

  // Method for selecting the center token.
  enum CenterTokenSelectionMethod {
    DEFAULT_CENTER_TOKEN_METHOD = 0;  // Invalid option.

    // Use click indices to determine the center token.
    CENTER_TOKEN_FROM_CLICK = 1;

    // Use selection indices to get a token range, and select the middle of it
    // as the center token.
    CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
  }
  // No explicit default is declared, so an unset field reads as the first
  // enum value, DEFAULT_CENTER_TOKEN_METHOD (marked invalid above).
  optional CenterTokenSelectionMethod center_token_selection_method = 16;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  optional bool snap_label_span_boundaries_to_containing_tokens = 18;

  // Range of codepoints start - end, where end is exclusive.
  message CodepointRange {
    optional int32 start = 1;
    optional int32 end = 2;
  }

  // A set of codepoint ranges supported by the model.
  repeated CodepointRange supported_codepoint_ranges = 23;

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  repeated CodepointRange internal_tokenizer_codepoint_ranges = 34;

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  optional float min_supported_codepoint_ratio = 24 [default = 0.0];

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      Chargram embeddings for tokens are concatenated first together,
  //      and at the end, the dense features for the tokens are concatenated
  //      to it. So the resulting feature vector has two regions.
  optional int32 feature_version = 25 [default = 0];

  // Controls the type of tokenization the model will use for the input text.
  enum TokenizationType {
    INVALID_TOKENIZATION_TYPE = 0;

    // Use the internal tokenizer for tokenization.
    INTERNAL_TOKENIZER = 1;

    // Use ICU for tokenization.
    ICU = 2;

    // First apply ICU tokenization. Then identify stretches of tokens
    // consisting only of codepoints in internal_tokenizer_codepoint_ranges
    // and re-tokenize them using the internal tokenizer.
    MIXED = 3;
  }
  optional TokenizationType tokenization_type = 30
      [default = INTERNAL_TOKENIZER];

  // NOTE(review): presumably controls whether whitespace tokens produced by
  // ICU tokenization are kept; confirm against the tokenizer implementation.
  optional bool icu_preserve_whitespace_tokens = 31 [default = false];

  // Reserved field numbers; they must not be assigned to new fields.
  reserved 7, 11, 12, 17, 26, 27, 28, 29, 32;
}

// Extensions that attach the option messages defined in this file to
// nlp_core.EmbeddingNetworkProto. The extension numbers are part of the wire
// format and must not be changed.
extend nlp_core.EmbeddingNetworkProto {
  optional ModelOptions model_options_in_embedding_network_proto = 150063045;
  optional FeatureProcessorOptions
      feature_processor_options_in_embedding_network_proto = 146230910;
  optional SelectionModelOptions
      selection_model_options_in_embedding_network_proto = 148190899;
  optional SharingModelOptions
      sharing_model_options_in_embedding_network_proto = 151445439;
}