//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

file_identifier "TC2 ";

// The possible model modes, represented as a bit field: ANNOTATION (1),
// CLASSIFICATION (2) and SELECTION (4) are the individual bits, and the
// remaining values are their bitwise combinations.
namespace libtextclassifier2;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Extractor types; referenced by DatetimeModelExtractor.extractor.
// The numeric values are part of the serialized format, so existing entries
// must keep their values.
namespace libtextclassifier2;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

// Types of regex capturing groups; referenced by
// DatetimeModelPattern_.Regex.groups.
namespace libtextclassifier2;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might want
  // to select more text than was contained in an envelope of all extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}

// A byte buffer plus a size field for its uncompressed form.
// NOTE(review): the compression format is not specified in this schema.
namespace libtextclassifier2;
table CompressedBuffer {
  buffer:[ubyte];
  uncompressed_size:int;
}

// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both ends of the selection.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = 1;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = 0;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string;

  // The pattern to check.
  // Can specify a single capturing group used as match boundaries.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = 0;

  // NOTE(review): presumably a compressed alternative to `pattern`; the schema
  // does not say which of the two takes precedence - confirm with the reader.
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

// Container for the regex patterns of the regex model.
namespace libtextclassifier2;
table RegexModel {
  patterns:[libtextclassifier2.RegexModel_.Pattern];
}

// List of regex patterns.
namespace libtextclassifier2.DatetimeModelPattern_;
table Regex {
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[libtextclassifier2.DatetimeGroupType];

  // NOTE(review): presumably a compressed alternative to `pattern` - confirm
  // with the reader.
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

namespace libtextclassifier2;
table DatetimeModelPattern {
  regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}

namespace libtextclassifier2;
table DatetimeModelExtractor {
  // The type this extractor's pattern is associated with.
  extractor:libtextclassifier2.DatetimeExtractorType;
  pattern:string;

  // NOTE(review): presumably indices into DatetimeModel.locales, as for
  // DatetimeModelPattern.locales - confirm.
  locales:[int];
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

namespace libtextclassifier2;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[libtextclassifier2.DatetimeModelPattern];
  extractors:[libtextclassifier2.DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = 1;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];
}

// A single key/value entry of DatetimeModelLibrary.models.
namespace libtextclassifier2.DatetimeModelLibrary_;
table Item {
  key:string;
  value:libtextclassifier2.DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier2;
table DatetimeModelLibrary {
  // Entries pairing a string key with a DatetimeModel.
  models:[libtextclassifier2.DatetimeModelLibrary_.Item];
}

// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}

// Options controlling the output of the classifier.
namespace libtextclassifier2;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  // - For annotation, the spans of given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of given class are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

// Top-level model table; declared as the schema's root_type.
namespace libtextclassifier2;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  // Feature extraction options for the selection and classification models.
  // FeatureProcessorOptions is declared further down in this schema.
  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

  // Tensorflow Lite models.
  // Each buffer is forced to 16-byte alignment inside the serialized file.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:libtextclassifier2.SelectionModelOptions;

  classification_options:libtextclassifier2.ClassificationModelOptions;
  regex_model:libtextclassifier2.RegexModel;
  datetime_model:libtextclassifier2.DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:libtextclassifier2.ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = 1;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:libtextclassifier2.OutputOptions;
}

// Role of the codepoints in the range.
// Numerically, TOKEN_SEPARATOR (3) equals SPLIT_BEFORE | SPLIT_AFTER and
// WHITESPACE_SEPARATOR (7) equals SPLIT_BEFORE | SPLIT_AFTER |
// DISCARD_CODEPOINT.
// NOTE(review): confirm that the tokenizer actually treats the value as a bit
// mask.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and discards them. Good e.g. for the space
  // character.
  WHITESPACE_SEPARATOR = 7,
}

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
  start:int;
  end:int;
  role:libtextclassifier2.TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}

// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,
}

// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
  start:int;
  end:int;
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

// A string key/value pair.
// NOTE(review): no field in the visible schema references this table; confirm
// whether it is still needed before relying on it.
namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
  key:string;
  value:string;
}

// Feature extraction and tokenization options; used by
// Model.selection_feature_options and Model.classification_feature_options.
namespace libtextclassifier2;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = 0;

  // Whether to extract the token case feature.
  extract_case_feature:bool = 0;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = 0;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = 0;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = 1;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = 0;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = 0;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

  // How the center token is chosen; see CenterTokenSelectionMethod.
  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      Chargram embeddings for tokens are concatenated first together,
  //      and at the end, the dense features for the tokens are concatenated
  //      to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  // Which tokenizer to use; see TokenizationType.
  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = 0;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  // NOTE(review): the sentence above does not match the declaration below,
  // which is a vector of `string`, not bytes. Confirm how writers and readers
  // handle non-UTF8 chargrams before changing either the comment or the type.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = 0;
}

root_type libtextclassifier2.Model;