//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

file_identifier "TC2 ";
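// Note: "TC2 " is the FlatBuffers file identifier; it is stored in serialized
// buffers so readers can check that a buffer holds a Model (declared as the
// root_type at the end of this file) before parsing it.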

// The possible model modes, represented as a bit field.
namespace libtextclassifier2;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
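// The combined values are bitwise ORs of the base flags, e.g.
// ANNOTATION_AND_CLASSIFICATION == ANNOTATION | CLASSIFICATION and
// ALL == ANNOTATION | CLASSIFICATION | SELECTION.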

namespace libtextclassifier2;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

namespace libtextclassifier2;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve only to inflate the selection. E.g. we might want to
  // select more text than is covered by the envelope of all extractor spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}

namespace libtextclassifier2;
table CompressedBuffer {
  buffer:[ubyte];
  uncompressed_size:int;
}

// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
  // If true, before the selection is returned, unpaired brackets contained in
  // the predicted selection are stripped from both ends of the selection.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = 1;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = 0;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
  // The name of the collection assigned to a match.
  collection_name:string;

  // The pattern to check.
  // Can specify a single capturing group used as match boundaries.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation based on Find()
  // instead of a true Match(). This approximate matching will use the first
  // Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = 0;

  compressed_pattern:libtextclassifier2.CompressedBuffer;
}
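// A hypothetical Pattern entry, sketched in flatc-JSON-like form purely for
// illustration (the collection name and regex below are made up and not part
// of any shipped model):
//   {
//     collection_name: "phone",
//     pattern: "(\\+?[0-9][0-9 ]{6,14}[0-9])",
//     target_classification_score: 1.0,
//     priority_score: 0.5
//   }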

namespace libtextclassifier2;
table RegexModel {
  patterns:[libtextclassifier2.RegexModel_.Pattern];
}

// List of regex patterns.
namespace libtextclassifier2.DatetimeModelPattern_;
table Regex {
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[libtextclassifier2.DatetimeGroupType];

  compressed_pattern:libtextclassifier2.CompressedBuffer;
}
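// A hypothetical Regex entry, sketched in JSON-like form (the pattern is made
// up):
//   { pattern: "([0-9]{1,2}):([0-9]{2})", groups: [GROUP_HOUR, GROUP_MINUTE] }
// Here the first capturing group would be parsed as the hour and the second
// as the minute.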

namespace libtextclassifier2;
table DatetimeModelPattern {
  regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}

namespace libtextclassifier2;
table DatetimeModelExtractor {
  extractor:libtextclassifier2.DatetimeExtractorType;
  pattern:string;
  locales:[int];
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

namespace libtextclassifier2;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[libtextclassifier2.DatetimeModelPattern];
  extractors:[libtextclassifier2.DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = 1;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];
}
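// As an illustration (hypothetical values): if DatetimeModel.locales is
// ["en-US", "de-DE"], a DatetimeModelPattern with locales: [1] is applied only
// to "de-DE" input, while one with an empty locales list is applied for all
// locales.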

namespace libtextclassifier2.DatetimeModelLibrary_;
table Item {
  key:string;
  value:libtextclassifier2.DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier2;
table DatetimeModelLibrary {
  models:[libtextclassifier2.DatetimeModelLibrary_.Item];
}

// Options controlling the output of the TensorFlow Lite models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}

// Options controlling the output of the classifier.
namespace libtextclassifier2;
table OutputOptions {
  // Lists of collection names that will be filtered out of the output:
  // - For annotation, the spans of the given collection are simply dropped.
  // - For classification, the result is mapped to the class "other".
  // - For selection, the spans of the given collection are returned as
  //   single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier2;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

  // TensorFlow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:libtextclassifier2.SelectionModelOptions;

  classification_options:libtextclassifier2.ClassificationModelOptions;
  regex_model:libtextclassifier2.RegexModel;
  datetime_model:libtextclassifier2.DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:libtextclassifier2.ModelTriggeringOptions;

  // Global switch that controls whether SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled, it returns empty/no-op results.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // If true, selections that consist only of whitespace are snapped to the
  // containing suggested span. Otherwise, no suggestion is proposed, since such
  // selections are not part of any token.
  snap_whitespace_selections:bool = 1;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:libtextclassifier2.OutputOptions;
}

// Role of the codepoints in the range.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and discards them. Good e.g. for the space
  // character.
  WHITESPACE_SEPARATOR = 7,
}
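// Numerically, TOKEN_SEPARATOR (3) equals SPLIT_BEFORE | SPLIT_AFTER, and
// WHITESPACE_SEPARATOR (7) equals SPLIT_BEFORE | SPLIT_AFTER |
// DISCARD_CODEPOINT, i.e. the common values are bit combinations of the basic
// roles.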

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
  start:int;
  end:int;
  role:libtextclassifier2.TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}
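// A hypothetical range entry, sketched in JSON-like form, that covers just the
// ASCII space character (codepoint 32; end is exclusive) and treats it as a
// whitespace separator:
//   { start: 32, end: 33, role: WHITESPACE_SEPARATOR }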

// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,
}

// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
  start:int;
  end:int;
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
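// For illustration (hypothetical values): with num_tokens_before = 2,
// num_tokens_inside_left = 1, num_tokens_inside_right = 1 and
// num_tokens_after = 2, features are extracted for the two tokens immediately
// preceding the span, the first and last token of the span, and the two tokens
// immediately following it.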

namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
  key:string;
  value:string;
}

namespace libtextclassifier2;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract, e.g. 2 means character bigrams,
  // 3 means character trigrams, etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = 0;

  // Whether to extract the token case feature.
  extract_case_feature:bool = 0;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = 0;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = 0;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = 1;

  // Collection names.
  collections:[string];

  // Index of the collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = 0;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = 0;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for tokens are concatenated first together,
  //   and at the end, the dense features for the tokens are concatenated
  //   to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = 0;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as a bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will also be split when the codepoint's script_id changes,
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = 0;
}

root_type libtextclassifier2.Model;