Home | History | Annotate | Download | only in number
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
     18 #define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
     19 
     20 #include <string>
     21 #include <unordered_set>
     22 #include <vector>
     23 
     24 #include "annotator/feature-processor.h"
     25 #include "annotator/model_generated.h"
     26 #include "annotator/types.h"
     27 #include "utils/utf8/unicodetext.h"
     28 
     29 namespace libtextclassifier3 {
     30 
     31 // Annotator of numbers in text.
     32 //
     33 // Only supports values in range [-999 999 999, 999 999 999] (inclusive).
     34 //
     35 // TODO(zilka): Add support for non-ASCII digits.
     36 // TODO(zilka): Add support for written-out numbers.
     37 class NumberAnnotator {
     38  public:
     39   explicit NumberAnnotator(const NumberAnnotatorOptions* options,
     40                            const FeatureProcessor* feature_processor)
     41       : options_(options),
     42         feature_processor_(feature_processor),
     43         allowed_prefix_codepoints_(
     44             FlatbuffersVectorToSet(options->allowed_prefix_codepoints())),
     45         allowed_suffix_codepoints_(
     46             FlatbuffersVectorToSet(options->allowed_suffix_codepoints())) {}
     47 
     48   // Classifies given text, and if it is a number, it passes the result in
     49   // 'classification_result' and returns true, otherwise returns false.
     50   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
     51                     AnnotationUsecase annotation_usecase,
     52                     ClassificationResult* classification_result) const;
     53 
     54   // Finds all number instances in the input text.
     55   bool FindAll(const UnicodeText& context_unicode,
     56                AnnotationUsecase annotation_usecase,
     57                std::vector<AnnotatedSpan>* result) const;
     58 
     59  private:
     60   static std::unordered_set<int> FlatbuffersVectorToSet(
     61       const flatbuffers::Vector<int32_t>* codepoints);
     62 
     63   // Parses the text to an int64 value and returns true if succeeded, otherwise
     64   // false. Also returns the number of prefix/suffix codepoints that were
     65   // stripped from the number.
     66   bool ParseNumber(const UnicodeText& text, int64* result,
     67                    int* num_prefix_codepoints,
     68                    int* num_suffix_codepoints) const;
     69 
     70   const NumberAnnotatorOptions* options_;
     71   const FeatureProcessor* feature_processor_;
     72   const std::unordered_set<int> allowed_prefix_codepoints_;
     73   const std::unordered_set<int> allowed_suffix_codepoints_;
     74 };
     75 
     76 }  // namespace libtextclassifier3
     77 
     78 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
     79