1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 19 20 #include <string> 21 #include <unordered_set> 22 #include <vector> 23 24 #include "annotator/feature-processor.h" 25 #include "annotator/model_generated.h" 26 #include "annotator/types.h" 27 #include "utils/utf8/unicodetext.h" 28 29 namespace libtextclassifier3 { 30 31 // Annotator of numbers in text. 32 // 33 // Only supports values in range [-999 999 999, 999 999 999] (inclusive). 34 // 35 // TODO(zilka): Add support for non-ASCII digits. 36 // TODO(zilka): Add support for written-out numbers. 37 class NumberAnnotator { 38 public: 39 explicit NumberAnnotator(const NumberAnnotatorOptions* options, 40 const FeatureProcessor* feature_processor) 41 : options_(options), 42 feature_processor_(feature_processor), 43 allowed_prefix_codepoints_( 44 FlatbuffersVectorToSet(options->allowed_prefix_codepoints())), 45 allowed_suffix_codepoints_( 46 FlatbuffersVectorToSet(options->allowed_suffix_codepoints())) {} 47 48 // Classifies given text, and if it is a number, it passes the result in 49 // 'classification_result' and returns true, otherwise returns false. 50 bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 51 AnnotationUsecase annotation_usecase, 52 ClassificationResult* classification_result) const; 53 54 // Finds all number instances in the input text. 55 bool FindAll(const UnicodeText& context_unicode, 56 AnnotationUsecase annotation_usecase, 57 std::vector<AnnotatedSpan>* result) const; 58 59 private: 60 static std::unordered_set<int> FlatbuffersVectorToSet( 61 const flatbuffers::Vector<int32_t>* codepoints); 62 63 // Parses the text to an int64 value and returns true if succeeded, otherwise 64 // false. Also returns the number of prefix/suffix codepoints that were 65 // stripped from the number. 66 bool ParseNumber(const UnicodeText& text, int64* result, 67 int* num_prefix_codepoints, 68 int* num_suffix_codepoints) const; 69 70 const NumberAnnotatorOptions* options_; 71 const FeatureProcessor* feature_processor_; 72 const std::unordered_set<int> allowed_prefix_codepoints_; 73 const std::unordered_set<int> allowed_suffix_codepoints_; 74 }; 75 76 } // namespace libtextclassifier3 77 78 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 79