Home | History | Annotate | Download | only in lang_id
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
     18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
     19 
     20 
     21 #include <stddef.h>
     22 
     23 #include <memory>
     24 #include <string>
     25 #include <utility>
     26 #include <vector>
     27 
     28 #include "lang_id/common/lite_base/macros.h"
     29 #include "lang_id/model-provider.h"
     30 
     31 namespace libtextclassifier3 {
     32 namespace mobile {
     33 namespace lang_id {
     34 
// Forward declaration of the class that performs all underlying work.  LangId
// only holds it through its private pimpl_ member, so the full definition is
// not needed in this header.
class LangIdImpl;
     37 
     38 struct LangIdResult {
     39   // An n-best list of possible language codes for a given input sorted in
     40   // descending order according to each code's respective probability.
     41   //
     42   // This list is guaranteed to be non-empty after calling
     43   // LangId::FindLanguages.  The most likely language code is always the first
     44   // item in this array.
     45   //
     46   // If the model cannot make a prediction, this array contains a single result:
     47   // a language code LangId::kUnknownLanguageCode with probability 1.
     48   std::vector<std::pair<string, float>> predictions;
     49 };
     50 
     51 // Class for detecting the language of a document.
     52 //
     53 // Note: this class does not handle the details of loading the actual model.
     54 // Those details have been "outsourced" to the ModelProvider class.
     55 //
     56 // This class is thread safe.
     57 class LangId {
     58  public:
     59   // Standard BCP-47 language code for Unknown/Undetermined language.
     60   static const char kUnknownLanguageCode[];
     61 
     62   // Constructs a LangId object, based on |model_provider|.
     63   //
     64   // Note: we don't crash if we detect a problem at construction time (e.g., the
     65   // model provider can't read an underlying file).  Instead, we mark the
     66   // newly-constructed object as invalid; clients can invoke FindLanguage() on
     67   // an invalid object: nothing crashes, but accuracy will be bad.
     68   explicit LangId(std::unique_ptr<ModelProvider> model_provider);
     69 
     70   virtual ~LangId();
     71 
     72   // Computes the an n-best list of language codes and probabilities
     73   // corresponding to the most likely languages the given input text is written
     74   // in. The list is sorted in descending order by language probability.
     75   //
     76   // The input text consists of the |num_bytes| bytes that starts at |data|.
     77   //
     78   // Note: If this LangId object is not valid (see is_valid()) or if this LangId
     79   // object can't make a prediction, this method sets the LangIdResult to
     80   // contain a single entry with kUnknownLanguageCode with probability 1.
     81   void FindLanguages(const char *data, size_t num_bytes,
     82                      LangIdResult *result) const;
     83 
     84   // Convenience version of FindLanguages(const char *, size_t, LangIdResult *).
     85   void FindLanguages(const string &text, LangIdResult *result) const {
     86     FindLanguages(text.data(), text.size(), result);
     87   }
     88 
     89   // Returns language code for the most likely language for a piece of text.
     90   //
     91   // The input text consists of the |num_bytes| bytes that start at |data|.
     92   //
     93   // Note: this method reports the most likely (1-best) language only if its
     94   // probability is high enough; otherwise, it returns
     95   // LangId::kUnknownLanguageCode.  The specific probability threshold is tuned
     96   // to the needs of an early client.  If you need a different threshold, you
     97   // can use FindLanguages (plural) to get the full LangIdResult, and apply your
     98   // own threshold.
     99   //
    100   // Note: if this LangId object is not valid (see is_valid()) or if this LangId
    101   // object can't make a prediction, then this method returns
    102   // LangId::kUnknownLanguageCode.
    103   //
    104   string FindLanguage(const char *data, size_t num_bytes) const;
    105 
    106   // Convenience version of FindLanguage(const char *, size_t).
    107   string FindLanguage(const string &text) const {
    108     return FindLanguage(text.data(), text.size());
    109   }
    110 
    111   // Returns true if this object has been correctly initialized and is ready to
    112   // perform predictions.  For more info, see doc for LangId
    113   // constructor above.
    114   bool is_valid() const;
    115 
    116   // Returns the version of the model used by this LangId object.  On success,
    117   // the returned version number is a strictly positive integer.  Returns 0 if
    118   // the model version can not be determined (e.g., for old models that do not
    119   // specify a version number).
    120   int GetModelVersion() const;
    121 
    122   // Returns a typed property stored in the model file.
    123   float GetFloatProperty(const string &property, float default_value) const;
    124 
    125  private:
    126   // Pimpl ("pointer to implementation") pattern, to hide all internals from our
    127   // clients.
    128   std::unique_ptr<LangIdImpl> pimpl_;
    129 
    130   SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId);
    131 };
    132 
    133 }  // namespace lang_id
    134 }  // namespace mobile
}  // namespace libtextclassifier3
    136 
    137 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
    138