/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ #include <stddef.h> #include <memory> #include <string> #include <utility> #include <vector> #include "lang_id/common/lite_base/macros.h" #include "lang_id/model-provider.h" namespace libtextclassifier3 { namespace mobile { namespace lang_id { // Forward-declaration of the class that performs all underlying work. class LangIdImpl; struct LangIdResult { // An n-best list of possible language codes for a given input sorted in // descending order according to each code's respective probability. // // This list is guaranteed to be non-empty after calling // LangId::FindLanguages. The most likely language code is always the first // item in this array. // // If the model cannot make a prediction, this array contains a single result: // a language code LangId::kUnknownLanguageCode with probability 1. std::vector<std::pair<string, float>> predictions; }; // Class for detecting the language of a document. // // Note: this class does not handle the details of loading the actual model. // Those details have been "outsourced" to the ModelProvider class. // // This class is thread safe. class LangId { public: // Standard BCP-47 language code for Unknown/Undetermined language. static const char kUnknownLanguageCode[]; // Constructs a LangId object, based on |model_provider|. // // Note: we don't crash if we detect a problem at construction time (e.g., the // model provider can't read an underlying file). Instead, we mark the // newly-constructed object as invalid; clients can invoke FindLanguage() on // an invalid object: nothing crashes, but accuracy will be bad. explicit LangId(std::unique_ptr<ModelProvider> model_provider); virtual ~LangId(); // Computes the an n-best list of language codes and probabilities // corresponding to the most likely languages the given input text is written // in. The list is sorted in descending order by language probability. // // The input text consists of the |num_bytes| bytes that starts at |data|. // // Note: If this LangId object is not valid (see is_valid()) or if this LangId // object can't make a prediction, this method sets the LangIdResult to // contain a single entry with kUnknownLanguageCode with probability 1. void FindLanguages(const char *data, size_t num_bytes, LangIdResult *result) const; // Convenience version of FindLanguages(const char *, size_t, LangIdResult *). void FindLanguages(const string &text, LangIdResult *result) const { FindLanguages(text.data(), text.size(), result); } // Returns language code for the most likely language for a piece of text. // // The input text consists of the |num_bytes| bytes that start at |data|. // // Note: this method reports the most likely (1-best) language only if its // probability is high enough; otherwise, it returns // LangId::kUnknownLanguageCode. The specific probability threshold is tuned // to the needs of an early client. If you need a different threshold, you // can use FindLanguages (plural) to get the full LangIdResult, and apply your // own threshold. // // Note: if this LangId object is not valid (see is_valid()) or if this LangId // object can't make a prediction, then this method returns // LangId::kUnknownLanguageCode. // string FindLanguage(const char *data, size_t num_bytes) const; // Convenience version of FindLanguage(const char *, size_t). string FindLanguage(const string &text) const { return FindLanguage(text.data(), text.size()); } // Returns true if this object has been correctly initialized and is ready to // perform predictions. For more info, see doc for LangId // constructor above. bool is_valid() const; // Returns the version of the model used by this LangId object. On success, // the returned version number is a strictly positive integer. Returns 0 if // the model version can not be determined (e.g., for old models that do not // specify a version number). int GetModelVersion() const; // Returns a typed property stored in the model file. float GetFloatProperty(const string &property, float default_value) const; private: // Pimpl ("pointer to implementation") pattern, to hide all internals from our // clients. std::unique_ptr<LangIdImpl> pimpl_; SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId); }; } // namespace lang_id } // namespace mobile } // namespace nlp_saft #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_