// C++ source  |  138 lines  |  5.1 KB

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
#define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_


#include <stddef.h>

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "lang_id/common/lite_base/macros.h"
#include "lang_id/model-provider.h"

namespace libtextclassifier3 {
namespace mobile {
namespace lang_id {

// Forward declaration of LangIdImpl, the class that performs all underlying
// work.  A forward declaration is sufficient in this header because LangId
// refers to LangIdImpl only through its std::unique_ptr member (pimpl_).
class LangIdImpl;

// Result of a language-identification request: an n-best list of
// (language code, probability) pairs.
struct LangIdResult {
  // An n-best list of possible language codes for a given input, sorted in
  // descending order according to each code's respective probability.  Each
  // entry is a (language code, probability) pair.
  //
  // This list is guaranteed to be non-empty after calling
  // LangId::FindLanguages.  The most likely language code is always the first
  // item in this array.
  //
  // If the model cannot make a prediction, this array contains a single result:
  // a language code LangId::kUnknownLanguageCode with probability 1.
  //
  // Note: the string type is spelled std::string explicitly so that this header
  // does not depend on an unqualified |string| alias being visible.
  std::vector<std::pair<std::string, float>> predictions;
};

// Class for detecting the language of a document.
//
// Note: this class does not handle the details of loading the actual model.
// Those details have been "outsourced" to the ModelProvider class.
//
// This class is thread safe.
class LangId {
 public:
  // Standard BCP-47 language code for Unknown/Undetermined language.
  static const char kUnknownLanguageCode[];

  // Constructs a LangId object, based on |model_provider|.  Ownership of
  // |model_provider| is transferred to this constructor.
  //
  // Note: we don't crash if we detect a problem at construction time (e.g., the
  // model provider can't read an underlying file).  Instead, we mark the
  // newly-constructed object as invalid (see is_valid()); clients can still
  // invoke FindLanguage() / FindLanguages() on an invalid object: nothing
  // crashes, but accuracy will be bad.
  explicit LangId(std::unique_ptr<ModelProvider> model_provider);

  virtual ~LangId();

  // Computes an n-best list of language codes and probabilities corresponding
  // to the most likely languages the given input text is written in.  The list
  // is sorted in descending order by language probability.
  //
  // The input text consists of the |num_bytes| bytes that start at |data|.
  // The output is written to |*result|.
  //
  // Note: If this LangId object is not valid (see is_valid()) or if this LangId
  // object can't make a prediction, this method sets the LangIdResult to
  // contain a single entry with kUnknownLanguageCode with probability 1.
  void FindLanguages(const char *data, size_t num_bytes,
                     LangIdResult *result) const;

  // Convenience version of FindLanguages(const char *, size_t, LangIdResult *)
  // that takes the input text as a string.
  void FindLanguages(const std::string &text, LangIdResult *result) const {
    FindLanguages(text.data(), text.size(), result);
  }

  // Returns language code for the most likely language for a piece of text.
  //
  // The input text consists of the |num_bytes| bytes that start at |data|.
  //
  // Note: this method reports the most likely (1-best) language only if its
  // probability is high enough; otherwise, it returns
  // LangId::kUnknownLanguageCode.  The specific probability threshold is tuned
  // to the needs of an early client.  If you need a different threshold, you
  // can use FindLanguages (plural) to get the full LangIdResult, and apply your
  // own threshold.
  //
  // Note: if this LangId object is not valid (see is_valid()) or if this LangId
  // object can't make a prediction, then this method returns
  // LangId::kUnknownLanguageCode.
  std::string FindLanguage(const char *data, size_t num_bytes) const;

  // Convenience version of FindLanguage(const char *, size_t) that takes the
  // input text as a string.
  std::string FindLanguage(const std::string &text) const {
    return FindLanguage(text.data(), text.size());
  }

  // Returns true if this object has been correctly initialized and is ready to
  // perform predictions.  For more info, see doc for LangId
  // constructor above.
  bool is_valid() const;

  // Returns the version of the model used by this LangId object.  On success,
  // the returned version number is a strictly positive integer.  Returns 0 if
  // the model version can not be determined (e.g., for old models that do not
  // specify a version number).
  int GetModelVersion() const;

  // Returns a typed (float) property stored in the model file, looked up by
  // name |property|.
  //
  // NOTE(review): |default_value| is presumably what gets returned when the
  // property is not present in the model -- confirm against the
  // implementation.
  float GetFloatProperty(const std::string &property,
                         float default_value) const;

 private:
  // Pimpl ("pointer to implementation") pattern, to hide all internals from our
  // clients.
  std::unique_ptr<LangIdImpl> pimpl_;

  SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId);
};

}  // namespace lang_id
}  // namespace mobile
}  // namespace libtextclassifier3

#endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_