// Plain text  |  403 lines  |  13.26 KB

/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/lang-id.h"

#include <stdio.h>

#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <vector>

#include "common/algorithm.h"
#include "common/embedding-network-params-from-proto.h"
#include "common/embedding-network.pb.h"
#include "common/embedding-network.h"
#include "common/feature-extractor.h"
#include "common/file-utils.h"
#include "common/list-of-strings.pb.h"
#include "common/memory_image/in-memory-model-data.h"
#include "common/mmap.h"
#include "common/softmax.h"
#include "common/task-context.h"
#include "lang_id/custom-tokenizer.h"
#include "lang_id/lang-id-brain-interface.h"
#include "lang_id/language-identifier-features.h"
#include "lang_id/light-sentence-features.h"
#include "lang_id/light-sentence.h"
#include "lang_id/relevant-script-feature.h"
#include "util/base/logging.h"
#include "util/base/macros.h"

using ::libtextclassifier::nlp_core::file_utils::ParseProtoFromMemory;

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

namespace {
// Probability threshold used until a model parameter or a call to
// LangId::SetProbabilityThreshold() overrides it; see the comments for
// LangId::SetProbabilityThreshold().
constexpr float kDefaultProbabilityThreshold = 0.50f;

// Minimum amount of text (in bytes) required for a meaningful prediction;
// below this size our model can't provide one.  Used as the default value.
constexpr int kDefaultMinTextSizeInBytes = 20;

// Default language that LangId::FindLanguage() starts with.  Each LangId
// object can replace it individually via LangId::SetDefaultLanguage().
constexpr char kInitialDefaultLanguage[] = "";

// Sums up the byte sizes of all words from |sentence|, leaving out the two
// marker bytes that wrap each word: ^ (start-of-word) and $ (end-of-word).
// Note: "real text" means that whitespace and punctuation characters from the
// original text are ignored.
int GetRealTextSize(const LightSentence &sentence) {
  int num_bytes = 0;
  int index = 0;
  while (index < sentence.num_words()) {
    const auto &word = sentence.word(index);
    // Each word is expected to be non-empty and wrapped as ^...$.
    TC_DCHECK(!word.empty());
    TC_DCHECK_EQ('^', word.front());
    TC_DCHECK_EQ('$', word.back());
    // Discount the two marker bytes.
    num_bytes += word.size() - 2;
    ++index;
  }
  return num_bytes;
}

}  // namespace

// Class that performs all work behind LangId.
//
// An instance can make predictions only if is_valid() returns true, which
// happens only after Initialize() ran to completion.  On an invalid instance,
// ScoreLanguages() returns no scores, so FindLanguage() returns the default
// language and FindLanguages() returns a single low-probability entry for it.
class LangIdImpl {
 public:
  // Loads the model from the file |filename|.  If the model bytes can't be
  // read, logs an error and leaves this object invalid.
  explicit LangIdImpl(const std::string &filename) {
    // Using mmap as a fast way to read the model bytes.
    ScopedMmap scoped_mmap(filename);
    MmapHandle mmap_handle = scoped_mmap.handle();
    if (!mmap_handle.ok()) {
      TC_LOG(ERROR) << "Unable to read model bytes.";
      return;
    }

    Initialize(mmap_handle.to_stringpiece());
  }

  // Loads the model from the file descriptor |fd|.  If the model bytes can't
  // be read, logs an error and leaves this object invalid.
  explicit LangIdImpl(int fd) {
    // Using mmap as a fast way to read the model bytes.
    ScopedMmap scoped_mmap(fd);
    MmapHandle mmap_handle = scoped_mmap.handle();
    if (!mmap_handle.ok()) {
      TC_LOG(ERROR) << "Unable to read model bytes.";
      return;
    }

    Initialize(mmap_handle.to_stringpiece());
  }

  // Loads the model from the |length| bytes that start at |ptr|.
  LangIdImpl(const char *ptr, size_t length) {
    Initialize(StringPiece(ptr, length));
  }

  // Initializes this object from the serialized model |model_bytes|.  Each
  // failing step returns early, which leaves is_valid() false.
  void Initialize(StringPiece model_bytes) {
    // Will set valid_ to true only on successful initialization.
    valid_ = false;

    // Make sure all relevant features are registered:
    ContinuousBagOfNgramsFunction::RegisterClass();
    RelevantScriptFeature::RegisterClass();

    // NOTE(salcianu): code below relies on the fact that the current features
    // do not rely on data from a TaskInput.  Otherwise, one would have to use
    // the more complex model registration mechanism, which requires more code.
    InMemoryModelData model_data(model_bytes);
    TaskContext context;
    if (!model_data.GetTaskSpec(context.mutable_spec())) {
      TC_LOG(ERROR) << "Unable to get model TaskSpec";
      return;
    }

    if (!ParseNetworkParams(model_data, &context)) {
      return;
    }
    if (!ParseListOfKnownLanguages(model_data, &context)) {
      return;
    }

    network_.reset(new EmbeddingNetwork(network_params_.get()));
    if (!network_->is_valid()) {
      return;
    }

    // Read the tunable parameters from the context, passing our built-in
    // defaults to context.Get().
    probability_threshold_ =
        context.Get("reliability_thresh", kDefaultProbabilityThreshold);
    min_text_size_in_bytes_ =
        context.Get("min_text_size_in_bytes", kDefaultMinTextSizeInBytes);
    version_ = context.Get("version", 0);

    if (!lang_id_brain_interface_.Init(&context)) {
      return;
    }
    valid_ = true;
  }

  // Sets the minimum probability a prediction needs in order to be reported
  // by FindLanguage().
  void SetProbabilityThreshold(float threshold) {
    probability_threshold_ = threshold;
  }

  // Sets the language code reported when no reliable prediction is available.
  void SetDefaultLanguage(const std::string &lang) { default_language_ = lang; }

  // Returns the language with the highest probability for |text|.  Returns the
  // default language if no scores could be computed or if that highest
  // probability is below the probability threshold.
  std::string FindLanguage(const std::string &text) const {
    std::vector<float> scores = ScoreLanguages(text);
    if (scores.empty()) {
      return default_language_;
    }

    // Softmax label with max score.
    int label = GetArgMax(scores);
    float probability = scores[label];
    if (probability < probability_threshold_) {
      return default_language_;
    }
    return GetLanguageForSoftmaxLabel(label);
  }

  // Returns one (language, probability) pair per softmax label for |text|.
  // The result is never empty: if no scores could be computed, it contains a
  // single entry for the default language.
  std::vector<std::pair<std::string, float>> FindLanguages(
      const std::string &text) const {
    const std::vector<float> scores = ScoreLanguages(text);

    std::vector<std::pair<std::string, float>> result;
    result.reserve(scores.size());
    // Labels are ints (see GetLanguageForSoftmaxLabel()), so convert the size
    // once instead of comparing a signed index against an unsigned size.
    const int num_scores = static_cast<int>(scores.size());
    for (int i = 0; i < num_scores; ++i) {
      result.push_back({GetLanguageForSoftmaxLabel(i), scores[i]});
    }

    // To avoid crashing clients that always expect at least one predicted
    // language, we promised (see doc for this method) that the result always
    // contains at least one element.
    if (result.empty()) {
      // We use a tiny probability, such that any client that uses a meaningful
      // probability threshold ignores this prediction.  We don't use 0.0f, to
      // avoid crashing clients that normalize the probabilities we return here.
      result.push_back({default_language_, 0.001f});
    }
    return result;
  }

  // Returns the softmax scores for |text|, indexed by softmax label.  Returns
  // an empty vector if this object is not valid or if |text| contains less
  // real text than min_text_size_in_bytes_.
  std::vector<float> ScoreLanguages(const std::string &text) const {
    if (!is_valid()) {
      return {};
    }

    // Create a Sentence storing the input text.
    LightSentence sentence;
    TokenizeTextForLangId(text, &sentence);

    if (GetRealTextSize(sentence) < min_text_size_in_bytes_) {
      return {};
    }

    // TODO(salcianu): reuse vector<FeatureVector>.
    std::vector<FeatureVector> features(
        lang_id_brain_interface_.NumEmbeddings());
    lang_id_brain_interface_.GetFeatures(&sentence, &features);

    // Predict language.
    EmbeddingNetwork::Vector scores;
    network_->ComputeFinalScores(features, &scores);

    return ComputeSoftmax(scores);
  }

  // Returns true if this object is ready to perform language predictions.
  bool is_valid() const { return valid_; }

  // Returns the version of the model: the "version" value read by
  // Initialize(), or -1 if Initialize() did not get that far.
  int version() const { return version_; }

 private:
  // Returns name of the (in-memory) file for the indicated TaskInput from
  // context.  Logs an error and returns an empty string if that TaskInput does
  // not have exactly one part.
  static std::string GetInMemoryFileNameForTaskInput(
      const std::string &input_name, TaskContext *context) {
    TaskInput *task_input = context->GetInput(input_name);
    if (task_input->part_size() != 1) {
      TC_LOG(ERROR) << "TaskInput " << input_name << " has "
                    << task_input->part_size() << " parts";
      return "";
    }
    return task_input->part(0).file_pattern();
  }

  // Parses the EmbeddingNetworkProto stored under the TaskInput
  // "language-identifier-network" and builds network_params_ from it.  Returns
  // true on success; on failure, logs an error and returns false.
  bool ParseNetworkParams(const InMemoryModelData &model_data,
                          TaskContext *context) {
    const std::string input_name = "language-identifier-network";
    const std::string input_file_name =
        GetInMemoryFileNameForTaskInput(input_name, context);
    if (input_file_name.empty()) {
      TC_LOG(ERROR) << "No input file name for TaskInput " << input_name;
      return false;
    }
    StringPiece bytes = model_data.GetBytesForInputFile(input_file_name);
    if (bytes.data() == nullptr) {
      TC_LOG(ERROR) << "Unable to get bytes for TaskInput " << input_name;
      return false;
    }
    std::unique_ptr<EmbeddingNetworkProto> proto(new EmbeddingNetworkProto());
    if (!ParseProtoFromMemory(bytes, proto.get())) {
      TC_LOG(ERROR) << "Unable to parse EmbeddingNetworkProto";
      return false;
    }
    // Ownership of the parsed proto is handed over to the params object.
    network_params_.reset(
        new EmbeddingNetworkParamsFromProto(std::move(proto)));
    if (!network_params_->is_valid()) {
      TC_LOG(ERROR) << "EmbeddingNetworkParamsFromProto not valid";
      return false;
    }
    return true;
  }

  // Parses dictionary with known languages (i.e., field languages_) from a
  // TaskInput of context.  Note: that TaskInput should be a ListOfStrings proto
  // with a single element, the serialized form of a ListOfStrings.
  //
  // Returns true on success; on failure, logs an error and returns false.
  bool ParseListOfKnownLanguages(const InMemoryModelData &model_data,
                                 TaskContext *context) {
    const std::string input_name = "language-name-id-map";
    const std::string input_file_name =
        GetInMemoryFileNameForTaskInput(input_name, context);
    if (input_file_name.empty()) {
      TC_LOG(ERROR) << "No input file name for TaskInput " << input_name;
      return false;
    }
    StringPiece bytes = model_data.GetBytesForInputFile(input_file_name);
    if (bytes.data() == nullptr) {
      TC_LOG(ERROR) << "Unable to get bytes for TaskInput " << input_name;
      return false;
    }
    // Outer proto: a list with exactly one element.
    ListOfStrings records;
    if (!ParseProtoFromMemory(bytes, &records)) {
      TC_LOG(ERROR) << "Unable to parse ListOfStrings from TaskInput "
                    << input_name;
      return false;
    }
    if (records.element_size() != 1) {
      TC_LOG(ERROR) << "Wrong number of records in TaskInput " << input_name
                    << " : " << records.element_size();
      return false;
    }
    // Inner proto: that single element is itself a serialized ListOfStrings,
    // holding the actual language codes.
    if (!ParseProtoFromMemory(std::string(records.element(0)), &languages_)) {
      TC_LOG(ERROR) << "Unable to parse dictionary with known languages";
      return false;
    }
    return true;
  }

  // Returns language code for a softmax label.  See comments for languages_
  // field.  If label is out of range, logs an error and returns
  // default_language_.
  std::string GetLanguageForSoftmaxLabel(int label) const {
    if ((label >= 0) && (label < languages_.element_size())) {
      return languages_.element(label);
    } else {
      TC_LOG(ERROR) << "Softmax label " << label << " outside range [0, "
                    << languages_.element_size() << ")";
      return default_language_;
    }
  }

  // Extracts the features that are fed to network_ (see ScoreLanguages()).
  LangIdBrainInterface lang_id_brain_interface_;

  // Parameters for the neural network network_ (see below).
  std::unique_ptr<EmbeddingNetworkParamsFromProto> network_params_;

  // Neural network to use for scoring.
  std::unique_ptr<EmbeddingNetwork> network_;

  // True if this object is ready to perform language predictions.
  //
  // Must be initialized here: the constructors that mmap the model return
  // before calling Initialize() if the mmap fails, and is_valid() is read
  // right after construction.  Without this initializer, that read would see
  // an indeterminate value.
  bool valid_ = false;

  // Only predictions with a probability (confidence) above this threshold are
  // reported.  Otherwise, we report default_language_.
  float probability_threshold_ = kDefaultProbabilityThreshold;

  // Min size of the input text for our predictions to be meaningful.  Below
  // this threshold, the underlying model may report a wrong language and a high
  // confidence score.
  int min_text_size_in_bytes_ = kDefaultMinTextSizeInBytes;

  // Version of the model.
  int version_ = -1;

  // Known languages: softmax label i (an integer) means languages_.element(i)
  // (something like "en", "fr", "ru", etc).
  ListOfStrings languages_;

  // Language code to return in case of errors.
  std::string default_language_ = kInitialDefaultLanguage;

  TC_DISALLOW_COPY_AND_ASSIGN(LangIdImpl);
};

// Creates a LangId backed by the model from the file |filename|.  If no valid
// model could be loaded, this only logs an error: the object is still
// constructed.
LangId::LangId(const std::string &filename) : pimpl_(new LangIdImpl(filename)) {
  if (pimpl_->is_valid()) {
    return;
  }
  TC_LOG(ERROR) << "Unable to construct a valid LangId based "
                << "on the data from " << filename
                << "; nothing should crash, but "
                << "accuracy will be bad.";
}

// Creates a LangId backed by the model read from the file descriptor |fd|.
// If no valid model could be loaded, this only logs an error: the object is
// still constructed.
LangId::LangId(int fd) : pimpl_(new LangIdImpl(fd)) {
  if (pimpl_->is_valid()) {
    return;
  }
  TC_LOG(ERROR) << "Unable to construct a valid LangId based "
                << "on the data from descriptor " << fd
                << "; nothing should crash, "
                << "but accuracy will be bad.";
}

// Creates a LangId backed by the model stored in the |length| bytes starting
// at |ptr|.  If no valid model could be loaded, this only logs an error: the
// object is still constructed.
LangId::LangId(const char *ptr, size_t length)
    : pimpl_(new LangIdImpl(ptr, length)) {
  if (pimpl_->is_valid()) {
    return;
  }
  TC_LOG(ERROR) << "Unable to construct a valid LangId based "
                << "on the memory region; nothing should crash, "
                << "but accuracy will be bad.";
}

// Defaulted destructor, defined in this file after the definition of
// LangIdImpl.  NOTE(review): presumably pimpl_ is a smart pointer to
// LangIdImpl that needs the complete type here -- confirm against lang-id.h.
LangId::~LangId() = default;

// Forwards |threshold| to the implementation.  LangIdImpl::FindLanguage()
// returns the default language for predictions whose probability is below
// this threshold.
void LangId::SetProbabilityThreshold(float threshold) {
  pimpl_->SetProbabilityThreshold(threshold);
}

// Forwards |lang| to the implementation, which returns it as the language
// code whenever it can't report a reliable prediction.
void LangId::SetDefaultLanguage(const std::string &lang) {
  pimpl_->SetDefaultLanguage(lang);
}

// Returns the result of LangIdImpl::FindLanguage() for |text|: the language
// with the highest probability, or the default language if no scores are
// available or that probability is below the probability threshold.
std::string LangId::FindLanguage(const std::string &text) const {
  return pimpl_->FindLanguage(text);
}

// Returns the result of LangIdImpl::FindLanguages() for |text|: a list of
// (language, probability) pairs that always contains at least one element.
std::vector<std::pair<std::string, float>> LangId::FindLanguages(
    const std::string &text) const {
  return pimpl_->FindLanguages(text);
}

// Returns whether the underlying implementation reports itself as valid,
// i.e., ready to perform language predictions.
bool LangId::is_valid() const { return pimpl_->is_valid(); }

// Returns the model version reported by the underlying implementation.
int LangId::version() const { return pimpl_->version(); }

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier