普通文本  |  133行  |  5.38 KB

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "actions/feature-processor.h"

namespace libtextclassifier3 {
namespace {
TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
    const ActionsTokenFeatureProcessorOptions* const options) {
  TokenFeatureExtractorOptions extractor_options;
  extractor_options.num_buckets = options->num_buckets();
  if (options->chargram_orders() != nullptr) {
    for (int order : *options->chargram_orders()) {
      extractor_options.chargram_orders.push_back(order);
    }
  }
  extractor_options.max_word_length = options->max_token_length();
  extractor_options.extract_case_feature = options->extract_case_feature();
  extractor_options.unicode_aware_features = options->unicode_aware_features();
  extractor_options.extract_selection_mask_feature = false;
  if (options->regexp_features() != nullptr) {
    for (const auto& regexp_feauture : *options->regexp_features()) {
      extractor_options.regexp_features.push_back(regexp_feauture->str());
    }
  }
  extractor_options.remap_digits = options->remap_digits();
  extractor_options.lowercase_tokens = options->lowercase_tokens();
  return extractor_options;
}
}  // namespace

std::unique_ptr<Tokenizer> CreateTokenizer(
    const ActionsTokenizerOptions* options, const UniLib* unilib) {
  std::vector<const TokenizationCodepointRange*> codepoint_config;
  if (options->tokenization_codepoint_config() != nullptr) {
    codepoint_config.insert(codepoint_config.end(),
                            options->tokenization_codepoint_config()->begin(),
                            options->tokenization_codepoint_config()->end());
  }
  std::vector<const CodepointRange*> internal_codepoint_config;
  if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
    internal_codepoint_config.insert(
        internal_codepoint_config.end(),
        options->internal_tokenizer_codepoint_ranges()->begin(),
        options->internal_tokenizer_codepoint_ranges()->end());
  }
  const bool tokenize_on_script_change =
      options->tokenization_codepoint_config() != nullptr &&
      options->tokenize_on_script_change();
  return std::unique_ptr<Tokenizer>(new Tokenizer(
      options->type(), unilib, codepoint_config, internal_codepoint_config,
      tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
}

ActionsFeatureProcessor::ActionsFeatureProcessor(
    const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
    : options_(options),
      tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
      token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
                               *unilib) {}

int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
  return options_->embedding_size() +
         token_feature_extractor_.DenseFeaturesCount();
}

bool ActionsFeatureProcessor::AppendFeatures(
    const std::vector<int>& sparse_features,
    const std::vector<float>& dense_features,
    const EmbeddingExecutor* embedding_executor,
    std::vector<float>* output_features) const {
  // Embed the sparse features, appending them directly to the output.
  const int embedding_size = options_->embedding_size();
  output_features->resize(output_features->size() + embedding_size);
  float* output_features_end =
      output_features->data() + output_features->size();
  if (!embedding_executor->AddEmbedding(
          TensorView<int>(sparse_features.data(),
                          {static_cast<int>(sparse_features.size())}),
          /*dest=*/output_features_end - embedding_size,
          /*dest_size=*/embedding_size)) {
    TC3_LOG(ERROR) << "Could not embed token's sparse features.";
    return false;
  }

  // Append the dense features to the output.
  output_features->insert(output_features->end(), dense_features.begin(),
                          dense_features.end());
  return true;
}

bool ActionsFeatureProcessor::AppendTokenFeatures(
    const Token& token, const EmbeddingExecutor* embedding_executor,
    std::vector<float>* output_features) const {
  // Extract the sparse and dense features.
  std::vector<int> sparse_features;
  std::vector<float> dense_features;
  if (!token_feature_extractor_.Extract(token, /*(unused) is_in_span=*/false,
                                        &sparse_features, &dense_features)) {
    TC3_LOG(ERROR) << "Could not extract token's features.";
    return false;
  }
  return AppendFeatures(sparse_features, dense_features, embedding_executor,
                        output_features);
}

bool ActionsFeatureProcessor::AppendTokenFeatures(
    const std::vector<Token>& tokens,
    const EmbeddingExecutor* embedding_executor,
    std::vector<float>* output_features) const {
  for (const Token& token : tokens) {
    if (!AppendTokenFeatures(token, embedding_executor, output_features)) {
      return false;
    }
  }
  return true;
}

}  // namespace libtextclassifier3