// (File-viewer metadata, not part of the source: plain text | 199 lines | 6.04 KB)

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <fstream>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

#include "utils/sentencepiece/double_array_trie.h"
#include "utils/sentencepiece/normalizer.h"
#include "utils/sentencepiece/test_utils.h"
#include "utils/strings/stringpiece.h"

namespace libtextclassifier3 {
namespace {

// Returns the path of the normalizer test config file.
// In this file the path is empty (presumably a placeholder to be filled in
// by the build environment -- TODO confirm).
std::string GetTestConfigPath() {
  return std::string();
}

// Exercises a normalizer built with dummy prefix, extra-whitespace removal
// and whitespace escaping all enabled, against fixed expected outputs.
TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
  using CharIterator = std::istreambuf_iterator<char>;

  // Read the complete contents of the config stream; they form the spec.
  std::ifstream spec_file(GetTestConfigPath());
  std::string spec(CharIterator{spec_file}, CharIterator{});
  SentencePieceNormalizer normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/true, /*remove_extra_whitespaces=*/true,
      /*escape_whitespaces=*/true);

  // Plain input with a single space.
  std::string basic;
  EXPECT_TRUE(normalizer.Normalize("hello there", &basic));
  EXPECT_EQ(basic, "▁hello▁there");

  // Redundant whitespace.
  std::string squeezed;
  EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &squeezed));
  EXPECT_EQ(squeezed, "▁when▁is▁the▁world▁cup?");

  // Different whitespace (tab instead of space).
  std::string tabbed;
  EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &tabbed));
  EXPECT_EQ(tabbed, "▁general▁kenobi");

  // NFKC: one character expands to several.
  std::string expanded;
  EXPECT_TRUE(normalizer.Normalize("㍿", &expanded));
  EXPECT_EQ(expanded, "▁株式会社");

  // Half width katakana, character composition happens.
  std::string composed;
  EXPECT_TRUE(normalizer.Normalize(" グーグル ", &composed));
  EXPECT_EQ(composed, "▁グーグル");

  // NFKC: each character maps to a single character.
  std::string digits;
  EXPECT_TRUE(normalizer.Normalize("①②③", &digits));
  EXPECT_EQ(digits, "▁123");
}

// Same inputs as the reference test, but with add_dummy_prefix disabled, so
// the expected outputs carry no leading "▁".
TEST(NormalizerTest, NoDummyPrefix) {
  using CharIterator = std::istreambuf_iterator<char>;

  // Read the complete contents of the config stream; they form the spec.
  std::ifstream spec_file(GetTestConfigPath());
  std::string spec(CharIterator{spec_file}, CharIterator{});
  SentencePieceNormalizer normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/true,
      /*escape_whitespaces=*/true);

  // Plain input with a single space.
  std::string basic;
  EXPECT_TRUE(normalizer.Normalize("hello there", &basic));
  EXPECT_EQ(basic, "hello▁there");

  // Redundant whitespace.
  std::string squeezed;
  EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &squeezed));
  EXPECT_EQ(squeezed, "when▁is▁the▁world▁cup?");

  // Different whitespace (tab instead of space).
  std::string tabbed;
  EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &tabbed));
  EXPECT_EQ(tabbed, "general▁kenobi");

  // NFKC: one character expands to several.
  std::string expanded;
  EXPECT_TRUE(normalizer.Normalize("㍿", &expanded));
  EXPECT_EQ(expanded, "株式会社");

  // Half width katakana, character composition happens.
  std::string composed;
  EXPECT_TRUE(normalizer.Normalize(" グーグル ", &composed));
  EXPECT_EQ(composed, "グーグル");

  // NFKC: each character maps to a single character.
  std::string digits;
  EXPECT_TRUE(normalizer.Normalize("①②③", &digits));
  EXPECT_EQ(digits, "123");
}

// With remove_extra_whitespaces disabled (and no dummy prefix), repeated
// spaces in the input are expected to show up as repeated "▁" in the output.
TEST(NormalizerTest, NoRemoveExtraWhitespace) {
  using CharIterator = std::istreambuf_iterator<char>;

  // Read the complete contents of the config stream; they form the spec.
  std::ifstream spec_file(GetTestConfigPath());
  std::string spec(CharIterator{spec_file}, CharIterator{});
  SentencePieceNormalizer normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/false,
      /*escape_whitespaces=*/true);

  // Plain input with a single space.
  std::string basic;
  EXPECT_TRUE(normalizer.Normalize("hello there", &basic));
  EXPECT_EQ(basic, "hello▁there");

  // Redundant whitespace is kept, one "▁" per input space.
  std::string spaced;
  EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &spaced));
  EXPECT_EQ(spaced, "when▁is▁▁the▁▁world▁cup?");

  // Different whitespace (tab instead of space).
  std::string tabbed;
  EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &tabbed));
  EXPECT_EQ(tabbed, "general▁kenobi");
}

// With all three options disabled, whitespace is expected to come out as
// plain spaces rather than "▁", and repeated spaces are preserved.
TEST(NormalizerTest, NoEscapeWhitespaces) {
  using CharIterator = std::istreambuf_iterator<char>;

  // Read the complete contents of the config stream; they form the spec.
  std::ifstream spec_file(GetTestConfigPath());
  std::string spec(CharIterator{spec_file}, CharIterator{});
  SentencePieceNormalizer normalizer = NormalizerFromSpec(
      spec, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/false,
      /*escape_whitespaces=*/false);

  // Plain input with a single space.
  std::string basic;
  EXPECT_TRUE(normalizer.Normalize("hello there", &basic));
  EXPECT_EQ(basic, "hello there");

  // Redundant whitespace passes through unchanged.
  std::string spaced;
  EXPECT_TRUE(normalizer.Normalize("when is  the  world cup?", &spaced));
  EXPECT_EQ(spaced, "when is  the  world cup?");

  // Different whitespace: the tab comes out as a regular space.
  std::string tabbed;
  EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &tabbed));
  EXPECT_EQ(tabbed, "general kenobi");
}

}  // namespace
}  // namespace libtextclassifier3