/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/lang-id.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "base.h"
#include "util/base/logging.h"
#include "gtest/gtest.h"

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

namespace {

// Returns the path of the language identification model used by these tests.
std::string GetModelPath() { return TEST_DATA_DIR "langid.model"; }

// Creates a LangId backed by the default test model.  Ownership is carried by
// the returned smart pointer, so callers cannot leak the detector.
std::unique_ptr<LangId> CreateLanguageDetector() {
  return std::unique_ptr<LangId>(new LangId(GetModelPath()));
}

// Runs FindLanguages() on |text| and checks that the result consists of
// exactly one (language, confidence_score) pair, that the language is
// |expected_language|, and that the confidence score is below 0.01.
//
// The size check is fatal (ASSERT_*) so that languages[0] is never touched
// when the result is empty; it aborts only this helper, so the calling test
// continues with its remaining checks.
void ExpectSingleLowConfidenceResult(LangId *lang_id, const std::string &text,
                                     const std::string &expected_language) {
  SCOPED_TRACE("FindLanguages() input: \"" + text + "\"");
  const std::vector<std::pair<std::string, float>> languages =
      lang_id->FindLanguages(text);
  ASSERT_EQ(1u, languages.size());
  EXPECT_EQ(expected_language, languages[0].first);
  EXPECT_GT(0.01f, languages[0].second);
}

}  // namespace

// Checks the predictions for a few unambiguous sentences, including inputs
// with leading / trailing whitespace.
TEST(LangIdTest, Normal) {
  const std::unique_ptr<LangId> lang_id = CreateLanguageDetector();

  EXPECT_EQ("en", lang_id->FindLanguage("This text is written in English."));
  EXPECT_EQ("en", lang_id->FindLanguage("This text is written in English. "));
  EXPECT_EQ("en", lang_id->FindLanguage(" This text is written in English. "));
  EXPECT_EQ("fr", lang_id->FindLanguage("Vive la France! Vive la France!"));
  EXPECT_EQ("ro", lang_id->FindLanguage("Sunt foarte foarte foarte fericit!"));
}

// Test that for very small queries, we return the default language and a low
// confidence score.
TEST(LangIdTest, SuperSmallQueries) {
  const std::unique_ptr<LangId> lang_id = CreateLanguageDetector();

  // Use a default language code that is not a real language: this way, a
  // match proves that the default was returned, rather than a genuine
  // prediction that merely coincides with the default.
  const std::string kDefaultLanguage = "dflt-lng";
  lang_id->SetDefaultLanguage(kDefaultLanguage);

  // Test the simple FindLanguage() method: that method returns a single
  // language.
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("y"));
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("j"));
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("l"));
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("w"));
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("z"));
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage("zulu"));

  // Test the more complex FindLanguages() method: that method returns a vector
  // of (language, confidence_score) pairs.
  ExpectSingleLowConfidenceResult(lang_id.get(), "y", kDefaultLanguage);
  ExpectSingleLowConfidenceResult(lang_id.get(), "Todoist", kDefaultLanguage);

  // A few tests with a default language that is a real language code.
  const std::string kJapanese = "ja";
  lang_id->SetDefaultLanguage(kJapanese);
  EXPECT_EQ(kJapanese, lang_id->FindLanguage("y"));
  EXPECT_EQ(kJapanese, lang_id->FindLanguage("j"));
  EXPECT_EQ(kJapanese, lang_id->FindLanguage("l"));
  ExpectSingleLowConfidenceResult(lang_id.get(), "y", kJapanese);

  // Make sure the min text size limit is applied to the number of real
  // characters (e.g., without spaces and punctuation chars, which don't
  // influence language identification).
  const std::string kWhitespaces = " \t \n \t\t\t\n \t";
  const std::string kPunctuation = "... ?!!--- -%%^...-";
  std::string still_small_string = kWhitespaces + "y" + kWhitespaces +
                                   kPunctuation + kWhitespaces + kPunctuation +
                                   kPunctuation;

  // The input must be long in raw bytes while containing a single real
  // character.  Pad with more whitespace / punctuation instead of relying on
  // the exact number of blanks typed inside the literals above, so that the
  // precondition below cannot silently break when those literals are edited
  // or reformatted.
  const std::string::size_type kMinRawSize = 100;
  while (still_small_string.size() < kMinRawSize) {
    still_small_string += kWhitespaces;
    still_small_string += kPunctuation;
  }
  EXPECT_LE(kMinRawSize, still_small_string.size());

  lang_id->SetDefaultLanguage(kDefaultLanguage);
  EXPECT_EQ(kDefaultLanguage, lang_id->FindLanguage(still_small_string));
  ExpectSingleLowConfidenceResult(lang_id.get(), still_small_string,
                                  kDefaultLanguage);
}

namespace {

// Sets |default_language| as the default language of a fresh detector and
// checks that FindLanguage() returns it for every input that contains no
// usable text (empty, whitespace-only, digits, punctuation).
void CheckPredictionForGibberishStrings(const std::string &default_language) {
  static const char *const kGibberish[] = {
      "",
      " ",
      " ",
      " ___ ",
      "123 456 789",
      "><> (-_-) <><",
  };

  const std::unique_ptr<LangId> lang_id = CreateLanguageDetector();
  TC_LOG(INFO) << "Default language: " << default_language;
  lang_id->SetDefaultLanguage(default_language);

  // Iterate over the array directly; no sentinel entry is needed.
  for (const char *gibberish : kGibberish) {
    const std::string predicted_language = lang_id->FindLanguage(gibberish);
    TC_LOG(INFO) << "Predicted " << predicted_language << " for \""
                 << gibberish << "\"";
    EXPECT_EQ(default_language, predicted_language);
  }
}

}  // namespace

// Inputs without any real text must yield the configured default language,
// whichever language that is.
TEST(LangIdTest, CornerCases) {
  CheckPredictionForGibberishStrings("en");
  CheckPredictionForGibberishStrings("ro");
  CheckPredictionForGibberishStrings("fr");
}

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier