/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_ #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_ #include <string> #include "lang_id/common/fel/task-context.h" #include "lang_id/common/lite_strings/stringpiece.h" #include "lang_id/light-sentence.h" namespace libtextclassifier3 { namespace mobile { namespace lang_id { // Custom tokenizer for the LangId model. class TokenizerForLangId { public: void Setup(TaskContext *context); // Tokenizes |text|, placing the tokens into |sentence|. Customized for // LangId. Currently (Sep 15, 2016) we tokenize on space, newline, tab, and // any other 1-byte UTF8 character which is not a letter, ignore all empty // tokens, and (for each of the remaining tokens) prepend "^" (special token // begin marker) and append "$" (special token end marker). // // Tokens are stored into the "repeated Token token;" field of *sentence. void Tokenize(StringPiece text, LightSentence *sentence) const; private: // If true, during tokenization, we use the lowercase version of each Unicode // character from the text to tokenize. E.g., if this is true, the text "Foo // bar" is tokenized as ["foo", "bar"]; otherwise, we get ["Foo", "bar"]. bool lowercase_input_ = false; }; } // namespace lang_id } // namespace mobile } // namespace nlp_saft #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_