//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// Configuration for the text encoder op.
//
// NOTE: the fields of `TextEncoderConfig` carry no explicit `id` attributes,
// so their declaration order determines the serialized layout. Append new
// fields at the end; do not reorder or remove existing ones.

namespace libtextclassifier3;

// Selects which matcher is used for the sentence pieces (see
// `TextEncoderConfig.matcher_type`).
// NOTE(review): the value names suggest a trie-backed and a
// sorted-string-table-backed lookup respectively; confirm against the encoder
// implementation, which is not part of this schema.
enum SentencePieceMatcherType : byte {
  MAPPED_TRIE = 0,
  SORTED_STRING_TABLE = 1,
}

// Settings consumed by the text encoder op: special codes, text normalization
// options, and the sentence piece dictionary.
table TextEncoderConfig {
  // Code that is used as encoding of the start code.
  start_code:int32 = 0;

  // Code that is used as encoding of the end code.
  end_code:int32 = 1;

  // This value is added to all codes to make them not intersect with
  // `start_code` and `end_code`.
  encoding_offset:int32 = 2;

  // Code that is used for out-of-dictionary characters.
  unknown_code:int32 = -1;

  // Penalty associated with the unknown code.
  // No explicit default is declared, so the FlatBuffers scalar default (0)
  // applies when the field is absent.
  unknown_score:float;

  // Normalization options.

  // Serialized normalization charsmap.
  normalization_charsmap:string;

  // Companion data for `normalization_charsmap`.
  // NOTE(review): presumably the replacement values the charsmap refers to;
  // the exact encoding is not visible from this schema. Confirm with the
  // normalizer code.
  normalization_charsmap_values:string;

  // Whether to add dummy whitespace at the beginning of the text in order to
  // treat "world" in "world" and "hello world" uniformly.
  add_dummy_prefix:bool = true;

  // Whether to remove leading, trailing and duplicate internal whitespace.
  remove_extra_whitespaces:bool = true;

  // Whether to replace whitespace with a meta symbol.
  escape_whitespaces:bool = true;

  // Sentence pieces scores.
  pieces_scores:[float];

  // Serialized sentence pieces.
  pieces:string;

  // NOTE(review): presumably the offset of each individual piece within the
  // serialized `pieces` string; confirm with the encoder implementation.
  pieces_offsets:[uint32];

  // Matcher used for the sentence pieces; defaults to MAPPED_TRIE.
  matcher_type: SentencePieceMatcherType = MAPPED_TRIE;
}