/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/script/approx-script.h"

#include "lang_id/common/lite_base/integral-types.h"
#include "lang_id/common/lite_base/logging.h"
#include "lang_id/common/utf8.h"
#include "lang_id/script/approx-script-data.h"

namespace libtextclassifier3 {
namespace mobile {

// Integer value of USCRIPT_UNKNOWN in enum UScriptCode (declared in
// unicode/uscript.h).  Note: a test checks that USCRIPT_UNKNOWN really
// evaluates to 103.
const int kUnknownUscript = 103;

namespace {
using approx_script_internal::kNumRanges;
using approx_script_internal::kRangeFirst;
using approx_script_internal::kRangeScript;
using approx_script_internal::kRangeSizeMinusOne;

// Assembles the codepoint stored in the |num_bytes|-byte UTF-8 sequence that
// starts at |s|.  Only the payload bits of each byte are combined; the bytes
// are not checked for being a well-formed sequence.  |num_bytes| must be in
// [1, 4]; any other value is reported via SAFTM_DLOG(FATAL) and yields 0.
uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) {
  if ((num_bytes < 1) || (num_bytes > 4)) {
    SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes;
    return 0;
  }

  // Payload bits carried by the lead byte, indexed by (num_bytes - 1).  A
  // single-byte sequence is returned as is, hence the all-ones mask.
  static const uint32 kLeadPayloadMask[4] = {0xFF, 0x1F, 0x0F, 0x07};

  uint32 codepoint = s[0] & kLeadPayloadMask[num_bytes - 1];

  // Every trailing byte contributes its low 6 bits.
  for (int i = 1; i < num_bytes; ++i) {
    codepoint = (codepoint << 6) | (s[i] & 0x3F);
  }
  return codepoint;
}

// Searches the ranges with indices in [start, end) for the one that contains
// |codepoint|.  Returns the matching kRangeScript entry, or kUnknownUscript if
// no such range contains |codepoint|.  The search relies on kRangeFirst being
// sorted in increasing order.
inline int BinarySearch(uint32 codepoint, int start, int end) {
  int lo = start;
  int hi = end;

  // Narrow [lo, hi) down to at most one candidate.  Inside the loop we have
  // lo < mid < hi, so either assignment below strictly shrinks hi - lo; the
  // loop therefore terminates once that difference is at most 1.
  while (hi > lo + 1) {
    const int mid = (lo + hi) / 2;
    if (codepoint >= kRangeFirst[mid]) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  // An empty interval has no candidate range at all.
  if (hi != lo + 1) {
    return kUnknownUscript;
  }

  // Range |lo| is the only candidate: check both of its (inclusive) bounds.
  const uint32 first = kRangeFirst[lo];
  if (codepoint < first) {
    return kUnknownUscript;
  }
  if (codepoint > first + kRangeSizeMinusOne[lo]) {
    return kUnknownUscript;
  }
  return kRangeScript[lo];
}
}  // namespace

// Returns the approximate script of the UTF-8 character that starts at |s| and
// is |num_bytes| bytes long, or kUnknownUscript if its codepoint is not covered
// by any known range.  In debug builds, checks that |s| is not null and that
// |num_bytes| agrees with utils::OneCharLen() for the first byte.
int GetApproxScript(const unsigned char *s, int num_bytes) {
  SAFTM_DCHECK_NE(s, nullptr);
  SAFTM_DCHECK_EQ(num_bytes,
                  utils::OneCharLen(reinterpret_cast<const char *>(s)));
  return BinarySearch(Utf8ToCodepoint(s, num_bytes), 0, kNumRanges);
}

// Returns approx_script_internal::kMaxScript.
int GetMaxApproxScriptResult() {
  return approx_script_internal::kMaxScript;
}

SAFTM_STATIC_REGISTRATION(ApproxScriptDetector);

}  // namespace mobile
}  // namespace libtextclassifier3