普通文本  |  95行  |  2.93 KB

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "lang_id/script/approx-script.h"

#include "lang_id/common/lite_base/integral-types.h"
#include "lang_id/common/lite_base/logging.h"
#include "lang_id/common/utf8.h"
#include "lang_id/script/approx-script-data.h"

namespace libtextclassifier3 {
namespace mobile {

// Script value reported for codepoints that fall in none of the known ranges
// (it is the fallback return value of BinarySearch() below).
//
// This is the int value of USCRIPT_UNKNOWN from enum UScriptCode (from
// unicode/uscript.h), hard-coded here.  Note: we do have a test that
// USCRIPT_UNKNOWN evaluates to 103.
const int kUnknownUscript = 103;

namespace {
// Range tables (and their size) from approx_script_internal.  As used by
// BinarySearch() below, range i covers the codepoints
//   [kRangeFirst[i], kRangeFirst[i] + kRangeSizeMinusOne[i]]   (both inclusive)
// and maps to the script kRangeScript[i]; kNumRanges is passed as the exclusive
// upper bound on the range index.
//
// NOTE(review): the binary search only makes sense if kRangeFirst is sorted in
// ascending order -- confirm against the generated data.
using approx_script_internal::kNumRanges;
using approx_script_internal::kRangeFirst;
using approx_script_internal::kRangeScript;
using approx_script_internal::kRangeSizeMinusOne;

// Decodes the single UTF-8 encoded character that starts at |s| and spans
// |num_bytes| bytes, and returns its codepoint.
//
// Only the payload bits are extracted; the byte values themselves are not
// validated.  For any |num_bytes| outside 1..4, reports the problem through
// SAFTM_DLOG(FATAL) and returns 0.
uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) {
  if ((num_bytes < 1) || (num_bytes > 4)) {
    SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes;
    return 0;
  }

  if (num_bytes == 1) {
    // Single-byte character: the byte is returned as is, without any masking.
    return s[0];
  }

  // The lead byte of a 2-, 3- or 4-byte sequence carries 5, 4 or 3 payload
  // bits, i.e., the masks 0x1F, 0x0F and 0x07: one bit fewer for each extra
  // byte in the sequence.
  const int lead_mask = 0x3F >> (num_bytes - 1);
  uint32 codepoint = s[0] & lead_mask;

  // Each of the remaining bytes contributes its 6 low bits, most significant
  // group first.
  for (int i = 1; i < num_bytes; ++i) {
    codepoint = (codepoint << 6) | (s[i] & 0x3F);
  }
  return codepoint;
}

// Finds the script for |codepoint| among the ranges with indices in
// [start, end).  Returns kRangeScript[i] if some range i in that interval
// contains |codepoint|, and kUnknownUscript otherwise (including when the
// interval is empty).
//
// NOTE(review): relies on kRangeFirst being sorted in ascending order --
// confirm against the generated data.
inline int BinarySearch(uint32 codepoint, int start, int end) {
  int lo = start;
  int hi = end;

  // Narrow [lo, hi) down to a single candidate.  Inside the loop we have
  // lo < mid < hi, so either assignment strictly shrinks hi - lo and the loop
  // terminates once at most one index is left.
  while (lo + 1 < hi) {
    const int mid = (lo + hi) / 2;
    if (kRangeFirst[mid] > codepoint) {
      hi = mid;
    } else {
      lo = mid;
    }
  }

  // No candidate at all: this happens only if the initial interval was empty.
  if (hi != lo + 1) {
    return kUnknownUscript;
  }

  // Range lo is the only one that may contain codepoint; check both of its
  // (inclusive) ends.
  const uint32 first = kRangeFirst[lo];
  if ((codepoint < first) || (codepoint > first + kRangeSizeMinusOne[lo])) {
    return kUnknownUscript;
  }
  return kRangeScript[lo];
}
}  // namespace

int GetApproxScript(const unsigned char *s, int num_bytes) {
  SAFTM_DCHECK_NE(s, nullptr);
  SAFTM_DCHECK_EQ(num_bytes,
                  utils::OneCharLen(reinterpret_cast<const char *>(s)));
  uint32 codepoint = Utf8ToCodepoint(s, num_bytes);
  return BinarySearch(codepoint, 0, kNumRanges);
}

int GetMaxApproxScriptResult() { return approx_script_internal::kMaxScript; }

// NOTE(review): presumably registers ApproxScriptDetector (not defined in this
// file; likely declared in approx-script.h) with a static registry -- confirm
// against the definition of the SAFTM_STATIC_REGISTRATION macro.
SAFTM_STATIC_REGISTRATION(ApproxScriptDetector);

}  // namespace mobile
}  // namespace libtextclassifier3