/* * Copyright (C) 2015 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <cstdlib> #include <string> #include <vector> #include <cutils/log.h> #include <unicode/utf.h> #include <unicode/utf8.h> #include "minikin/U16StringPiece.h" namespace minikin { // src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null. // Size is returned in an out parameter because gtest needs a void return for ASSERT to work. void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size, size_t* offset) { size_t input_ix = 0; size_t output_ix = 0; bool seen_offset = false; while (src[input_ix] != 0) { switch (src[input_ix]) { case '\'': // single ASCII char LOG_ALWAYS_FATAL_IF(static_cast<uint8_t>(src[input_ix]) >= 0x80); input_ix++; LOG_ALWAYS_FATAL_IF(src[input_ix] == 0); LOG_ALWAYS_FATAL_IF(output_ix >= buf_size); buf[output_ix++] = (uint16_t)src[input_ix++]; LOG_ALWAYS_FATAL_IF(src[input_ix] != '\''); input_ix++; break; case 'u': case 'U': { // Unicode codepoint in hex syntax input_ix++; LOG_ALWAYS_FATAL_IF(src[input_ix] != '+'); input_ix++; char* endptr = (char*)src + input_ix; unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16); size_t num_hex_digits = endptr - (src + input_ix); // also triggers on invalid number syntax, digits = 0 LOG_ALWAYS_FATAL_IF(num_hex_digits < 4u); LOG_ALWAYS_FATAL_IF(num_hex_digits > 6u); LOG_ALWAYS_FATAL_IF(codepoint > 0x10FFFFu); input_ix += num_hex_digits; if (U16_LENGTH(codepoint) == 1) { LOG_ALWAYS_FATAL_IF(output_ix + 1 > buf_size); buf[output_ix++] = codepoint; } else { // UTF-16 encoding LOG_ALWAYS_FATAL_IF(output_ix + 2 > buf_size); buf[output_ix++] = U16_LEAD(codepoint); buf[output_ix++] = U16_TRAIL(codepoint); } break; } case ' ': input_ix++; break; case '|': LOG_ALWAYS_FATAL_IF(seen_offset); LOG_ALWAYS_FATAL_IF(offset == nullptr); *offset = output_ix; seen_offset = true; input_ix++; break; default: LOG_ALWAYS_FATAL("Unexpected Character"); } } LOG_ALWAYS_FATAL_IF(result_size == nullptr); *result_size = output_ix; LOG_ALWAYS_FATAL_IF(!seen_offset && offset != nullptr); } std::vector<uint16_t> parseUnicodeStringWithOffset(const std::string& in, size_t* offset) { std::unique_ptr<uint16_t[]> buffer(new uint16_t[in.size()]); size_t result_size = 0; ParseUnicode(buffer.get(), in.size(), in.c_str(), &result_size, offset); return std::vector<uint16_t>(buffer.get(), buffer.get() + result_size); } std::vector<uint16_t> parseUnicodeString(const std::string& in) { return parseUnicodeStringWithOffset(in, nullptr); } std::vector<uint16_t> utf8ToUtf16(const std::string& text) { std::vector<uint16_t> result; int32_t i = 0; const int32_t textLength = static_cast<int32_t>(text.size()); uint32_t c = 0; while (i < textLength) { U8_NEXT(text.c_str(), i, textLength, c); if (U16_LENGTH(c) == 1) { result.push_back(c); } else { result.push_back(U16_LEAD(c)); result.push_back(U16_TRAIL(c)); } } return result; } std::string utf16ToUtf8(const U16StringPiece& u16String) { const uint32_t textLength = u16String.size(); uint32_t i = 0; uint32_t c = 0; std::string out; out.reserve(textLength * 4); while (i < textLength) { U16_NEXT(u16String.data(), i, textLength, c); char buf[U8_MAX_LENGTH] = {}; uint32_t outIndex = 0; U8_APPEND_UNSAFE(buf, outIndex, c); out.append(buf, outIndex); } return out; } } // namespace minikin