// Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/19/2001 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/uchar.h" #include "unicode/utf16.h" #include "unesctrn.h" #include "util.h" #include "cmemory.h" U_NAMESPACE_BEGIN /** * Special character marking the end of the spec[] array. */ static const UChar END = 0xFFFF; // Unicode: "U+10FFFF" hex, min=4, max=6 static const UChar SPEC_Unicode[] = { 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, END }; // Java: "\\uFFFF" hex, min=4, max=4 static const UChar SPEC_Java[] = { 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, END }; // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 static const UChar SPEC_C[] = { 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, END }; // XML: "" hex, min=1, max=6 static const UChar SPEC_XML[] = { 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, END }; // XML10: "" dec, min=1, max=7 (not really "Hex-Any") static const UChar SPEC_XML10[] = { 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, END }; // Perl: "\\x{263A}" hex, min=1, max=6 static const UChar SPEC_Perl[] = { 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, END }; // All: Java, C, Perl, XML, XML10, Unicode static const UChar SPEC_Any[] = { 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl END }; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) static UChar* copySpec(const UChar* spec) { int32_t len = 0; while (spec[len] != END) { ++len; } ++len; UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar)); // Check for memory allocation error. if (result != NULL) { uprv_memcpy(result, spec, (size_t)len*sizeof(result[0])); } return result; } /** * Factory methods. Ignore the context. */ static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Unicode); } static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Java); } static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_C); } static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_XML); } static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_XML10); } static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Perl); } static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Any); } /** * Registers standard variants with the system. Called by * Transliterator during initialization. */ void UnescapeTransliterator::registerIDs() { Token t = integerToken(0); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); } /** * Constructor. Takes the encoded spec array. */ UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, const UChar *newSpec) : Transliterator(newID, NULL) { this->spec = copySpec(newSpec); } /** * Copy constructor. */ UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : Transliterator(o) { this->spec = copySpec(o.spec); } UnescapeTransliterator::~UnescapeTransliterator() { uprv_free(spec); } /** * Transliterator API. */ Transliterator* UnescapeTransliterator::clone() const { return new UnescapeTransliterator(*this); } /** * Implements {@link Transliterator#handleTransliterate}. */ void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, UBool isIncremental) const { int32_t start = pos.start; int32_t limit = pos.limit; int32_t i, j, ipat; while (start < limit) { // Loop over the forms in spec[]. Exit this loop when we // match one of the specs. Exit the outer loop if a // partial match is detected and isIncremental is true. for (j=0, ipat=0; spec[ipat] != END; ++j) { // Read the header int32_t prefixLen = spec[ipat++]; int32_t suffixLen = spec[ipat++]; int8_t radix = (int8_t) spec[ipat++]; int32_t minDigits = spec[ipat++]; int32_t maxDigits = spec[ipat++]; // s is a copy of start that is advanced over the // characters as we parse them. int32_t s = start; UBool match = TRUE; for (i=0; i<prefixLen; ++i) { if (s >= limit) { if (i > 0) { // We've already matched a character. This is // a partial match, so we return if in // incremental mode. In non-incremental mode, // go to the next spec. if (isIncremental) { goto exit; } match = FALSE; break; } } UChar c = text.charAt(s++); if (c != spec[ipat + i]) { match = FALSE; break; } } if (match) { UChar32 u = 0; int32_t digitCount = 0; for (;;) { if (s >= limit) { // Check for partial match in incremental mode. if (s > start && isIncremental) { goto exit; } break; } UChar32 ch = text.char32At(s); int32_t digit = u_digit(ch, radix); if (digit < 0) { break; } s += U16_LENGTH(ch); u = (u * radix) + digit; if (++digitCount == maxDigits) { break; } } match = (digitCount >= minDigits); if (match) { for (i=0; i<suffixLen; ++i) { if (s >= limit) { // Check for partial match in incremental mode. if (s > start && isIncremental) { goto exit; } match = FALSE; break; } UChar c = text.charAt(s++); if (c != spec[ipat + prefixLen + i]) { match = FALSE; break; } } if (match) { // At this point, we have a match UnicodeString str(u); text.handleReplaceBetween(start, s, str); limit -= s - start - str.length(); // The following break statement leaves the // loop that is traversing the forms in // spec[]. We then parse the next input // character. break; } } } ipat += prefixLen + suffixLen; } if (start < limit) { start += U16_LENGTH(text.char32At(start)); } } exit: pos.contextLimit += limit - pos.limit; pos.limit = limit; pos.start = start; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof