uspoof_impl.cpp - Android社区 - https://www.androidos.net.cn/

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2008-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
#include "scriptset.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
#include "ucln_in.h"
#include "uspoof_impl.h"

#if !UCONFIG_NO_NORMALIZATION

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)

SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    construct(status);
    fSpoofData = data;
}

SpoofImpl::SpoofImpl(UErrorCode& status) {
    construct(status);

// TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    fSpoofData = SpoofData::getDefault(status);
}

SpoofImpl::SpoofImpl() {
    UErrorCode status = U_ZERO_ERROR;
    construct(status);

// TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    fSpoofData = SpoofData::getDefault(status);
}

void SpoofImpl::construct(UErrorCode& status) {
    fMagic = USPOOF_MAGIC;
    fChecks = USPOOF_ALL_CHECKS;
    fSpoofData = NULL;
    fAllowedCharsSet = NULL;
    fAllowedLocales = NULL;
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;

if (U_FAILURE(status)) { return; }

UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    fAllowedCharsSet = allowedCharsSet;
    fAllowedLocales  = uprv_strdup("");
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    allowedCharsSet->freeze();
}

// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
        fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 
        fAllowedLocales(NULL) {
    if (U_FAILURE(status)) {
        return;
    }
    fMagic = src.fMagic;
    fChecks = src.fChecks;
    if (src.fSpoofData != NULL) {
        fSpoofData = src.fSpoofData->addReference();
    }
    fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    fRestrictionLevel = src.fRestrictionLevel;
}

SpoofImpl::~SpoofImpl() {
    fMagic = 0;                // head off application errors by preventing use of
                               //    of deleted objects.
    if (fSpoofData != NULL) {
        fSpoofData->removeReference();   // Will delete if refCount goes to zero.
    }
    delete fAllowedCharsSet;
    uprv_free((void *)fAllowedLocales);
}

//  Cast this instance as a USpoofChecker for the C API.
USpoofChecker *SpoofImpl::asUSpoofChecker() {
    return reinterpret_cast<USpoofChecker*>(this);
}

//
//  Incoming parameter check on Status and the SpoofChecker object
//    received from the C API.
//
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (sc == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    SpoofImpl *This = (SpoofImpl *)sc;
    if (This->fMagic != USPOOF_MAGIC) {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
        return NULL;
    }
    return This;
}

SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    return const_cast<SpoofImpl *>
        (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
}

void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
    UnicodeSet    allowedChars;
    UnicodeSet    *tmpSet = NULL;
    const char    *locStart = localesList;
    const char    *locEnd = NULL;
    const char    *localesListEnd = localesList + uprv_strlen(localesList);
    int32_t        localeListCount = 0;   // Number of locales provided by caller.

// Loop runs once per locale from the localesList, a comma separated list of locales.
    do {
        locEnd = uprv_strchr(locStart, ',');
        if (locEnd == NULL) {
            locEnd = localesListEnd;
        }
        while (*locStart == ' ') {
            locStart++;
        }
        const char *trimmedEnd = locEnd-1;
        while (trimmedEnd > locStart && *trimmedEnd == ' ') {
            trimmedEnd--;
        }
        if (trimmedEnd <= locStart) {
            break;
        }
        const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
        localeListCount++;

// We have one locale from the locales list.
        // Add the script chars for this locale to the accumulating set of allowed chars.
        // If the locale is no good, we will be notified back via status.
        addScriptChars(locale, &allowedChars, status);
        uprv_free((void *)locale);
        if (U_FAILURE(status)) {
            break;
        }
        locStart = locEnd + 1;
    } while (locStart < localesListEnd);

// If our caller provided an empty list of locales, we disable the allowed characters checking
    if (localeListCount == 0) {
        uprv_free((void *)fAllowedLocales);
        fAllowedLocales = uprv_strdup("");
        tmpSet = new UnicodeSet(0, 0x10ffff);
        if (fAllowedLocales == NULL || tmpSet == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        } 
        tmpSet->freeze();
        delete fAllowedCharsSet;
        fAllowedCharsSet = tmpSet;
        fChecks &= ~USPOOF_CHAR_LIMIT;
        return;
    }

// Add all common and inherited characters to the set of allowed chars.
    UnicodeSet tempSet;
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    allowedChars.addAll(tempSet);
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    allowedChars.addAll(tempSet);
    
    // If anything went wrong, we bail out without changing
    // the state of the spoof checker.
    if (U_FAILURE(status)) {
        return;
    }

// Store the updated spoof checker state.
    tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
    const char *tmpLocalesList = uprv_strdup(localesList);
    if (tmpSet == NULL || tmpLocalesList == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_free((void *)fAllowedLocales);
    fAllowedLocales = tmpLocalesList;
    tmpSet->freeze();
    delete fAllowedCharsSet;
    fAllowedCharsSet = tmpSet;
    fChecks |= USPOOF_CHAR_LIMIT;
}

const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    return fAllowedLocales;
}

// Given a locale (a language), add all the characters from all of the scripts used with that language
// to the allowedChars UnicodeSet

void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
    UScriptCode scripts[30];

int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
    if (U_FAILURE(status)) {
        return;
    }
    if (status == U_USING_DEFAULT_WARNING) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet tmpSet;
    int32_t    i;
    for (i=0; i<numScripts; i++) {
        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
        allowedChars->addAll(tmpSet);
    }
}

// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
    result.resetAll();
    result.setScriptExtensions(codePoint, status);
    if (U_FAILURE(status)) { return; }

// Section 5.1 step 1
    if (result.test(USCRIPT_HAN, status)) {
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
        result.set(USCRIPT_JAPANESE, status);
        result.set(USCRIPT_KOREAN, status);
    }
    if (result.test(USCRIPT_HIRAGANA, status)) {
        result.set(USCRIPT_JAPANESE, status);
    }
    if (result.test(USCRIPT_KATAKANA, status)) {
        result.set(USCRIPT_JAPANESE, status);
    }
    if (result.test(USCRIPT_HANGUL, status)) {
        result.set(USCRIPT_KOREAN, status);
    }
    if (result.test(USCRIPT_BOPOMOFO, status)) {
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    }

// Section 5.1 step 2
    if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
        result.setAll();
    }
}

// Computes the resolved script set for a string, according to UTS 39 section 5.1.
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
}

// Computes the resolved script set for a string, omitting characters having the specified script.
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
    result.setAll();

ScriptSet temp;
    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

// Compute the augmented script set for the character
        getAugmentedScriptSet(codePoint, temp, status);
        if (U_FAILURE(status)) { return; }

// Intersect the augmented script set with the resolved script set, but only if the character doesn't
        // have the script specified in the function call
        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
            result.intersect(temp);
        }
    }
}

// Computes the set of numerics for a string, according to UTS 39 section 5.3.
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
    result.clear();

UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

// Store a representative character for each kind of decimal digit
        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
            // Store the zero character as a representative for comparison.
            // Unicode guarantees it is codePoint - value
            result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
        }
    }
}

// Computes the restriction level of a string, according to UTS 39 section 5.2.
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
    // Section 5.2 step 1:
    if (!fAllowedCharsSet->containsAll(input)) {
        return USPOOF_UNRESTRICTIVE;
    }

// Section 5.2 step 2
    // Java use a static UnicodeSet for this test.  In C++, avoid the static variable
    // and just do a simple for loop.
    UBool allASCII = TRUE;
    for (int32_t i=0, length=input.length(); i<length; i++) {
        if (input.charAt(i) > 0x7f) {
            allASCII = FALSE;
            break;
        }
    }
    if (allASCII) {
        return USPOOF_ASCII;
    }

// Section 5.2 steps 3:
    ScriptSet resolvedScriptSet;
    getResolvedScriptSet(input, resolvedScriptSet, status);
    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }

// Section 5.2 step 4:
    if (!resolvedScriptSet.isEmpty()) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }

// Section 5.2 step 5:
    ScriptSet resolvedNoLatn;
    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }

// Section 5.2 step 6:
    if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
            || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
            || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

// Section 5.2 step 7:
    if (!resolvedNoLatn.isEmpty()
            && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
            && !resolvedNoLatn.test(USCRIPT_GREEK, status)
            && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }

// Section 5.2 step 8:
    return USPOOF_MINIMALLY_RESTRICTIVE;
}

int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool sawLeadCharacter = false;
    for (int32_t i=0; i<input.length();) {
        UChar32 cp = input.char32At(i);
        if (sawLeadCharacter && cp == 0x0307) {
            return i;
        }
        uint8_t combiningClass = u_getCombiningClass(cp);
        // Skip over characters except for those with combining class 0 (non-combining characters) or with
        // combining class 230 (same class as U+0307)
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        if (combiningClass == 0 || combiningClass == 230) {
            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
        }
        i += U16_LENGTH(cp);
    }
    return -1;
}

static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
           u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
}

bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }
    UnicodeString skelStr;
    fSpoofData->confusableLookup(cp, skelStr);
    UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
    if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
        return true;
    }
    return false;
}

// Convert a text format hex number.  Utility function used by builder code.  Static.
// Input: UChar *string text.  Output: a UChar32
// Input has been pre-checked, and will have no non-hex chars.
// The number must fall in the code point range of 0..0x10ffff
// Static Function.
UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    U_ASSERT(limit-start > 0);
    uint32_t val = 0;
    int i;
    for (i=start; i<limit; i++) {
        int digitVal = s[i] - 0x30;
        if (digitVal>9) {
            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
        }
        if (digitVal>15) {
            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
        }
        U_ASSERT(digitVal <= 0xf);
        val <<= 4;
        val += digitVal;
    }
    if (val > 0x10ffff) {
        status = U_PARSE_ERROR;
        val = 0;
    }
    return (UChar32)val;
}

//-----------------------------------------
//
//   class CheckResult Implementation
//
//-----------------------------------------

CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
    clear();
}

USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    return reinterpret_cast<USpoofCheckResult*>(this);
}

//
//  Incoming parameter check on Status and the CheckResult object
//    received from the C API.
//
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
    if (U_FAILURE(status)) { return NULL; }
    if (ptr == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    CheckResult *This = (CheckResult*) ptr;
    if (This->fMagic != USPOOF_CHECK_MAGIC) {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    return This;
}

CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    return const_cast<CheckResult *>
        (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
}

void CheckResult::clear() {
    fChecks = 0;
    fNumerics.clear();
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
}

int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
        return fChecks | fRestrictionLevel;
    } else {
        return fChecks;
    }
}

CheckResult::~CheckResult() {
}

//----------------------------------------------------------------------------------------------
//
//   class SpoofData Implementation
//
//----------------------------------------------------------------------------------------------

UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    if (U_FAILURE(status) ||
        fRawData == NULL ||
        fRawData->fMagic != USPOOF_MAGIC ||
        fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
        fRawData->fFormatVersion[1] != 0 ||
        fRawData->fFormatVersion[2] != 0 ||
        fRawData->fFormatVersion[3] != 0) {
            status = U_INVALID_FORMAT_ERROR;
            return FALSE;
    }
    return TRUE;
}

static UBool U_CALLCONV
spoofDataIsAcceptable(void *context,
                        const char * /* type */, const char * /*name*/,
                        const UDataInfo *pInfo) {
    if(
        pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY &&
        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
        pInfo->dataFormat[1] == 0x66 &&
        pInfo->dataFormat[2] == 0x75 &&
        pInfo->dataFormat[3] == 0x20 &&
        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
    ) {
        UVersionInfo *version = static_cast<UVersionInfo *>(context);
        if(version != NULL) {
            uprv_memcpy(version, pInfo->dataVersion, 4);
        }
        return TRUE;
    } else {
        return FALSE;
    }
}

//  Methods for the loading of the default confusables data file.  The confusable
//  data is loaded only when it is needed.
//
//  SpoofData::getDefault() - Return the default confusables data, and call the
//                            initOnce() if it is not available.  Adds a reference
//                            to the SpoofData that the caller is responsible for
//                            decrementing when they are done with the data.
//
//  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
//                           is shared by all spoof checkers using the default data.
//
//  uspoof_cleanupDefaultData - Called during cleanup.
//

static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
static SpoofData* gDefaultSpoofData;

static UBool U_CALLCONV
uspoof_cleanupDefaultData(void) {
    if (gDefaultSpoofData) {
        // Will delete, assuming all user-level spoof checkers were closed.
        gDefaultSpoofData->removeReference();
        gDefaultSpoofData = nullptr;
        gSpoofInitDefaultOnce.reset();
    }
    return TRUE;
}

static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                        spoofDataIsAcceptable, 
                                        nullptr,       // context, would receive dataVersion if supplied.
                                        &status);
    if (U_FAILURE(status)) { return; }
    gDefaultSpoofData = new SpoofData(udm, status);
    if (U_FAILURE(status)) {
        delete gDefaultSpoofData;
        gDefaultSpoofData = nullptr;
        return;
    }
    if (gDefaultSpoofData == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}

SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    if (U_FAILURE(status)) { return NULL; }
    gDefaultSpoofData->addReference();
    return gDefaultSpoofData;
}

SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fUDM = udm;
    // fRawData is non-const because it may be constructed by the data builder.
    fRawData = reinterpret_cast<SpoofDataHeader *>(
            const_cast<void *>(udata_getMemory(udm)));
    validateDataVersion(status);
    initPtrs(status);
}

SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    if ((size_t)length < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    if (data == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    if (length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(status);
    initPtrs(status);
}

// Spoof Data constructor for use from data builder.
//   Initializes a new, empty data area that will be populated later.
SpoofData::SpoofData(UErrorCode &status) {
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fDataOwned = true;

// The spoof header should already be sized to be a multiple of 16 bytes.
    // Just in case it's not, round it up.
    uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
    U_ASSERT(initialSize == sizeof(SpoofDataHeader));
    
    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
    fMemLimit = initialSize;
    if (fRawData == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(fRawData, 0, initialSize);

fRawData->fMagic = USPOOF_MAGIC;
    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    fRawData->fFormatVersion[1] = 0;
    fRawData->fFormatVersion[2] = 0;
    fRawData->fFormatVersion[3] = 0;
    initPtrs(status);
}

// reset() - initialize all fields.
//           Should be updated if any new fields are added.
//           Called by constructors to put things in a known initial state.
void SpoofData::reset() {
   fRawData = NULL;
   fDataOwned = FALSE;
   fUDM      = NULL;
   fMemLimit = 0;
   fRefCount = 1;
   fCFUKeys = NULL;
   fCFUValues = NULL;
   fCFUStrings = NULL;
}

//  SpoofData::initPtrs()
//            Initialize the pointers to the various sections of the raw data.
//
//            This function is used both during the Trie building process (multiple
//            times, as the individual data sections are added), and
//            during the opening of a Spoof Checker from prebuilt data.
//
//            The pointers for non-existent data sections (identified by an offset of 0)
//            are set to NULL.
//
//            Note:  During building the data, adding each new data section
//            reallocs the raw data area, which likely relocates it, which
//            in turn requires reinitializing all of the pointers into it, hence
//            multiple calls to this function during building.
//
void SpoofData::initPtrs(UErrorCode &status) {
    fCFUKeys = NULL;
    fCFUValues = NULL;
    fCFUStrings = NULL;
    if (U_FAILURE(status)) {
        return;
    }
    if (fRawData->fCFUKeys != 0) {
        fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
    }
    if (fRawData->fCFUStringIndex != 0) {
        fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
    }
    if (fRawData->fCFUStringTable != 0) {
        fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
    }
}

SpoofData::~SpoofData() {
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = NULL;
    if (fUDM != NULL) {
        udata_close(fUDM);
    }
    fUDM = NULL;
}

void SpoofData::removeReference() {
    if (umtx_atomic_dec(&fRefCount) == 0) {
        delete this;
    }
}

SpoofData *SpoofData::addReference() {
    umtx_atomic_inc(&fRefCount);
    return this;
}

void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!fDataOwned) {
        U_ASSERT(FALSE);
        status = U_INTERNAL_PROGRAM_ERROR;
        return NULL;
    }

numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    uint32_t returnOffset = fMemLimit;
    fMemLimit += numBytes;
    fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
    fRawData->fLength = fMemLimit;
    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    initPtrs(status);
    return (char *)fRawData + returnOffset;
}

int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    int32_t dataSize = fRawData->fLength;
    if (capacity < dataSize) {
        status = U_BUFFER_OVERFLOW_ERROR;
        return dataSize;
    }
    uprv_memcpy(buf, fRawData, dataSize);
    return dataSize;
}

int32_t SpoofData::size() const {
    return fRawData->fLength;
}

//-------------------------------
//
// Front-end APIs for SpoofData
//
//-------------------------------

int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();
    do {
        int32_t mid = (lo + hi) / 2;
        if (codePointAt(mid) > inChar) {
            hi = mid;
        } else if (codePointAt(mid) < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

// Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

// Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}

int32_t SpoofData::length() const {
    return fRawData->fCFUKeysSize;
}

UChar32 SpoofData::codePointAt(int32_t index) const {
    return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
}

int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);

// Value is either a char (for strings of length 1) or
    // an index into the string table (for longer strings)
    uint16_t value = fCFUValues[index];
    if (stringLength == 1) {
        dest.append((UChar)value);
    } else {
        dest.append(fCFUStrings + value, stringLength);
    }

return stringLength;
}

U_NAMESPACE_END

U_NAMESPACE_USE

//-----------------------------------------------------------------------------
//
//  uspoof_swap   -  byte swap and char encoding swap of spoof data
//
//-----------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
           UErrorCode *status) {

if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

//
    //  Check that the data header is for spoof data.
    //    (Header contents are defined in gencfu.cpp)
    //
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
           pInfo->dataFormat[1]==0x66 &&
           pInfo->dataFormat[2]==0x75 &&
           pInfo->dataFormat[3]==0x20 &&
           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
           pInfo->formatVersion[1]==0 &&
           pInfo->formatVersion[2]==0 &&
           pInfo->formatVersion[3]==0  )) {
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                             "(format version %02x %02x %02x %02x) is not recognized\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

//
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    //                         header).  This swap also conveniently gets us
    //                         the size of the ICU d.h., which lets us locate the start
    //                         of the uspoof specific data.
    //
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);

//
    // Get the Spoof Data Header, and check that it appears to be OK.
    //
    //
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader)) 
    {
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

//
    // Prefight operation?  Just return the size
    //
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    int32_t totalSize = headerSize + spoofDataLength;
    if (length < 0) {
        return totalSize;
    }

//
    // Check that length passed in is consistent with length from Spoof data header.
    //
    if (length < totalSize) {
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
                            spoofDataLength);
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
        }

//
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    //                 we need to reference the header to locate the data, and an
    //                 inplace swap of the header leaves it unusable.
    //
    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;

int32_t   sectionStart;
    int32_t   sectionLength;

//
    // If not swapping in place, zero out the output buffer before starting.
    //    Gaps may exist between the individual sections, and these must be zeroed in
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    //
    if (inBytes != outBytes) {
        uprv_memset(outBytes, 0, spoofDataLength);
    }

// Confusables Keys Section   (fCFUKeys)
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

// String Index Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

// String Table Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

// And, last, swap the header itself.
    //   int32_t   fMagic             // swap this
    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
    //
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);

if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
    }
    // swap starting at fLength
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);

return totalSize;
}

#endif

C++程序 | 985行 | 31.67 KB

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2008-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
#include "scriptset.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
#include "ucln_in.h"
#include "uspoof_impl.h"

#if !UCONFIG_NO_NORMALIZATION


U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)

SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    construct(status);
    fSpoofData = data;
}

SpoofImpl::SpoofImpl(UErrorCode& status) {
    construct(status);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    fSpoofData = SpoofData::getDefault(status);
}

SpoofImpl::SpoofImpl() {
    UErrorCode status = U_ZERO_ERROR;
    construct(status);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    fSpoofData = SpoofData::getDefault(status);
}

void SpoofImpl::construct(UErrorCode& status) {
    fMagic = USPOOF_MAGIC;
    fChecks = USPOOF_ALL_CHECKS;
    fSpoofData = NULL;
    fAllowedCharsSet = NULL;
    fAllowedLocales = NULL;
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;

    if (U_FAILURE(status)) { return; }

    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    fAllowedCharsSet = allowedCharsSet;
    fAllowedLocales  = uprv_strdup("");
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    allowedCharsSet->freeze();
}


// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
        fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 
        fAllowedLocales(NULL) {
    if (U_FAILURE(status)) {
        return;
    }
    fMagic = src.fMagic;
    fChecks = src.fChecks;
    if (src.fSpoofData != NULL) {
        fSpoofData = src.fSpoofData->addReference();
    }
    fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    fRestrictionLevel = src.fRestrictionLevel;
}

SpoofImpl::~SpoofImpl() {
    fMagic = 0;                // head off application errors by preventing use of
                               //    of deleted objects.
    if (fSpoofData != NULL) {
        fSpoofData->removeReference();   // Will delete if refCount goes to zero.
    }
    delete fAllowedCharsSet;
    uprv_free((void *)fAllowedLocales);
}

//  Cast this instance as a USpoofChecker for the C API.
USpoofChecker *SpoofImpl::asUSpoofChecker() {
    return reinterpret_cast<USpoofChecker*>(this);
}

//
//  Incoming parameter check on Status and the SpoofChecker object
//    received from the C API.
//
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (sc == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    SpoofImpl *This = (SpoofImpl *)sc;
    if (This->fMagic != USPOOF_MAGIC) {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
        return NULL;
    }
    return This;
}

SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    return const_cast<SpoofImpl *>
        (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
}


void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
    UnicodeSet    allowedChars;
    UnicodeSet    *tmpSet = NULL;
    const char    *locStart = localesList;
    const char    *locEnd = NULL;
    const char    *localesListEnd = localesList + uprv_strlen(localesList);
    int32_t        localeListCount = 0;   // Number of locales provided by caller.

    // Loop runs once per locale from the localesList, a comma separated list of locales.
    do {
        locEnd = uprv_strchr(locStart, ',');
        if (locEnd == NULL) {
            locEnd = localesListEnd;
        }
        while (*locStart == ' ') {
            locStart++;
        }
        const char *trimmedEnd = locEnd-1;
        while (trimmedEnd > locStart && *trimmedEnd == ' ') {
            trimmedEnd--;
        }
        if (trimmedEnd <= locStart) {
            break;
        }
        const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
        localeListCount++;

        // We have one locale from the locales list.
        // Add the script chars for this locale to the accumulating set of allowed chars.
        // If the locale is no good, we will be notified back via status.
        addScriptChars(locale, &allowedChars, status);
        uprv_free((void *)locale);
        if (U_FAILURE(status)) {
            break;
        }
        locStart = locEnd + 1;
    } while (locStart < localesListEnd);

    // If our caller provided an empty list of locales, we disable the allowed characters checking
    if (localeListCount == 0) {
        uprv_free((void *)fAllowedLocales);
        fAllowedLocales = uprv_strdup("");
        tmpSet = new UnicodeSet(0, 0x10ffff);
        if (fAllowedLocales == NULL || tmpSet == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        } 
        tmpSet->freeze();
        delete fAllowedCharsSet;
        fAllowedCharsSet = tmpSet;
        fChecks &= ~USPOOF_CHAR_LIMIT;
        return;
    }

        
    // Add all common and inherited characters to the set of allowed chars.
    UnicodeSet tempSet;
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    allowedChars.addAll(tempSet);
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    allowedChars.addAll(tempSet);
    
    // If anything went wrong, we bail out without changing
    // the state of the spoof checker.
    if (U_FAILURE(status)) {
        return;
    }

    // Store the updated spoof checker state.
    tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
    const char *tmpLocalesList = uprv_strdup(localesList);
    if (tmpSet == NULL || tmpLocalesList == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_free((void *)fAllowedLocales);
    fAllowedLocales = tmpLocalesList;
    tmpSet->freeze();
    delete fAllowedCharsSet;
    fAllowedCharsSet = tmpSet;
    fChecks |= USPOOF_CHAR_LIMIT;
}


const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    return fAllowedLocales;
}


// Given a locale (a language), add all the characters from all of the scripts used with that language
// to the allowedChars UnicodeSet

void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
    UScriptCode scripts[30];

    int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
    if (U_FAILURE(status)) {
        return;
    }
    if (status == U_USING_DEFAULT_WARNING) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet tmpSet;
    int32_t    i;
    for (i=0; i<numScripts; i++) {
        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
        allowedChars->addAll(tmpSet);
    }
}

// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
    result.resetAll();
    result.setScriptExtensions(codePoint, status);
    if (U_FAILURE(status)) { return; }

    // Section 5.1 step 1
    if (result.test(USCRIPT_HAN, status)) {
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
        result.set(USCRIPT_JAPANESE, status);
        result.set(USCRIPT_KOREAN, status);
    }
    if (result.test(USCRIPT_HIRAGANA, status)) {
        result.set(USCRIPT_JAPANESE, status);
    }
    if (result.test(USCRIPT_KATAKANA, status)) {
        result.set(USCRIPT_JAPANESE, status);
    }
    if (result.test(USCRIPT_HANGUL, status)) {
        result.set(USCRIPT_KOREAN, status);
    }
    if (result.test(USCRIPT_BOPOMOFO, status)) {
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    }

    // Section 5.1 step 2
    if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
        result.setAll();
    }
}

// Computes the resolved script set for a string, according to UTS 39 section 5.1.
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
}

// Computes the resolved script set for a string, omitting characters having the specified script.
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
    result.setAll();

    ScriptSet temp;
    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

        // Compute the augmented script set for the character
        getAugmentedScriptSet(codePoint, temp, status);
        if (U_FAILURE(status)) { return; }

        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
        // have the script specified in the function call
        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
            result.intersect(temp);
        }
    }
}

// Computes the set of numerics for a string, according to UTS 39 section 5.3.
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
    result.clear();

    UChar32 codePoint;
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
        codePoint = input.char32At(i);

        // Store a representative character for each kind of decimal digit
        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
            // Store the zero character as a representative for comparison.
            // Unicode guarantees it is codePoint - value
            result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
        }
    }
}

// Computes the restriction level of a string, according to UTS 39 section 5.2.
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
    // Section 5.2 step 1:
    if (!fAllowedCharsSet->containsAll(input)) {
        return USPOOF_UNRESTRICTIVE;
    }

    // Section 5.2 step 2
    // Java use a static UnicodeSet for this test.  In C++, avoid the static variable
    // and just do a simple for loop.
    UBool allASCII = TRUE;
    for (int32_t i=0, length=input.length(); i<length; i++) {
        if (input.charAt(i) > 0x7f) {
            allASCII = FALSE;
            break;
        }
    }
    if (allASCII) {
        return USPOOF_ASCII;
    }

    // Section 5.2 steps 3:
    ScriptSet resolvedScriptSet;
    getResolvedScriptSet(input, resolvedScriptSet, status);
    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }

    // Section 5.2 step 4:
    if (!resolvedScriptSet.isEmpty()) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }

    // Section 5.2 step 5:
    ScriptSet resolvedNoLatn;
    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }

    // Section 5.2 step 6:
    if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
            || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
            || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }

    // Section 5.2 step 7:
    if (!resolvedNoLatn.isEmpty()
            && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
            && !resolvedNoLatn.test(USCRIPT_GREEK, status)
            && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }

    // Section 5.2 step 8:
    return USPOOF_MINIMALLY_RESTRICTIVE;
}

int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool sawLeadCharacter = false;
    for (int32_t i=0; i<input.length();) {
        UChar32 cp = input.char32At(i);
        if (sawLeadCharacter && cp == 0x0307) {
            return i;
        }
        uint8_t combiningClass = u_getCombiningClass(cp);
        // Skip over characters except for those with combining class 0 (non-combining characters) or with
        // combining class 230 (same class as U+0307)
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        if (combiningClass == 0 || combiningClass == 230) {
            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
        }
        i += U16_LENGTH(cp);
    }
    return -1;
}

static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
           u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
}

bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }
    UnicodeString skelStr;
    fSpoofData->confusableLookup(cp, skelStr);
    UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
    if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
        return true;
    }
    return false;
}



// Convert a text format hex number.  Utility function used by builder code.  Static.
// Input: UChar *string text.  Output: a UChar32
// Input has been pre-checked, and will have no non-hex chars.
// The number must fall in the code point range of 0..0x10ffff
// Static Function.
UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    U_ASSERT(limit-start > 0);
    uint32_t val = 0;
    int i;
    for (i=start; i<limit; i++) {
        int digitVal = s[i] - 0x30;
        if (digitVal>9) {
            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
        }
        if (digitVal>15) {
            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
        }
        U_ASSERT(digitVal <= 0xf);
        val <<= 4;
        val += digitVal;
    }
    if (val > 0x10ffff) {
        status = U_PARSE_ERROR;
        val = 0;
    }
    return (UChar32)val;
}


//-----------------------------------------
//
//   class CheckResult Implementation
//
//-----------------------------------------

CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
    clear();
}

USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    return reinterpret_cast<USpoofCheckResult*>(this);
}

//
//  Incoming parameter check on Status and the CheckResult object
//    received from the C API.
//
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
    if (U_FAILURE(status)) { return NULL; }
    if (ptr == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    CheckResult *This = (CheckResult*) ptr;
    if (This->fMagic != USPOOF_CHECK_MAGIC) {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    return This;
}

CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    return const_cast<CheckResult *>
        (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
}

void CheckResult::clear() {
    fChecks = 0;
    fNumerics.clear();
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
}

int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
        return fChecks | fRestrictionLevel;
    } else {
        return fChecks;
    }
}

CheckResult::~CheckResult() {
}

//----------------------------------------------------------------------------------------------
//
//   class SpoofData Implementation
//
//----------------------------------------------------------------------------------------------


UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    if (U_FAILURE(status) ||
        fRawData == NULL ||
        fRawData->fMagic != USPOOF_MAGIC ||
        fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
        fRawData->fFormatVersion[1] != 0 ||
        fRawData->fFormatVersion[2] != 0 ||
        fRawData->fFormatVersion[3] != 0) {
            status = U_INVALID_FORMAT_ERROR;
            return FALSE;
    }
    return TRUE;
}

static UBool U_CALLCONV
spoofDataIsAcceptable(void *context,
                        const char * /* type */, const char * /*name*/,
                        const UDataInfo *pInfo) {
    if(
        pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY &&
        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
        pInfo->dataFormat[1] == 0x66 &&
        pInfo->dataFormat[2] == 0x75 &&
        pInfo->dataFormat[3] == 0x20 &&
        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
    ) {
        UVersionInfo *version = static_cast<UVersionInfo *>(context);
        if(version != NULL) {
            uprv_memcpy(version, pInfo->dataVersion, 4);
        }
        return TRUE;
    } else {
        return FALSE;
    }
}

//  Methods for the loading of the default confusables data file.  The confusable
//  data is loaded only when it is needed.
//
//  SpoofData::getDefault() - Return the default confusables data, and call the
//                            initOnce() if it is not available.  Adds a reference
//                            to the SpoofData that the caller is responsible for
//                            decrementing when they are done with the data.
//
//  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
//                           is shared by all spoof checkers using the default data.
//
//  uspoof_cleanupDefaultData - Called during cleanup.
//

static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
static SpoofData* gDefaultSpoofData;

static UBool U_CALLCONV
uspoof_cleanupDefaultData(void) {
    if (gDefaultSpoofData) {
        // Will delete, assuming all user-level spoof checkers were closed.
        gDefaultSpoofData->removeReference();
        gDefaultSpoofData = nullptr;
        gSpoofInitDefaultOnce.reset();
    }
    return TRUE;
}

static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                        spoofDataIsAcceptable, 
                                        nullptr,       // context, would receive dataVersion if supplied.
                                        &status);
    if (U_FAILURE(status)) { return; }
    gDefaultSpoofData = new SpoofData(udm, status);
    if (U_FAILURE(status)) {
        delete gDefaultSpoofData;
        gDefaultSpoofData = nullptr;
        return;
    }
    if (gDefaultSpoofData == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}

SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    if (U_FAILURE(status)) { return NULL; }
    gDefaultSpoofData->addReference();
    return gDefaultSpoofData;
}



SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fUDM = udm;
    // fRawData is non-const because it may be constructed by the data builder.
    fRawData = reinterpret_cast<SpoofDataHeader *>(
            const_cast<void *>(udata_getMemory(udm)));
    validateDataVersion(status);
    initPtrs(status);
}


SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    if ((size_t)length < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    if (data == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    if (length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(status);
    initPtrs(status);
}


// Spoof Data constructor for use from data builder.
//   Initializes a new, empty data area that will be populated later.
SpoofData::SpoofData(UErrorCode &status) {
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fDataOwned = true;

    // The spoof header should already be sized to be a multiple of 16 bytes.
    // Just in case it's not, round it up.
    uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
    U_ASSERT(initialSize == sizeof(SpoofDataHeader));
    
    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
    fMemLimit = initialSize;
    if (fRawData == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(fRawData, 0, initialSize);

    fRawData->fMagic = USPOOF_MAGIC;
    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    fRawData->fFormatVersion[1] = 0;
    fRawData->fFormatVersion[2] = 0;
    fRawData->fFormatVersion[3] = 0;
    initPtrs(status);
}

// reset() - initialize all fields.
//           Should be updated if any new fields are added.
//           Called by constructors to put things in a known initial state.
void SpoofData::reset() {
   fRawData = NULL;
   fDataOwned = FALSE;
   fUDM      = NULL;
   fMemLimit = 0;
   fRefCount = 1;
   fCFUKeys = NULL;
   fCFUValues = NULL;
   fCFUStrings = NULL;
}


//  SpoofData::initPtrs()
//            Initialize the pointers to the various sections of the raw data.
//
//            This function is used both during the Trie building process (multiple
//            times, as the individual data sections are added), and
//            during the opening of a Spoof Checker from prebuilt data.
//
//            The pointers for non-existent data sections (identified by an offset of 0)
//            are set to NULL.
//
//            Note:  During building the data, adding each new data section
//            reallocs the raw data area, which likely relocates it, which
//            in turn requires reinitializing all of the pointers into it, hence
//            multiple calls to this function during building.
//
void SpoofData::initPtrs(UErrorCode &status) {
    fCFUKeys = NULL;
    fCFUValues = NULL;
    fCFUStrings = NULL;
    if (U_FAILURE(status)) {
        return;
    }
    if (fRawData->fCFUKeys != 0) {
        fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
    }
    if (fRawData->fCFUStringIndex != 0) {
        fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
    }
    if (fRawData->fCFUStringTable != 0) {
        fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
    }
}


SpoofData::~SpoofData() {
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = NULL;
    if (fUDM != NULL) {
        udata_close(fUDM);
    }
    fUDM = NULL;
}


void SpoofData::removeReference() {
    if (umtx_atomic_dec(&fRefCount) == 0) {
        delete this;
    }
}


SpoofData *SpoofData::addReference() {
    umtx_atomic_inc(&fRefCount);
    return this;
}


void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!fDataOwned) {
        U_ASSERT(FALSE);
        status = U_INTERNAL_PROGRAM_ERROR;
        return NULL;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    uint32_t returnOffset = fMemLimit;
    fMemLimit += numBytes;
    fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
    fRawData->fLength = fMemLimit;
    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    initPtrs(status);
    return (char *)fRawData + returnOffset;
}

int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    int32_t dataSize = fRawData->fLength;
    if (capacity < dataSize) {
        status = U_BUFFER_OVERFLOW_ERROR;
        return dataSize;
    }
    uprv_memcpy(buf, fRawData, dataSize);
    return dataSize;
}

int32_t SpoofData::size() const {
    return fRawData->fLength;
}

//-------------------------------
//
// Front-end APIs for SpoofData
//
//-------------------------------

int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();
    do {
        int32_t mid = (lo + hi) / 2;
        if (codePointAt(mid) > inChar) {
            hi = mid;
        } else if (codePointAt(mid) < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}

int32_t SpoofData::length() const {
    return fRawData->fCFUKeysSize;
}

UChar32 SpoofData::codePointAt(int32_t index) const {
    return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
}

int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);

    // Value is either a char (for strings of length 1) or
    // an index into the string table (for longer strings)
    uint16_t value = fCFUValues[index];
    if (stringLength == 1) {
        dest.append((UChar)value);
    } else {
        dest.append(fCFUStrings + value, stringLength);
    }

    return stringLength;
}


U_NAMESPACE_END

U_NAMESPACE_USE

//-----------------------------------------------------------------------------
//
//  uspoof_swap   -  byte swap and char encoding swap of spoof data
//
//-----------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
           UErrorCode *status) {

    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //
    //  Check that the data header is for spoof data.
    //    (Header contents are defined in gencfu.cpp)
    //
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
           pInfo->dataFormat[1]==0x66 &&
           pInfo->dataFormat[2]==0x75 &&
           pInfo->dataFormat[3]==0x20 &&
           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
           pInfo->formatVersion[1]==0 &&
           pInfo->formatVersion[2]==0 &&
           pInfo->formatVersion[3]==0  )) {
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                             "(format version %02x %02x %02x %02x) is not recognized\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    //                         header).  This swap also conveniently gets us
    //                         the size of the ICU d.h., which lets us locate the start
    //                         of the uspoof specific data.
    //
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);


    //
    // Get the Spoof Data Header, and check that it appears to be OK.
    //
    //
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader)) 
    {
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Prefight operation?  Just return the size
    //
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    int32_t totalSize = headerSize + spoofDataLength;
    if (length < 0) {
        return totalSize;
    }

    //
    // Check that length passed in is consistent with length from Spoof data header.
    //
    if (length < totalSize) {
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
                            spoofDataLength);
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
        }


    //
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    //                 we need to reference the header to locate the data, and an
    //                 inplace swap of the header leaves it unusable.
    //
    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;

    int32_t   sectionStart;
    int32_t   sectionLength;

    //
    // If not swapping in place, zero out the output buffer before starting.
    //    Gaps may exist between the individual sections, and these must be zeroed in
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    //
    if (inBytes != outBytes) {
        uprv_memset(outBytes, 0, spoofDataLength);
    }

    // Confusables Keys Section   (fCFUKeys)
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Index Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Table Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // And, last, swap the header itself.
    //   int32_t   fMagic             // swap this
    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
    //
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);

    if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
    }
    // swap starting at fLength
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);

    return totalSize;
}

#endif

登录后可以享受更多权益