// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // extradata.cpp // created: 2017jun04 Markus W. Scherer // (pulled out of n2builder.cpp) #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include <stdio.h> #include <stdlib.h> #include "unicode/errorcode.h" #include "unicode/unistr.h" #include "unicode/utf16.h" #include "extradata.h" #include "normalizer2impl.h" #include "norms.h" #include "toolutil.h" #include "utrie2.h" #include "uvectr32.h" U_NAMESPACE_BEGIN ExtraData::ExtraData(Norms &n, UBool fast) : Norms::Enumerator(n), yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data optimizeFast(fast) { // Hangul LV algorithmically decomposes to two Jamo. // Some code may harmlessly read this firstUnit. yesNoMappingsAndCompositions.setCharAt(0, 2); // Hangul LVT algorithmically decomposes to three Jamo. // Some code may harmlessly read this firstUnit. yesNoMappingsOnly.setCharAt(0, 3); } int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) { UnicodeString &m=*norm.mapping; int32_t length=m.length(); // Write the mapping & raw mapping extraData. int32_t firstUnit=length|(norm.trailCC<<8); int32_t preMappingLength=0; if(norm.rawMapping!=NULL) { UnicodeString &rm=*norm.rawMapping; int32_t rmLength=rm.length(); if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { fprintf(stderr, "gennorm2 error: " "raw mapping for U+%04lX longer than maximum of %d\n", (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); exit(U_INVALID_FORMAT_ERROR); } UChar rm0=rm.charAt(0); if( rmLength==length-1 && // 99: overlong substring lengths get pinned to remainder lengths anyway 0==rm.compare(1, 99, m, 2, 99) && rm0>Normalizer2Impl::MAPPING_LENGTH_MASK ) { // Compression: // rawMapping=rm0+mapping.substring(2) -> store only rm0 // // The raw mapping is the same as the final mapping after replacing // the final mapping's first two code units with the raw mapping's first one. // In this case, we store only that first unit, rm0. // This helps with a few hundred mappings. dataString.append(rm0); preMappingLength=1; } else { // Store the raw mapping with its length. dataString.append(rm); dataString.append((UChar)rmLength); preMappingLength=rmLength+1; } firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; } int32_t cccLccc=norm.cc|(norm.leadCC<<8); if(cccLccc!=0) { dataString.append((UChar)cccLccc); ++preMappingLength; firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; } dataString.append((UChar)firstUnit); dataString.append(m); return preMappingLength; } int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm, UnicodeString &dataString, Hashtable &previousMappings) { UnicodeString newMapping; int32_t offset=writeMapping(c, norm, newMapping); int32_t previousOffset=previousMappings.geti(newMapping); if(previousOffset!=0) { // Duplicate, point to the identical mapping that has already been stored. offset=previousOffset-1; } else { // Append this new mapping and // enter it into the hashtable, avoiding value 0 which is "not found". offset=dataString.length()+offset; dataString.append(newMapping); IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); previousMappings.puti(newMapping, offset+1, errorCode); } return offset; } UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const { // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point. // Do not map from ASCII to non-ASCII. if(norm.mappingCP>=0 && !(c<=0x7f && norm.mappingCP>0x7f) && norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) { int32_t delta=norm.mappingCP-c; if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { norm.type=Norm::NO_NO_DELTA; norm.offset=delta; return TRUE; } } return FALSE; } void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) { if(norm.cc!=0) { fprintf(stderr, "gennorm2 error: " "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", (long)c); exit(U_INVALID_FORMAT_ERROR); } int32_t length; const CompositionPair *pairs=norm.getCompositionPairs(length); for(int32_t i=0; i<length; ++i) { const CompositionPair &pair=pairs[i]; // 22 bits for the composite character and whether it combines forward. UChar32 compositeAndFwd=pair.composite<<1; if(norms.getNormRef(pair.composite).compositions!=NULL) { compositeAndFwd|=1; // The composite character also combines-forward. } // Encode most pairs in two units and some in three. int32_t firstUnit, secondUnit, thirdUnit; if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { if(compositeAndFwd<=0xffff) { firstUnit=pair.trail<<1; secondUnit=compositeAndFwd; thirdUnit=-1; } else { firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; secondUnit=compositeAndFwd>>16; thirdUnit=compositeAndFwd; } } else { firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| Normalizer2Impl::COMP_1_TRIPLE; secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| (compositeAndFwd>>16); thirdUnit=compositeAndFwd; } // Set the high bit of the first unit if this is the last composition pair. if(i==(length-1)) { firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; } dataString.append((UChar)firstUnit).append((UChar)secondUnit); if(thirdUnit>=0) { dataString.append((UChar)thirdUnit); } } } void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { if(start!=end) { fprintf(stderr, "gennorm2 error: unexpected shared data for " "multiple code points U+%04lX..U+%04lX\n", (long)start, (long)end); exit(U_INTERNAL_PROGRAM_ERROR); } if(norm.error!=nullptr) { fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error); exit(U_INVALID_FORMAT_ERROR); } writeExtraData(start, norm); } // Ticket #13342 - Disable optimizations on MSVC for this function as a workaround. #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) #pragma optimize( "", off ) #endif void ExtraData::writeExtraData(UChar32 c, Norm &norm) { switch(norm.type) { case Norm::INERT: break; // no extra data case Norm::YES_YES_COMBINES_FWD: norm.offset=yesYesCompositions.length(); writeCompositions(c, norm, yesYesCompositions); break; case Norm::YES_NO_COMBINES_FWD: norm.offset=yesNoMappingsAndCompositions.length()+ writeMapping(c, norm, yesNoMappingsAndCompositions); writeCompositions(c, norm, yesNoMappingsAndCompositions); break; case Norm::YES_NO_MAPPING_ONLY: norm.offset=yesNoMappingsOnly.length()+ writeMapping(c, norm, yesNoMappingsOnly); break; case Norm::NO_NO_COMP_YES: if(!optimizeFast && setNoNoDelta(c, norm)) { break; } norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes); break; case Norm::NO_NO_COMP_BOUNDARY_BEFORE: if(!optimizeFast && setNoNoDelta(c, norm)) { break; } norm.offset=writeNoNoMapping( c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore); break; case Norm::NO_NO_COMP_NO_MAYBE_CC: norm.offset=writeNoNoMapping( c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC); break; case Norm::NO_NO_EMPTY: // There can be multiple extra data entries for mappings to the empty string // if they have different raw mappings. norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty); break; case Norm::MAYBE_YES_COMBINES_FWD: norm.offset=maybeYesCompositions.length(); writeCompositions(c, norm, maybeYesCompositions); break; case Norm::MAYBE_YES_SIMPLE: break; // no extra data case Norm::YES_YES_WITH_CC: break; // no extra data default: // Should not occur. exit(U_INTERNAL_PROGRAM_ERROR); } } // Ticket #13342 - Turn optimization back on. #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) #pragma optimize( "", on ) #endif U_NAMESPACE_END #endif // #if !UCONFIG_NO_NORMALIZATION