// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // norms.cpp // created: 2017jun04 Markus W. Scherer // (pulled out of n2builder.cpp) #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include <stdio.h> #include <stdlib.h> #include "unicode/errorcode.h" #include "unicode/unistr.h" #include "unicode/utf16.h" #include "normalizer2impl.h" #include "norms.h" #include "toolutil.h" #include "utrie2.h" #include "uvectr32.h" U_NAMESPACE_BEGIN void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) { if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { if(cc==0) { fLastStarterIndex=fLength; } fArray[fLength++]=(c<<8)|cc; return; } // Let this character bubble back to its canonical order. int32_t i=fLength-1; while(i>fLastStarterIndex && ccAt(i)>cc) { --i; } ++i; // after the last starter or prevCC<=cc // Move this and the following characters forward one to make space. for(int32_t j=fLength; i<j; --j) { fArray[j]=fArray[j-1]; } fArray[i]=(c<<8)|cc; ++fLength; fDidReorder=TRUE; } void BuilderReorderingBuffer::toString(UnicodeString &dest) const { dest.remove(); for(int32_t i=0; i<fLength; ++i) { dest.append(charAt(i)); } } UChar32 Norm::combine(UChar32 trail) const { int32_t length; const CompositionPair *pairs=getCompositionPairs(length); for(int32_t i=0; i<length; ++i) { if(trail==pairs[i].trail) { return pairs[i].composite; } if(trail<pairs[i].trail) { break; } } return U_SENTINEL; } Norms::Norms(UErrorCode &errorCode) { normTrie=utrie2_open(0, 0, &errorCode); normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); // Default "inert" Norm struct at index 0. Practically immutable. norms=allocNorm(); norms->type=Norm::INERT; } Norms::~Norms() { utrie2_close(normTrie); int32_t normsLength=utm_countItems(normMem); for(int32_t i=1; i<normsLength; ++i) { delete norms[i].mapping; delete norms[i].rawMapping; delete norms[i].compositions; } utm_close(normMem); } Norm *Norms::allocNorm() { Norm *p=(Norm *)utm_alloc(normMem); norms=(Norm *)utm_getStart(normMem); // in case it got reallocated return p; } Norm *Norms::getNorm(UChar32 c) { uint32_t i=utrie2_get32(normTrie, c); if(i==0) { return nullptr; } return norms+i; } const Norm *Norms::getNorm(UChar32 c) const { uint32_t i=utrie2_get32(normTrie, c); if(i==0) { return nullptr; } return norms+i; } const Norm &Norms::getNormRef(UChar32 c) const { return norms[utrie2_get32(normTrie, c)]; } Norm *Norms::createNorm(UChar32 c) { uint32_t i=utrie2_get32(normTrie, c); if(i!=0) { return norms+i; } else { /* allocate Norm */ Norm *p=allocNorm(); IcuToolErrorCode errorCode("gennorm2/createNorm()"); utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); return p; } } void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const { int32_t length=mapping.length(); U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK); const char16_t *s=mapping.getBuffer(); int32_t i=0; UChar32 c; while(i<length) { U16_NEXT(s, i, length, c); buffer.append(c, getCC(c)); } if(buffer.didReorder()) { buffer.toString(mapping); } } UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const { if((highCC-lowCC)>=2) { int32_t length; const CompositionPair *pairs=norm.getCompositionPairs(length); for(int32_t i=0; i<length; ++i) { uint8_t trailCC=getCC(pairs[i].trail); if(lowCC<trailCC && trailCC<highCC) { return TRUE; } } } return FALSE; } U_CDECL_BEGIN static UBool U_CALLCONV enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { return ((Norms::Enumerator *)context)->rangeHandler(start, end, value); } U_CDECL_END void Norms::enumRanges(Enumerator &e) { utrie2_enum(normTrie, nullptr, enumRangeHandler, &e); } Norms::Enumerator::~Enumerator() {} UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) { if(value!=0) { rangeHandler(start, end, norms.getNormRefByIndex(value)); } return TRUE; } void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { if(norm.mappingType!=Norm::ROUND_TRIP) { return; } if(start!=end) { fprintf(stderr, "gennorm2 error: same round-trip mapping for " "more than 1 code point U+%04lX..U+%04lX\n", (long)start, (long)end); exit(U_INVALID_FORMAT_ERROR); } if(norm.cc!=0) { fprintf(stderr, "gennorm2 error: " "U+%04lX has a round-trip mapping and ccc!=0, " "not possible in Unicode normalization\n", (long)start); exit(U_INVALID_FORMAT_ERROR); } // setRoundTripMapping() ensured that there are exactly two code points. const UnicodeString &m=*norm.mapping; UChar32 lead=m.char32At(0); UChar32 trail=m.char32At(m.length()-1); if(norms.getCC(lead)!=0) { fprintf(stderr, "gennorm2 error: " "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " "not possible in Unicode normalization\n", (long)start, (long)lead); exit(U_INVALID_FORMAT_ERROR); } // Flag for trailing character. norms.createNorm(trail)->combinesBack=TRUE; // Insert (trail, composite) pair into compositions list for the lead character. IcuToolErrorCode errorCode("gennorm2/addComposition()"); Norm *leadNorm=norms.createNorm(lead); UVector32 *compositions=leadNorm->compositions; int32_t i; if(compositions==nullptr) { compositions=leadNorm->compositions=new UVector32(errorCode); i=0; // "insert" the first pair at index 0 } else { // Insertion sort, and check for duplicate trail characters. int32_t length; const CompositionPair *pairs=leadNorm->getCompositionPairs(length); for(i=0; i<length; ++i) { if(trail==pairs[i].trail) { fprintf(stderr, "gennorm2 error: same round-trip mapping for " "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", (long)start, (long)lead, (long)trail); exit(U_INVALID_FORMAT_ERROR); } if(trail<pairs[i].trail) { break; } } } compositions->insertElementAt(trail, 2*i, errorCode); compositions->insertElementAt(start, 2*i+1, errorCode); } void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { if(!norm.hasMapping()) { return; } const UnicodeString &m=*norm.mapping; UnicodeString *decomposed=nullptr; const UChar *s=toUCharPtr(m.getBuffer()); int32_t length=m.length(); int32_t prev, i=0; UChar32 c; while(i<length) { prev=i; U16_NEXT(s, i, length, c); if(start<=c && c<=end) { fprintf(stderr, "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", (long)c); exit(U_INVALID_FORMAT_ERROR); } const Norm &cNorm=norms.getNormRef(c); if(cNorm.hasMapping()) { if(norm.mappingType==Norm::ROUND_TRIP) { if(prev==0) { if(cNorm.mappingType!=Norm::ROUND_TRIP) { fprintf(stderr, "gennorm2 error: " "U+%04lX's round-trip mapping's starter " "U+%04lX one-way-decomposes, " "not possible in Unicode normalization\n", (long)start, (long)c); exit(U_INVALID_FORMAT_ERROR); } uint8_t myTrailCC=norms.getCC(m.char32At(i)); UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); uint8_t cTrailCC=norms.getCC(cTrailChar); if(cTrailCC>myTrailCC) { fprintf(stderr, "gennorm2 error: " "U+%04lX's round-trip mapping's starter " "U+%04lX decomposes and the " "inner/earlier tccc=%hu > outer/following tccc=%hu, " "not possible in Unicode normalization\n", (long)start, (long)c, (short)cTrailCC, (short)myTrailCC); exit(U_INVALID_FORMAT_ERROR); } } else { fprintf(stderr, "gennorm2 error: " "U+%04lX's round-trip mapping's non-starter " "U+%04lX decomposes, " "not possible in Unicode normalization\n", (long)start, (long)c); exit(U_INVALID_FORMAT_ERROR); } } if(decomposed==nullptr) { decomposed=new UnicodeString(m, 0, prev); } decomposed->append(*cNorm.mapping); } else if(Hangul::isHangul(c)) { UChar buffer[3]; int32_t hangulLength=Hangul::decompose(c, buffer); if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { fprintf(stderr, "gennorm2 error: " "U+%04lX's round-trip mapping's non-starter " "U+%04lX decomposes, " "not possible in Unicode normalization\n", (long)start, (long)c); exit(U_INVALID_FORMAT_ERROR); } if(decomposed==nullptr) { decomposed=new UnicodeString(m, 0, prev); } decomposed->append(buffer, hangulLength); } else if(decomposed!=nullptr) { decomposed->append(m, prev, i-prev); } } if(decomposed!=nullptr) { if(norm.rawMapping==nullptr) { // Remember the original mapping when decomposing recursively. norm.rawMapping=norm.mapping; } else { delete norm.mapping; } norm.mapping=decomposed; // Not norm.setMappingCP(); because the original mapping // is most likely to be encodable as a delta. didDecompose|=TRUE; } } U_NAMESPACE_END #endif // #if !UCONFIG_NO_NORMALIZATION