// Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /******************************************************************** * COPYRIGHT: * Copyright (c) 2002-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************** * * @author Mark E. Davis * @author Vladimir Weinstein */ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "intltest.h" #include "cmemory.h" #include "cstring.h" #include "canittst.h" #include "unicode/caniter.h" #include "unicode/normlzr.h" #include "unicode/uchar.h" #include "hash.h" #define CASE(id,test) case id: \ name = #test; \ if (exec) { \ logln(#test "---"); \ logln((UnicodeString)""); \ test(); \ } \ break void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/) { switch (index) { CASE(0, TestBasic); CASE(1, TestExhaustive); CASE(2, TestAPI); default: name = ""; break; } } /** * Convert Java-style strings with \u Unicode escapes into UnicodeString objects static UnicodeString str(const char *input) { UnicodeString str(input, ""); // Invariant conversion return str.unescape(); } */ CanonicalIteratorTest::CanonicalIteratorTest() : nameTrans(NULL), hexTrans(NULL) { } CanonicalIteratorTest::~CanonicalIteratorTest() { #if !UCONFIG_NO_TRANSLITERATION if(nameTrans != NULL) { delete(nameTrans); } if(hexTrans != NULL) { delete(hexTrans); } #endif } void CanonicalIteratorTest::TestExhaustive() { UErrorCode status = U_ZERO_ERROR; CanonicalIterator it("", status); if (U_FAILURE(status)) { dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); return; } UChar32 i = 0; UnicodeString s; // Test static and dynamic class IDs if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); } for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { //for (i = 0xae00; i < 0xaf00; ++i) { if ((i % 0x100) == 0) { logln("Testing U+%06X", i); } // skip characters we know don't have decomps int8_t type = u_charType(i); if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR || type == U_SURROGATE) continue; s = i; characterTest(s, i, it); s += (UChar32)0x0345; //"\\u0345"; characterTest(s, i, it); } } void CanonicalIteratorTest::TestBasic() { UErrorCode status = U_ZERO_ERROR; static const char * const testArray[][2] = { {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, }; #if 0 // This is not interesting for C/C++ as the data is already built beforehand // check build UnicodeSet ss = CanonicalIterator.getSafeStart(); logln("Safe Start: " + ss.toPattern(true)); ss = CanonicalIterator.getStarts('a'); expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") ); #endif // check permute // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! Hashtable *permutations = new Hashtable(FALSE, status); permutations->setValueDeleter(uprv_deleteUObject); UnicodeString toPermute("ABC"); CanonicalIterator::permute(toPermute, FALSE, permutations, status); logln("testing permutation"); expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); delete permutations; // try samples logln("testing samples"); Hashtable *set = new Hashtable(FALSE, status); set->setValueDeleter(uprv_deleteUObject); int32_t i = 0; CanonicalIterator it("", status); if(U_SUCCESS(status)) { for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) { //logln("Results for: " + name.transliterate(testArray[i])); UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); it.setSource(testStr, status); set->removeAll(); for (;;) { //UnicodeString *result = new UnicodeString(it.next()); UnicodeString result(it.next()); if (result.isBogus()) { break; } set->put(result, new UnicodeString(result), status); // Add result to the table //logln(++counter + ": " + hex.transliterate(result)); //logln(" = " + name.transliterate(result)); } expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); } } else { dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); } delete set; } void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) { UErrorCode status = U_ZERO_ERROR; UnicodeString decomp, comp; UBool gotDecomp = FALSE; UBool gotComp = FALSE; UBool gotSource = FALSE; Normalizer::decompose(s, FALSE, 0, decomp, status); Normalizer::compose(s, FALSE, 0, comp, status); // skip characters that don't have either decomp. // need quick test for this! if (s == decomp && s == comp) { return; } it.setSource(s, status); for (;;) { UnicodeString item = it.next(); if (item.isBogus()) break; if (item == s) gotSource = TRUE; if (item == decomp) gotDecomp = TRUE; if (item == comp) gotComp = TRUE; } if (!gotSource || !gotDecomp || !gotComp) { errln("FAIL CanonicalIterator: " + s + (int)ch); } } void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { if (!(a==b)) { errln("FAIL: " + message + getReadable(item)); errln("\t" + getReadable(a)); errln("\t" + getReadable(b)); } else { logln("Checked: " + message + getReadable(item)); logln("\t" + getReadable(a)); logln("\t" + getReadable(b)); } } UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { UErrorCode status = U_ZERO_ERROR; UnicodeString result = "["; if (s.length() == 0) return ""; // set up for readable display #if !UCONFIG_NO_TRANSLITERATION if(verbose) { if (nameTrans == NULL) nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); UnicodeString sName = s; nameTrans->transliterate(sName); result += sName; result += ";"; } if (hexTrans == NULL) hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); #endif UnicodeString sHex = s; #if !UCONFIG_NO_TRANSLITERATION if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated hexTrans->transliterate(sHex); } #endif result += sHex; result += "]"; return result; //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; } U_CFUNC int U_CALLCONV compareUnicodeStrings(const void *s1, const void *s2) { UnicodeString **st1 = (UnicodeString **)s1; UnicodeString **st2 = (UnicodeString **)s2; return (*st1)->compare(**st2); } UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { UnicodeString result; // Iterate over the Hashtable, then qsort. UnicodeString **resArray = new UnicodeString*[col->count()]; int32_t i = 0; const UHashElement *ne = NULL; int32_t el = UHASH_FIRST; //Iterator it = basic.iterator(); ne = col->nextElement(el); //while (it.hasNext()) while (ne != NULL) { //String item = (String) it.next(); UnicodeString *item = (UnicodeString *)(ne->value.pointer); resArray[i++] = item; ne = col->nextElement(el); } for(i = 0; i<col->count(); ++i) { logln(*resArray[i]); } qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); result = *resArray[0]; for(i = 1; i<col->count(); ++i) { result += ", "; result += *resArray[i]; } /* Iterator it = col.iterator(); while (it.hasNext()) { if (result.length() != 0) result.append(", "); result.append(it.next().toString()); } */ delete [] resArray; return result; } void CanonicalIteratorTest::TestAPI() { UErrorCode status = U_ZERO_ERROR; // Test reset and getSource UnicodeString start("ljubav"); logln("Testing CanonicalIterator::getSource"); logln("Instantiating canonical iterator with string "+start); CanonicalIterator can(start, status); if (U_FAILURE(status)) { dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); return; } UnicodeString source = can.getSource(); logln("CanonicalIterator::getSource returned "+source); if(start != source) { errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); } logln("Testing CanonicalIterator::reset"); UnicodeString next = can.next(); logln("CanonicalIterator::next returned "+next); can.reset(); UnicodeString afterReset = can.next(); logln("After reset, CanonicalIterator::next returned "+afterReset); if(next != afterReset) { errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); } logln("Testing getStaticClassID and getDynamicClassID"); if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); } } #endif /* #if !UCONFIG_NO_NORMALIZATION */